# This notebook is for separating the code and developing the functions so that they can finally be transferred to scripts

In [10]:
%matplotlib inline
import novosparc
import os
import numpy as np
import pandas as pd
import pickle

import plotly.express as px
import plotly.graph_objects as go

# Input:

* `tissue.gw` (from novosparc reconstruction)
* metadata as CSV, read in as a `pd.DataFrame` of shape (X, 1)

In [None]:
# Load the metadata table (presumably per-cell cell-type labels, judging by the file name - confirm)
data_dir = 'data'
meta_ct_path = os.path.join(data_dir, "mouse_brain_meta_ct.csv")
# read_csv is called with its defaults, i.e. the first row of the file is used as column header
cell_types_raw = pd.read_csv(meta_ct_path)

In [14]:
%%time
# Load the pickled `tissue.gw` matrix and wrap it in a DataFrame.
# NOTE(review): pickle.load can execute arbitrary code stored in the file -
# only load pickle files that come from a trusted source.
tissue_file = "tissue_gw.pkl"
# the context manager closes the file even if unpickling raises
with open(tissue_file, 'rb') as picklefile:
    # unpickle the matrix
    pckl_matrix = pickle.load(picklefile)

gw_matrix = pd.DataFrame(pckl_matrix)

CPU times: user 3.21 ms, sys: 12 ms, total: 15.2 ms
Wall time: 14.4 ms


In [15]:
# sanity check: metadata shape, gw matrix shape and gw matrix type, one per line
print(cell_types_raw.shape, gw_matrix.shape, type(gw_matrix), sep="\n")

(2688, 1)
(2688, 2688)
<class 'pandas.core.frame.DataFrame'>


# onehot encoding/df merging

In [None]:
def merge_meta_gw(gw_matrix, meta_matrix):
    """
    Map metadata labels through the gw matrix: the transposed gw matrix is
    multiplied with the one-hot encoded labels.

    gw_matrix - pd.DataFrame, from tissue object (tissue.gw); its rows have to
                correspond to the rows of meta_matrix
    meta_matrix - pd.DataFrame, meta data (x,1), the labels are taken from the first
                  column, row length (cells?) must match gw_matrix

    returns - pd.DataFrame with one row per gw_matrix column and one column per
              unique label (columns in order of first appearance of the label)

    raises ValueError if the row counts of gw_matrix and meta_matrix differ
    """
    n_gw_rows = gw_matrix.shape[0]
    n_meta_rows = meta_matrix.shape[0]
    if n_gw_rows != n_meta_rows:
        raise ValueError(
            f"gw_matrix has {n_gw_rows} rows but meta_matrix has {n_meta_rows} rows; "
            "both must describe the same entries"
        )

    # select the label column by position so that it does not have to be named 0
    labels = meta_matrix.iloc[:, 0]
    # unique list of labels, in order of first appearance (missing values get no column)
    unique_lbs = list(labels.dropna().unique())
    # one-hot encode the labels; get_dummies sorts its columns, so they are
    # re-ordered explicitly to guarantee that column names and values line up
    meta_encode = pd.get_dummies(labels, dtype=float)[unique_lbs]
    # merge matrices
    meta_gw_merge = np.dot(np.asarray(gw_matrix).T, meta_encode.to_numpy())  # transpose to get locations as rows
    # dataframe with column names
    meta_gw_df = pd.DataFrame(meta_gw_merge, columns=unique_lbs)

    return meta_gw_df

def meta_to_tissue(list_of_metagw_names, list_of_metagw_df, tissue):
    """
    Attach a set of named meta_gw dataframes to the tissue object as `tissue.metadata`
    (a dict mapping name -> dataframe). An existing `metadata` attribute is overwritten.

    list_of_metagw_names - list containing strings of names of meta_gw matrices to be added to the tissue obj, ordering has to match ordering in list_of_metagw_df
    list_of_metagw_df - list containing dataframes to add, ordering has to match ordering in list_of_metagw_names
    tissue - tissue object, output from novosparc.reconstruct

    raises ValueError if both lists do not have the same length
    """
    # ToDo # passing the names explicitly is a quick and dirty way, there is a more clean way here: https://stackoverflow.com/questions/18425225/getting-the-name-of-a-variable-as-a-string but I'm not so sure about using an additional package for this

    n_names = len(list_of_metagw_names)
    n_dfs = len(list_of_metagw_df)
    if n_names != n_dfs:
        raise ValueError(
            f"got {n_names} names but {n_dfs} dataframes; both lists must have the same length"
        )

    # pair every name with the dataframe at the same position
    metadata_mapped = dict(zip(list_of_metagw_names, list_of_metagw_df))

    tissue.metadata = metadata_mapped
    print("tissue object modified with set of metadata dataframes")

**Plotting for a specific metadata label then continues with:**
```
pl_genes =["some", "genes"]
dataset_to_plot = sc.AnnData(tissue.metadata[">>name_of_metadata_label<<"], dtype=float)
dataset_to_plot.obsm['spatial'] = locations
novosparc.pl.embedding(dataset_to_plot, pl_genes)
```

# Dataframes for "highest mapping prop" plots

In [None]:
def get_highest_prop_lbl(meta_array, plot_anndata):
    """
    For every row of the metadata matrix find the column (label) with the highest
    value and store the result on the anndata object.

    meta_array = pd.Df of the metadata to plot
    plot_anndata = anndata version of the metadata matrix for plotting (compare w dataset_tissue);
                   is modified in place: every result column is written to
                   plot_anndata.obsm[<column name>] as an array of shape (n_rows, 1)

    returns - pd.DataFrame with the columns
              "mapping_prop_lbl" (row maximum),
              "mapped_lbl_idx" (integer column position of the maximum),
              "mapped_lbl" (column name of the maximum)
    """
    values = np.asarray(meta_array)
    # position of the row maximum; kept as integer so it can be used as an index directly
    max_idx = values.argmax(axis=1)
    # the row maximum itself
    max_val = values.max(axis=1)
    # vectorised lookup of the column names instead of a row-wise apply
    max_lbl = meta_array.columns.to_numpy()[max_idx]

    out_df = pd.DataFrame({
        "mapping_prop_lbl": max_val,
        "mapped_lbl_idx": max_idx,
        "mapped_lbl": max_lbl,
    })

    # add annotation to the dataset anndata
    for col in out_df.columns:
        plot_anndata.obsm[col] = out_df[col].to_numpy().reshape(-1, 1)

    return out_df

In [None]:
def plot_high_prop_label_no_opac(plot_anndata,width=1000,height=800,plot_bgcolor="black",columnname="label"):
    """
    Show a scatter plot of the spatial coordinates, coloured by the mapped label.

    plot_anndata - anndata version of the metadata matrix for plotting (compare w dataset_tissue) after
    "get_highest_prop_lbl" processing; obsm['spatial'] and obsm["mapped_lbl"] are read
    width, height - figure size passed on to plotly
    plot_bgcolor - background colour of the plotting area
    columnname - name given to the label column (passed to plotly as the `color` column)
    """

    # ToDo: Implement check for existence of "mapped_lbl" property

    # coordinates: second spatial column on the x axis, first one on the y axis
    coords = plot_anndata.obsm['spatial']
    x = coords[:, 1]
    if coords.shape[1] > 1:
        y = coords[:, 0]
    else:
        y = np.ones_like(x)

    # one-column frame holding the label of every point
    label_df = pd.DataFrame(plot_anndata.obsm["mapped_lbl"], columns=[columnname])

    # figure without grid and zero lines on either axis
    fig = px.scatter(label_df, x=x, y=y, color=columnname, width=width, height=height)
    for update_axis in (fig.update_xaxes, fig.update_yaxes):
        update_axis(showgrid=False, zeroline=False)
    fig.update_layout(plot_bgcolor=plot_bgcolor)
    # ToDo: add color palette option (especially important when doing plots for multiple metadata sets)
    fig.show()

In [None]:
def flatten(l):
    """Return a single list with the items of every sub-iterable in l, in order (one level deep)."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def plot_high_prop_label_var_opac_1(plot_anndata, out_df, labelname = "label"):
    """
    Prepare the legend lookup for the variable-opacity plot.

    plot_anndata - anndata version of the metadata matrix after "get_highest_prop_lbl"
                   processing; obsm["mapped_lbl"] is read
    out_df - pd.DataFrame as returned by "get_highest_prop_lbl"; column "mapped_lbl_idx" is read
    labelname - not used for the result, kept so existing calls keep working

    returns - tuple (lbl_mtchng, lbl_idx):
              lbl_mtchng - dict mapping label index -> label as string
              lbl_idx - the "mapped_lbl_idx" column of out_df
    """

    # ToDo: Implement check for assumptions (e.g. "mapped_lbl")

    lbl_idx = out_df["mapped_lbl_idx"]
    # labels as flat list; ravel handles the (n, 1) layout written by get_highest_prop_lbl
    lbl_str = np.asarray(plot_anndata.obsm["mapped_lbl"]).ravel().tolist()
    # match the label strings with their index number (for legend)
    lbl_mtchng = {k: str(v) for k, v in zip(lbl_idx, lbl_str)}

    return lbl_mtchng, lbl_idx

def plot_high_prop_label_var_opac_2(plot_anndata, out_df, lbl_idx, lbl_mtchng, labelname = "label",
                                    width=1000,height=800):
    """
    Show a scatter plot of the spatial coordinates with one trace (colour) per label,
    where the marker opacity is the min-max normalised mapping value within that label.

    plot_anndata - anndata version of the metadata matrix; obsm['spatial'] is read
    out_df - pd.DataFrame as returned by "get_highest_prop_lbl"; column "mapping_prop_lbl" is read
    lbl_idx - label index per point (second output of "plot_high_prop_label_var_opac_1")
    lbl_mtchng - dict label index -> label string used as legend entry
                 (first output of "plot_high_prop_label_var_opac_1")
    labelname - not used, kept so existing calls keep working
    width, height - figure size
    """

    # ToDo: checks for assumptions and naming conventions

    # plot frame: second spatial column on the x axis, first one on the y axis
    xy = plot_anndata.obsm['spatial']
    x = xy[:, 1]
    df = pd.DataFrame({'x_lv': x,
                       'y_lv': xy[:, 0] if xy.shape[1] > 1 else np.ones_like(x),
                       'color': lbl_idx,
                       'alpha': out_df["mapping_prop_lbl"]
                       })

    # color legend: labels get palette colours in order of first appearance, so the
    # assignment is the same on every run; the palette is cycled when there are more
    # labels than colours instead of failing on the missing entries
    labels = df['color'].unique()
    palette = px.colors.qualitative.Alphabet
    idx_clr = {idx: str(palette[pos % len(palette)]) for pos, idx in enumerate(labels)}

    # plot
    fig = go.Figure()
    fig.update_xaxes(showgrid=False, zeroline=False)
    fig.update_yaxes(showgrid=False, zeroline=False)

    fig.layout.height = height
    fig.layout.width = width

    for c in labels:
        df_color = df[df['color'] == c]
        # normalize alpha values to [0, 1] within the label (1-D, one value per marker)
        alpha_raw = df_color['alpha'].to_numpy(dtype=float)
        alpha_min = alpha_raw.min()
        alpha_span = alpha_raw.max() - alpha_min
        if alpha_span > 0:
            alpha_norm = (alpha_raw - alpha_min) / alpha_span
        else:
            # single point or constant values: avoid 0/0 (NaN opacity), draw fully opaque
            alpha_norm = np.ones_like(alpha_raw)
        # plot label color
        fig.add_trace(
            go.Scatter(
                x=df_color['x_lv'],
                y=df_color['y_lv'],
                name=lbl_mtchng[c],
                mode="markers",
                # todo: hover text could be an array of joint strings with cell type + probability
                showlegend=True,
                marker=go.scatter.Marker(
                    color=idx_clr[c],
                    size=11,
                    opacity=alpha_norm)
            ))
        # NOTE(review): set once per label, so the legend border ends up with the colour
        # of the last label - kept as in the original, confirm that this is intended
        fig.update_layout(legend=dict(
            bordercolor=idx_clr[c]))

    fig.show()