In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
import scipy.cluster.hierarchy as sch
import pandas as pd
import numpy as np
import sklearn
import scanpy as sc
import os

In [None]:
# Folder containing the spatialLIBD CSV exports; every file read or written in
# this notebook is located by joining a file name onto this relative path.
spatialLIBD_dir = 'data/spatialLIBD_data'

In [None]:
# Parse the five spatialLIBD CSV exports in one pass.  `spots` and `csr` use
# their first CSV column as the row index; the other three tables keep the
# default integer index.
spots, st, gene_meta, cell_type, csr = (
    pd.read_csv(os.path.join(spatialLIBD_dir, csv_name), **read_options)
    for csv_name, read_options in (
        ('spatialLIBD_spot_counts.csv', {'index_col': 0}),
        ('spatialLIBD_spot_st.csv', {}),
        ('gene_meta.csv', {}),
        ('RowDataTable1.csv', {}),
        ('spatialLIBD_csr_counts_sample_id.csv', {'index_col': 0}),
    )
)

In [None]:
# Persist each freshly parsed table as a pickle next to its CSV.
for _frame, _pickle_name in (
    (spots, 'spatialLIBD_spot_counts.pkl'),
    (st, 'spatialLIBD_spot_st.pkl'),
    (gene_meta, 'gene_meta.pkl'),
    (cell_type, 'RowDataTable1.pkl'),
    (csr, 'spatialLIBD_csr_counts_sample_id.pkl'),
):
    _frame.to_pickle(os.path.join(spatialLIBD_dir, _pickle_name))

In [None]:
# Disabled alternative to parsing the CSVs: once the pickles written by the
# previous cell exist, uncomment these lines to load the same five tables from them.
# spots = pd.read_pickle(os.path.join(spatialLIBD_dir, 'spatialLIBD_spot_counts.pkl'))
# st = pd.read_pickle(os.path.join(spatialLIBD_dir, 'spatialLIBD_spot_st.pkl'))
# gene_meta = pd.read_pickle(os.path.join(spatialLIBD_dir, 'gene_meta.pkl'))
# cell_type = pd.read_pickle(os.path.join(spatialLIBD_dir, 'RowDataTable1.pkl'))
# csr = pd.read_pickle(os.path.join(spatialLIBD_dir, 'spatialLIBD_csr_counts_sample_id.pkl'))

In [None]:
# Print the name of every loaded table, one per line, then render only `csr`
# (the display calls for the other four tables are switched off).
print('spots', 'st', 'gene_meta', 'cell_type', 'csr', sep='\n')
display(csr)

In [None]:
# List every column label of `spots`; a later cell selects a subset of these
# columns by name.
print(spots.columns)

In [None]:
# Relabel the three columns of `st` as spot / X / Y and preview the first rows.
st = st.set_axis(["spot", "X", "Y"], axis="columns")
print(st.head())

In [None]:
# Keep only the per-spot metadata columns of `spots`.
spot = spots.loc[:, [
    'sample_id', 'key', 'subject', 'replicate',
    'Cluster', 'sum_umi', 'sum_gene', 'cell_count',
    'in_tissue', 'spatialLIBD', 'array_col', 'array_row',
]]
print(spot)

In [None]:
# Attach the spot metadata to the coordinates in `st`.  `st` carries no key
# column to merge on, so rows are paired by their default integer index, i.e.
# by position: this relies on both files listing the spots in the same order.
spot_meta = st.join(spot.reset_index(), how='left')
print(spot_meta)


In [None]:
# `spot_meta` was built by pairing rows of `st` and `spots` by position, so
# verify row by row that the spot identifier from `st` ("spot") equals the one
# carried over from the `spots` index ("index").  A set-membership test with
# `isin` would still pass if the two tables were ordered differently.
assert (spot_meta['spot'] == spot_meta['index']).all(), (
    "st and spots are not row-aligned: spot identifiers differ between the joined tables"
)

In [None]:
def plot_cell_layers(df, figsize=None):
    """Scatter-plot the spots of every sample in its own panel, one series per layer label.

    One subplot is created per distinct ``sample_id`` (the panel count is no
    longer fixed at 12).  Within a panel, one scatter series is drawn for each
    distinct ``spatialLIBD`` value found in the whole frame, always in the same
    order, so a given label gets the same colour in every panel.  Points are
    drawn at (-Y, X).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``sample_id``, ``spatialLIBD``, ``X`` and ``Y``.
    figsize : tuple of float, optional
        Figure size in inches.  When omitted, the width grows with the number
        of panels at 50/12 inches each (50 x 6 for twelve samples, matching
        the previous fixed layout).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    layer_idx = df["spatialLIBD"].unique()
    samples = df["sample_id"].unique()

    # At least one panel so that an empty frame still yields a valid figure.
    n_panels = max(len(samples), 1)
    if figsize is None:
        figsize = (50 / 12 * n_panels, 6)

    # squeeze=False always returns a 2-D grid, even for a single panel;
    # row 0 is the one row of axes.
    _, axes = plt.subplots(nrows=1, ncols=n_panels, figsize=figsize, squeeze=False)
    axes = axes[0]

    for ax, sample in zip(axes, samples):
        cells_of_sample = df[df["sample_id"] == sample]
        for layer in layer_idx:
            # NOTE: if the label column contains NaN, `== layer` never matches
            # it, so spots without a label are not drawn (the NaN entry only
            # adds an empty series).
            cells_of_layer = cells_of_sample[cells_of_sample["spatialLIBD"] == layer]
            ax.scatter(-cells_of_layer["Y"], cells_of_layer["X"], label=layer)
        ax.set_title(sample)

    # A single legend on the last panel; every panel uses the same series order.
    axes[-1].legend()
    plt.show()

In [None]:
# Draw the per-sample layer maps.  The function shows the figure itself and
# returns None, so its result is not printed.
plot_cell_layers(spot_meta)

In [None]:
# Show the `cell_type` table as loaded from RowDataTable1.csv, before it is
# re-indexed and trimmed in the following cells.
print(cell_type)

In [None]:
# Use the "Symbol" column as the row index.  Not re-runnable on its own: after
# the first execution "Symbol" is no longer a column, so a second run raises
# KeyError.
cell_type = cell_type.set_index("Symbol")

In [None]:
# Keep the first three columns (still indexed by Symbol) before the next cell
# drops columns from `cell_type`; the "ID" column of this frame is used later
# to build the ID -> symbol mapping.
# NOTE(review): presumably these three columns are 'Unnamed: 0',
# 'gene_biotype' and 'ID' -- confirm against the CSV column order.
cell_type_idx_df = cell_type.iloc[:,:3]

In [None]:
# Remove the three identifier/annotation columns, leaving the remaining
# columns indexed by Symbol.
cell_type = cell_type.drop(columns=['Unnamed: 0', 'gene_biotype', "ID"])

In [None]:
# Unbind the frames that the remaining cells do not use, so their memory can
# be reclaimed.
del spots, spot, gene_meta, st

In [None]:
# Reshape the long `csr` table into one row per (sample_id, spot) and one
# column per gene, holding the "count" values.  Combinations absent from `csr`
# become 0, and the result is stored with a sparse float dtype whose fill
# value is 0.0.
# NOTE(review): pivot_table aggregates with its default (mean) when a
# (sample_id, spot, gene) combination occurs more than once -- confirm the
# combinations are unique, otherwise counts are averaged rather than summed.
wide = csr.pivot_table(values="count", index=["sample_id", "spot"], columns="gene")
wide = wide.fillna(0)
wide = wide.astype(pd.SparseDtype("float", 0.0))


In [None]:
# `counts_df` is a second name for the same object as `wide` (no copy is
# made), so in-place changes made through either name are visible through both.
counts_df = wide
print(counts_df)

In [None]:
# Map each value of the "ID" column to the Symbol that indexes its row; when an
# ID occurs more than once, the last row wins.
ID_to_symbol_d = dict(zip(cell_type_idx_df['ID'], cell_type_idx_df.index))

In [None]:
# Relabel the gene columns from gene ID to gene symbol.  Labels that are not a
# key of the mapping are kept unchanged instead of becoming NaN, which
# (a) keeps unmapped genes distinguishable and (b) makes the cell safe to
# re-run: on a second run the labels are already symbols and are left alone.
# `counts_df` and `wide` are the same object, so both see the new labels.
counts_df.columns = counts_df.columns.map(
    lambda gene_id: ID_to_symbol_d.get(gene_id, gene_id)
)
print(counts_df)

In [None]:
# Use the spots of every sample.  The disabled line below would instead
# restrict the analysis to sample 151673.
# dlpfc = spot_meta[spot_meta['sample_id'] == 151673]
dlpfc = spot_meta

In [None]:
# Index the metadata by (sample_id, spot), the same two keys that index the
# rows of `counts_df`, so the two frames can be aligned on them.
# `set_index` returns a new frame, so `spot_meta` itself is left unchanged.
dlpfc = dlpfc.set_index(['sample_id', 'spot'])

In [None]:
# Inspect the metadata after re-indexing by (sample_id, spot).
print(dlpfc)

In [None]:
# Place the metadata and the gene counts side by side, keeping only the
# (sample_id, spot) keys present in both frames.  The metadata columns come
# first, followed by the gene columns.
temp = pd.concat(objs=[dlpfc, counts_df], axis="columns", join="inner")
print(temp)

In [None]:
# Keep only the gene columns.  The concat placed every `dlpfc` column first,
# so slice by that frame's width instead of a literal.  `dlpfc` has 14 columns
# (3 from `st` + 13 from `spot.reset_index()` - 2 moved into the index), so
# the previous hard-coded 15 also discarded the first gene column.
# Re-running this cell alone would strip further columns; re-run the concat
# cell first.
temp = temp.iloc[:, dlpfc.shape[1]:]
print(temp)

In [None]:
# Rows of `cell_type` whose index label also appears among the columns of `temp`.
same_genes = cell_type.loc[cell_type.index.isin(temp.columns)]
print(same_genes)

In [None]:
# Save the spot-by-gene count matrix (all genes, columns relabelled by the
# earlier mapping cell) as counts_df.pkl in the data folder.
counts_df.to_pickle(os.path.join(spatialLIBD_dir, 'counts_df.pkl'))

In [None]:
# Show the (sample_id, spot)-indexed metadata once more before it is saved.
print(dlpfc)

In [None]:
# Save the (sample_id, spot)-indexed spot metadata as dlpfc.pkl in the data folder.
dlpfc.to_pickle(os.path.join(spatialLIBD_dir, 'dlpfc.pkl'))

In [None]:
# Save `temp` (the inner-joined frame after its leading columns were sliced
# off) as temp.pkl in the data folder.
temp.to_pickle(os.path.join(spatialLIBD_dir, 'temp.pkl'))

In [None]:
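# NOTE: `spots` was deleted earlier in this notebook (`del spots`); reload it
# before uncommenting this cell or the exploratory cells that follow.
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     display(spots.iloc[0])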

In [None]:
# spots[spots.sample_id == 151673].cell_count.value_counts()

In [None]:
# sns.displot(spots[spots.sample_id == 151673].cell_count)

In [None]:
# spots[spots.sample_id == 151673].cell_count.describe()