In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm, Normalize
import seaborn as sb
import os
import sys
sys.setrecursionlimit(1000000)
from scipy.stats import spearmanr
import pathlib


In [None]:
# Locations of the processed barcode tables and of the LCM registration outputs.
directory = pathlib.PurePath(
    '/camp/lab/znamenskiyp/home/shared/projects/turnerb_MAPseq/A1_MAPseq/FIAA32.6a/Sequencing/Processed_data/BC_split/temp/increased_cutoff/'
)
lcm_reg_dir = pathlib.PurePath(
    '/nemo/lab/znamenskiyp/home/shared/code/MAPseq_processing/AC_MAPseq/Brain1_FIAA32.6a/LCM_registration'
)

# Pickled table of barcode counts; its columns are dropped by RT primer number
# in later cells.
barcodes_across_sample = pd.read_pickle(directory / 'raw_barcodes_across_sample_higher_cutoff.pkl')

# Registration tables: volume of each brain area within each sample, and which
# RT primer corresponds to which sample name. They are joined on 'sample' and
# ordered by RT primer.
RTtosample = pd.read_csv(lcm_reg_dir / 'RTprimer_tosample.csv')
areas = (
    pd.read_csv(lcm_reg_dir / '3d_areas.csv')
    .merge(RTtosample, how='inner', on='sample')
    .sort_values("RT_primer")
)

In [None]:
# Spike-in normalisation: build a table of the total spike-in count per sample.
# One record per "spikecounts_*" file is collected in a list and converted to a
# DataFrame once. Growing the DataFrame with pd.concat inside the loop is
# quadratic, gives every row the index label 0 and leaves object-dtype columns.
spike_prefix = "spikecounts_"
spike_records = []
for sample in sorted(os.listdir(directory)):
    # Requiring the underscore avoids an IndexError on names that merely start
    # with "spikecounts".
    if not sample.startswith(spike_prefix):
        continue
    # Strip the prefix and the trailing ".csv"-length suffix from the file name.
    sample_name = sample[len(spike_prefix): -len(".csv")]
    # The first two characters of the remaining name are skipped before the
    # number is parsed (presumably an "RT"-style prefix -- TODO confirm).
    sample_num = float(sample_name[2:])
    sample_reading = pd.read_csv(directory / sample)
    sum_counts = sample_reading["counts"].astype("int").sum()
    spike_records.append({"sample": sample_num, "spike_count": sum_counts})

spike_counts = pd.DataFrame(spike_records, columns=["sample", "spike_count"])


In [None]:
# Drop samples whose total spike-in count is below `min_spike`, as RT likely
# failed for these samples.
min_spike = 1500
# .copy() so that columns can later be added to these filtered tables without
# pandas' SettingWithCopyWarning.
spike_thresholded = spike_counts[spike_counts['spike_count'] >= min_spike].copy()

# True for registration rows whose RT primer passed the spike threshold.
passed_rt = areas['RT_primer'].isin(spike_thresholded['sample'])
areas_dropped = areas.loc[~passed_rt, 'RT_primer']
areas = areas[passed_rt].copy()

# Remove the barcode-count columns of the samples that failed the threshold.
barcodes_across_sample = barcodes_across_sample.drop(columns=np.array(areas_dropped))

In [None]:
# Additionally remove sample 5, which has no registration info.
# NOTE(review): the `areas` row is removed by index label 4, not by its
# RT_primer value -- presumably label 4 is the row for sample 5; confirm.
areas = areas.drop(index=[4])
barcodes_across_sample = barcodes_across_sample.drop(labels=[5], axis=1)

In [None]:
# Set counts of exactly 1 to 0 and fill missing entries with 0, then keep only
# the barcodes that still have a non-zero count in at least one sample.
barcodes_across_sample = barcodes_across_sample.replace(1, 0).fillna(0)
has_any_count = (barcodes_across_sample != 0).any(axis=1)
barcodes_across_sample = barcodes_across_sample.loc[has_any_count]


In [None]:
# Debug output: print the values of the row at position 2, numbered from 1 ...
for index, i in enumerate(barcodes_across_sample.iloc[2], start=1):
    print(index, i)
# ... followed by the column labels, numbered the same way.
for index, column in enumerate(barcodes_across_sample.columns, start=1):
    print(index, column)

In [None]:
# Barcodes with a non-zero count in at least one of the columns 1, 2, 3, 4.
bla = barcodes_across_sample.loc[:, [1, 2, 3, 4]]
bla = bla[(bla != 0).any(axis=1)]
bla

In [None]:
# Normalise counts by spike-in counts: each sample's factor is its spike count
# divided by the lowest spike count among the retained samples.
lowest = spike_thresholded["spike_count"].min()
# Built with assign() on a new frame rather than by writing a column into the
# filtered table; astype(float) keeps the factor numeric even if the spike
# counts were stored with object dtype.
spike_thresholded = (
    spike_thresholded
    .assign(normalisation_factor=spike_thresholded["spike_count"].astype(float) / lowest)
    .set_index('sample')
    .sort_index()
)
norm = spike_thresholded['normalisation_factor']

# Divide every sample column by its factor (aligned on the column labels);
# entries with no matching factor become 0.
barcodes_across_sample_changed = barcodes_across_sample.div(norm, axis='columns').fillna(0)

# Heatmap of the barcode matrix after spike normalisation: plot the normalised
# table, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(60, 10))
sb.heatmap(barcodes_across_sample_changed, cmap='Blues', norm=LogNorm(), ax=ax)
plt.show()

In [None]:
# Label sample 29 as caudal striatum: its whole area volume is assigned to a
# single 'Cstr' entry and every other area entry for that row is set to 0.
CStr = areas[areas['RT_primer'] == 29]
id_columns = ['RT_primer', 'sample']
# Sum over the area columns only (identifier columns excluded).
CStr_val = CStr.drop(columns=id_columns).sum(axis=1)
# The identifier columns are deliberately left out of `to_add`: a later
# areas.update(to_add) overwrites every non-missing entry, so including them
# with 0 would erase that row's 'sample' and 'RT_primer' values.
to_add = pd.DataFrame(0, index=CStr.index, columns=areas.columns.drop(id_columns))
to_add['Cstr'] = CStr_val
to_add

In [None]:
# Add a 'Cstr' column initialised to 0, then overwrite `areas` in place with
# the non-missing entries of `to_add` (matched on index and column labels).
areas = areas.assign(Cstr=0)
areas.update(to_add)

In [None]:
# Mapping from group name to the `areas` columns that are summed into it.
# 'contra' collects every column whose name contains "Contra".
group_areas = dict(
    tectum=['SCdg', 'SCdw', 'SCig', 'SCiw', 'SCop', 'SCsg', 'SCzo', 'ICc', 'ICd', 'ICe', 'NB'],
    thalamus=['PoT', 'TH', 'MGm', 'MGv', 'MGd', 'LGd-co', 'LP', 'POL', 'PO', 'LD', 'VPL', 'PIL', 'Eth'],
    SS=['SSp-bfd', 'SSp-ll', 'SSp-m', 'SSp-n', 'SSp-tr', 'SSp-ul', 'SSp-un', 'SSs'],
    M=['MOs', 'MOp'],
    contra=areas.filter(like="Contra").columns,
    # Not used: separate contralateral groups
    #   AudC: ['Contra-AUDd', 'Contra-AUDp', 'Contra-AUDv']
    #   VisC: ['Contra-VISa', 'Contra-VISam']
    VisIP=['VISa', 'VISal', 'VISam', 'VISl', 'VISp', 'VISpm', 'VISpor', 'VISrl', 'VISli', 'VISpl'],
    RStr=['CP', 'STR', 'ACB'],
    pons=['SOCm', 'SOCl', 'POR', 'PRNr', 'PRNc', 'TRN', 'P', 'P-mot', 'PG', 'NLL'],
)

In [None]:
# Replace each group's member columns by a single column holding their row-wise
# sum. Members absent from `areas` are ignored by filter(); the drop() call
# requires every listed member to be present.
for group, columns in group_areas.items():
    group_total = areas.filter(items=columns).sum(axis=1)
    areas[group] = group_total
    areas = areas.drop(columns=columns)

In [None]:
# Identifier columns plus the labels excluded from the area table.
excluded_columns = [
    'sample', 'RT_primer',
    'ar', 'bic', 'bsc', 'ccb', 'ccg', 'cing', 'cpd', 'csc', 'cst', 'ec', 'fa', 'fi',
    'fiber tracts', 'fp', 'll', 'mcp', 'ml', 'onl', 'or', 'py', 'root', 'sctv', 'scwm',
    'tb', 'CTXsp', 'act', 'alv', 'amc', 'cic', 'VL', 'MRN',
]
areas_only = areas.drop(columns=excluded_columns)

In [None]:
# Keep only the areas with a positive total across samples.
areas_only = areas_only.loc[:, areas_only.sum(axis=0) > 0]
# Take an explicit float copy: to_numpy() may return a view of the DataFrame's
# data, so dividing it in place could silently rescale `areas_only` as well
# (or fail on an integer / read-only array).
areas_matrix = areas_only.to_numpy(dtype=float, copy=True)
# Scale every area column so that it sums to 1 across samples.
areas_matrix = areas_matrix / areas_matrix.sum(axis=0)

In [None]:
# Display the area table after removing the all-zero columns.
areas_only

In [None]:

# Build the row-normalised barcode matrix: every barcode's counts are divided
# by its total across samples, and barcodes with a zero total are dropped.
barcodes_across_sample = barcodes_across_sample.fillna(0)
# Explicit float copy: to_numpy() may return a view of the DataFrame's data, so
# an in-place division would silently normalise `barcodes_across_sample` too,
# and later cells read it as raw counts.
barcodes_matrix = barcodes_across_sample.to_numpy(dtype=float, copy=True)
total_projection_strength = barcodes_matrix.sum(axis=1)

# Filter before dividing so that no row is divided by a zero total.
has_signal = total_projection_strength > 0
barcodes_matrix = barcodes_matrix[has_signal] / total_projection_strength[has_signal, np.newaxis]

In [None]:
# Display the normalised barcode matrix as a DataFrame (default integer labels).
pd.DataFrame(barcodes_matrix)

In [None]:
from sklearn.linear_model import LinearRegression, Lasso

# Linear regression without an intercept and with coefficients constrained to
# be non-negative. `areas_matrix` is the design matrix and the transposed
# barcode matrix supplies the targets.
regression_options = dict(fit_intercept=False, positive=True)
mdl = LinearRegression(**regression_options)
mdl.fit(X=areas_matrix, y=barcodes_matrix.T)

In [None]:
# Raw (not spike-normalised) counts, with the higher cutoff.
# Fitted coefficients (first 15000 rows at most) labelled by area.
df = pd.DataFrame(mdl.coef_[:15000, :], columns=areas_only.columns)
# clustermap is a figure-level function that creates its own figure, so the
# size has to be given through `figsize` rather than a separate plt.figure().
sb.clustermap(df.T, vmax=0.1, dendrogram_ratio=[0.1, 0.1], yticklabels=True, figsize=(20, 70))

In [None]:
# Columns to compare, and the matching slice of the coefficient table.
areas_comp = ['MOB', 'M', 'Cstr', 'RStr', 'tectum', 'thalamus', 'contra', 'VisIP', 'SS']
areas_grouped = df.loc[:, areas_comp]

In [None]:
# Keep only the rows with a non-zero value in at least one selected column.
areas_grouped = areas_grouped.loc[(areas_grouped != 0).any(axis=1)]

In [None]:
# Raw (not spike-normalised) counts, with the higher cutoff.
# clustermap is a figure-level function that creates its own figure, so the
# size has to be given through `figsize` rather than a separate plt.figure().
sb.clustermap(
    areas_grouped.T,
    cmap="Blues",
    dendrogram_ratio=(.1, .2),
    vmax=0.05,
    cbar_pos=(-0.2, .5, .1, .4),
    yticklabels=True,
    figsize=(10, 50),
)

In [None]:
# Table of the fraction of each sample that every brain area makes up: each
# row of `areas_only` is divided by its own total.
areas_matrix = areas_only.to_numpy()
total_frac = areas_matrix.sum(axis=1)
frac_matrix = areas_matrix / total_frac.reshape(-1, 1)
areasFrac = pd.DataFrame(data=frac_matrix, columns=areas_only.columns)

In [None]:
# Display the area table that the fractions were computed from.
areas_only

In [None]:
# Zero-filled barcode-by-region table (one row per barcode, one column per
# area) that receives the per-region counts estimated from the LCM area
# fractions (assumes barcodes are spread evenly within a sample).
bc_matrix = pd.DataFrame(
    0.0,
    index=barcodes_across_sample.index,
    columns=areas_only.columns,
)

In [None]:
# For every barcode: spread its per-sample counts over the regions of each
# sample in proportion to `frac_matrix`, sum over samples, and divide by the
# region's column total in `areas_only`.
# assumes the rows of frac_matrix are in the same order as the columns of
# barcodes_across_sample -- TODO confirm
# The per-region totals do not depend on the barcode, so compute them once.
region_totals = areas_only.sum(axis=0).to_numpy()
for i, row in barcodes_across_sample.iterrows():
    counts = row.to_numpy()
    # One product per barcode is enough: it already covers every sample, so no
    # loop over the sample columns is needed.
    frac_counts = frac_matrix * counts[:, np.newaxis]
    # nansum skips missing fractions, like the pandas column sum it replaces.
    bc_matrix.loc[i, :] = np.nansum(frac_counts, axis=0) / region_totals
#bc_matrix.to_pickle(lcm_reg_dir/'bc_matrix_lcm_2.pkl')

In [None]:
# Copy of the barcode-by-region table without the 'AUDpo' column.
bc_matrix1 = bc_matrix.drop(columns=['AUDpo'])

In [None]:
# Columns to compare, and the matching slice of the barcode-by-region table.
areas_comp = ['MOB', 'M', 'Cstr', 'RStr', 'tectum', 'thalamus', 'contra', 'VisIP', 'SS']
areas_grouped = bc_matrix.loc[:, areas_comp]

In [None]:
# Keep only the barcodes with a non-zero value in at least one selected column.
areas_grouped = areas_grouped.loc[(areas_grouped != 0).any(axis=1)]
areas_grouped

In [None]:
# Hierarchical clustering (euclidean metric) of the grouped barcode table,
# drawn on a log colour scale.
clustermap_options = dict(
    metric='euclidean',
    standard_scale=0,
    norm=LogNorm(),
    cmap="Blues",
    figsize=(10, 10),
)
sb.clustermap(areas_grouped, **clustermap_options)

In [None]:
# Scratch check: rebuild the per-region values of barcode `i` from the current
# `frac_counts` (both carried over from the barcode loop).
sample_counts = pd.DataFrame(frac_counts, columns=areas_only.columns)
for region, region_count in sample_counts.sum(axis=0).items():
    bc_matrix.at[i, region] = region_count / areas_only[region].sum()

In [None]:
# Scratch check: same per-region refill of barcode `i` from `sample_counts`.
region_sums = sample_counts.sum(axis=0)
for region in sample_counts.columns:
    bc_matrix.at[i, region] = region_sums[region] / areas_only[region].sum()

In [None]:
# Column totals of `sample_counts` (one value per region).
sample_counts.sum()

In [None]:
# Summed thalamus counts divided by the thalamus column total of `areas_only`.
# The denominator names 'thalamus' explicitly instead of using the leftover
# loop variable `region`, which holds whichever region was iterated last.
sample_counts['thalamus'].sum() / areas_only['thalamus'].sum()

In [None]:
# Column total of `areas_only` for the region currently held in `region`
# (a leftover loop variable from an earlier cell).
areas_only[region].sum()

In [None]:
# Display the barcode table.
barcodes_across_sample

In [None]:
# Total over all area columns for the first row of `areas_only`.
areas_only.iloc[0].sum()

In [None]:
# Print every value of the `areas_only` column named by `region`, one per line.
# Note that this reuses (and overwrites) the name `i`.
for i in areas_only[region].tolist():
    print(i)

In [None]:
# Scratch check on the barcode at position 2: scale each row of `frac_matrix`
# by that barcode's count in the corresponding column.
row = barcodes_across_sample.iloc[2].to_numpy()
bl = frac_matrix * row.reshape(-1, 1)
f = pd.DataFrame(data=bl, columns=areas_only.columns)

In [None]:
# Display the scaled fraction table built from `bl`.
f

In [None]:
# Row at position 32 of the scaled fraction table.
f.iloc[32]

In [None]:
# Scratch check: scale the area-fraction table by the count of each sample in
# turn. enumerate() supplies the running position (the previous `ind = +1`
# reset it to 1 on every pass), and np.asarray lets `row` be indexed by
# position whether it currently holds a numpy array or a pandas Series.
row_values = np.asarray(row)
for ind, samplename in enumerate(barcodes_across_sample.columns):
    bl = row_values[ind] * areasFrac
    print(samplename)

In [None]:
# (rows, columns) of the area-fraction table.
areasFrac.shape

In [None]:
# Row of the barcode table with index label 2 (label-based, unlike .iloc[2]).
barcodes_across_sample.loc[2]

In [None]:
# Print every value of the barcode row with index label 2, one per line.
# Note that this reuses (and overwrites) the name `i`.
for i in barcodes_across_sample.loc[2].tolist():
    print(i)

In [None]:
# Multiply `row` by the area-fraction table.
# NOTE(review): how `row` lines up with areasFrac (per sample row vs per region
# column) depends on what `row` currently holds -- confirm the intended axis.
bl =row*areasFrac


In [None]:
# Display the area-fraction table.
areasFrac

In [None]:
#for each barcode, create a matrix of BC count for regions in a sample based on amount of each region in LCM (makes assumption of equal BC distribution)
bc_matrix = np.zeros(shape=((len(newdf), (len(areas_only.columns)))))
bc_matrix = pd.DataFrame(data= bc_matrix, columns=areas_only.columns, index=newdf.index)
for i, row in newdf.iterrows():
    bc_matrix1 =pd.DataFrame(columns=areas_only.columns)
    for samplename in newdf.columns:
        ind = areas.index[areas['RT_primer']==samplename].tolist()
        fractionC = areasFrac.iloc[ind[0]]*row.loc[samplename]
        bc_matrix1 = bc_matrix1.append(fractionC)
    for region in bc_matrix1.columns:
        bc_matrix.at[i, region] = bc_matrix1[region].sum()/areas_only[region].sum()
#bc_matrix.to_pickle(lcm_reg_dir/'bc_matrix_lcm_2.pkl')