In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm, Normalize
import seaborn as sb
import os
import sys
sys.setrecursionlimit(1000000)
from scipy.stats import spearmanr
import pathlib


Load barcodes and LCM registration files and assign counts to areas

In [None]:
# Folder that holds the final barcode count tables for this brain.
directory = pathlib.PurePath('/camp/lab/znamenskiyp/home/shared/projects/turnerb_MAPseq/A1_MAPseq/FIAA32.6a/Sequencing/Processed_data/final_counts/')
# NOTE(review): unpickling runs arbitrary code; only load files produced by this pipeline.
barcodes_across_sample = pd.read_pickle(directory.joinpath('barcodes_across_sample.pkl'))

In [None]:
# Load the LCM registration tables: the volume of each brain area within each
# sample, and the table saying which RT primer corresponds to which sample name.
lcm_reg_dir = pathlib.PurePath('/nemo/lab/znamenskiyp/home/shared/code/MAPseq_processing/AC_MAPseq/Brain1_FIAA32.6a/LCM_registration')
areas = pd.read_csv(lcm_reg_dir.joinpath('3d_areas.csv'))
RTtosample = pd.read_csv(lcm_reg_dir.joinpath('RTprimer_tosample.csv'))
# Inner join on 'sample': only samples present in both tables are kept.
areas = pd.merge(areas, RTtosample, on='sample', how='inner')

# Sub-regions to collapse into a single column each. 'contra' collects every
# column whose name contains "Contra" at the time this dict is built.
group_areas = {
    'SC': ['SCdg', 'SCdw', 'SCig', 'SCiw', 'SCop', 'SCsg', 'SCzo'],
    'IC': ['ICc', 'ICd', 'ICe'],
    'SSp': ['SSp-bfd', 'SSp-ll', 'SSp-m', 'SSp-n', 'SSp-tr', 'SSp-ul', 'SSp-un'],
    'contra': areas.filter(like="Contra").columns,
    'striatum': ['CP', 'STR', 'ACB'],
    'pons': ['SOCm', 'SOCl', 'POR', 'PRNr', 'PRNc', 'TRN', 'P', 'P-mot'],
}

# Replace each group's member columns with one column holding their row-wise sum.
for group_name, member_columns in group_areas.items():
    group_total = areas.filter(items=member_columns).sum(axis=1)
    areas[group_name] = group_total
    # Unlike filter(), drop() raises KeyError if a listed column is absent.
    areas = areas.drop(columns=member_columns)

# Keep only the columns used as regression targets: drop the sample / RT primer
# identifiers and the listed structures.
# NOTE(review): the dropped labels look like fibre tracts and other non-target
# structures — confirm the list ('ccb' appears twice, which is harmless).
areas_only = areas.drop(
    columns=[
        'sample', 'RT_primer', 'ar', 'bic', 'bsc', 'ccb', 'ccb', 'ccg', 'cing', 'cpd', 'csc', 'cst', 'ec', 'fa', 'fi',
        'fiber tracts', 'fp', 'll', 'mcp', 'ml', 'onl', 'or', 'py', 'root', 'sctv', 'scwm', 'tb', 'CTXsp', 'act', 'alv', 'amc', 'cic', 'TH',
    ]
)

# Scale every area column so that it sums to 1 across samples.
areas_matrix = areas_only.to_numpy()
areas_matrix = areas_matrix / areas_matrix.sum(axis=0)

# Row-normalised barcode counts as a plain float array (NaN treated as 0).
# Take an explicit copy: `to_numpy()` may hand back a view of the DataFrame's
# data, in which case the in-place edits below would silently rewrite
# `barcodes_across_sample` (or fail on a read-only view) and break the
# NaN-based filtering done in later cells.
barcodes_matrix = barcodes_across_sample.to_numpy(dtype=float, copy=True)
barcodes_matrix[np.isnan(barcodes_matrix)] = 0
# Each row is divided by its own total so that it sums to 1.
# NOTE(review): a row with no counts at all has a zero total and becomes NaN.
barcodes_matrix /= barcodes_matrix.sum(axis=1, keepdims=True)

In [None]:
from sklearn.linear_model import LinearRegression

# Least squares with non-negative coefficients and no intercept: every column
# of `barcodes_matrix.T` is regressed on the columns of `areas_matrix`.
# NOTE(review): the fit pairs row k of `areas_matrix` with column k of
# `barcodes_across_sample`; nothing in this notebook re-orders either table,
# so confirm that both list the samples in the same order.
mdl = LinearRegression(positive=True, fit_intercept=False)
mdl.fit(X=areas_matrix, y=barcodes_matrix.T)

In [None]:
# Clustered heatmap of the fitted coefficients for the first 15000 rows of
# `mdl.coef_`, with areas on the rows (hence the transpose).
df = pd.DataFrame(mdl.coef_[:15000, :], columns=areas_only.columns)
# `clustermap` always builds its own figure, so a preceding `plt.figure(figsize=...)`
# only leaves an empty figure behind; the size has to be passed here instead.
sb.clustermap(df.T, vmax=0.01, dendrogram_ratio=[0.1, 0.1], yticklabels=True, figsize=(20, 70))


In [None]:
# Bar plot summarising the coefficient table `df`, one bar per column,
# with the x tick labels turned vertical so they stay readable.
fig, ax = plt.subplots(figsize=(20, 5))
sb.barplot(df, ax=ax)
plt.xticks(rotation=90)

In [None]:
# Remove barcodes that are seen in too few samples: keep a row only if it is
# NaN in fewer than 90 sample columns. (Per the original note this QC step is
# optional.)
# NOTE(review): 90 presumably corresponds to "all samples but one" — confirm
# against the number of sample columns.
# The count is vectorised; the earlier per-row chained assignment
# (`df[col].iloc[index] = ...`) does not write through under copy-on-write and
# used index labels as positions.
n_samples_missing = barcodes_across_sample.isna().sum(axis=1)
barcodes_across_sample = barcodes_across_sample.loc[n_samples_missing < 90]


In [None]:
# Replace NaN with 0 and renumber the rows 0..n-1.
barcodes_across_sample = barcodes_across_sample.fillna(0).reset_index(drop=True)

# Rescale every row so that its smallest non-zero value becomes 1.
smallest_nonzero = barcodes_across_sample.where(barcodes_across_sample != 0).min(axis=1)
# A row of only zeros has no non-zero minimum (NaN here); dividing by 1 leaves
# it as zeros instead of raising, as the per-row `np.min` on an empty selection did.
barcodes_across_sample = barcodes_across_sample.div(smallest_nonzero.fillna(1), axis=0)

In [None]:
# Keep barcodes whose normalised count reaches the minimum in at least one of
# the source-site sample columns.
source_samples = [40, 41, 42, 43, 49, 50, 51, 52]
min_source_count = 20

# One selection per source column, stacked in the order listed above, then
# de-duplicated on the row index so that every barcode appears once (first
# occurrence wins). This reproduces the row order of the previous eight
# copy-pasted selections without the intermediate `barcodes_norm_sub*` frames.
per_source_hits = [
    barcodes_across_sample.loc[barcodes_across_sample[sample] >= min_source_count]
    for sample in source_samples
]
newdf = pd.concat(per_source_hits)
newdf = newdf[~newdf.index.duplicated(keep='first')]  # remove duplicate barcodes

In [None]:
# Heatmap of the whole `barcodes_across_sample` table on a logarithmic colour scale.
fig, ax = plt.subplots(figsize=(60, 10))
sb.heatmap(barcodes_across_sample, norm=LogNorm(), ax=ax)
plt.show()

In [None]:
# Same heatmap, restricted to the barcodes that passed the source-site
# minimum-count selection (`newdf`), on a logarithmic colour scale.
fig, ax = plt.subplots(figsize=(60, 10))
sb.heatmap(newdf, norm=LogNorm(), ax=ax)
plt.show()

In [None]:
# Drop the source-site sample columns and the negative-control columns.
# Re-running this cell raises KeyError because the columns are already gone.
newdf = newdf.drop(columns=[40, 41, 42, 43, 49, 50, 51, 52, 1, 2, 3, 4, 5])


In [None]:
# Fraction of each sample made up by each brain area: every row of
# `areas_only` divided by that row's total.
# `total` keeps its one-column ('sum') layout because a later cell reads it.
total = pd.DataFrame()
total['sum'] = areas_only.sum(axis=1)
# Vectorised row-wise division. The previous loop relied on `DataFrame.append`,
# which was removed in pandas 2.0, and indexed `total` by label as if it were a position.
areasFrac = areas_only.div(total['sum'], axis=0)

In [None]:
# Row-wise fractions of `areas_only`. `DataFrame / Series` would align the
# Series' row labels with the column names and give an all-NaN table, so the
# division has to be done along axis=0 explicitly.
areas_only.div(areas_only.sum(axis=1), axis=0)

In [None]:
# For each barcode, spread the count seen in every sample over the brain
# regions in proportion to how much of that sample each region makes up
# (this assumes barcodes are evenly distributed within a sample), sum the
# contributions per region, and divide by the region's total over all samples.
#
# The whole computation is one matrix product; the previous nested loop grew
# DataFrames with `DataFrame.append`, which was removed in pandas 2.0.

# Positional row of `areas` for every sample column of `newdf`.
# Assumes `areasFrac` has one row per row of `areas`, in the same order.
rt_primers = areas['RT_primer'].to_numpy()
sample_rows = []
for samplename in newdf.columns:
    matches = np.flatnonzero(rt_primers == samplename)
    if matches.size == 0:
        raise KeyError(f"sample column {samplename!r} has no matching RT_primer in the LCM registration table")
    sample_rows.append(matches[0])  # first match, as before

sample_fractions = areasFrac.loc[:, areas_only.columns].iloc[sample_rows].to_numpy(dtype=float)
counts = newdf.to_numpy(dtype=float)
# pandas' `.sum()` skipped NaN terms in the loop version; zeroing them here
# gives the same totals from the matrix product.
sample_fractions = np.where(np.isnan(sample_fractions), 0.0, sample_fractions)
counts = np.where(np.isnan(counts), 0.0, counts)

region_totals = areas_only.sum(axis=0).to_numpy(dtype=float)
bc_matrix = pd.DataFrame(
    (counts @ sample_fractions) / region_totals,
    columns=areas_only.columns,
    index=newdf.index,
)
bc_matrix.to_pickle(lcm_reg_dir/'bc_matrix_lcm_2.pkl')

In [None]:
# Reload the saved bc_matrix (counts in each region for each barcode) to avoid
# repeating the computation above.
# NOTE(review): unpickling runs arbitrary code; only load files produced by this notebook.
bc_matrix = pd.read_pickle(lcm_reg_dir.joinpath('bc_matrix_lcm_2.pkl'))



In [None]:
# Drop every column whose values sum to zero, then every row made up only of zeros.
column_sums = bc_matrix.sum(axis=0)
bc_matrix = bc_matrix.loc[:, column_sums != 0]
all_zero_rows = (bc_matrix == 0).all(axis=1)
bc_matrix = bc_matrix.loc[~all_zero_rows]

In [None]:
# Hierarchical clustering (Euclidean distance) of all barcodes across regions;
# `standard_scale=0` rescales each row to the 0-1 range before plotting.
sb.clustermap(
    bc_matrix,
    metric='euclidean',
    standard_scale=0,
    cmap="Blues",
    figsize=(60, 10),
)

It may be worth thresholding on a minimum barcode count. I have not done this so far, but setting a minimum might be useful.

In [None]:
# Set every value below the minimum counts/cm3 threshold to zero.
threshold = 0.0000001
# `mask` keeps the row/column labels (wrapping `np.where` in a new DataFrame
# discarded them) and zeroes the values BELOW the threshold; the earlier
# condition was inverted and zeroed everything above it instead.
bc_matrix_thresholded = bc_matrix.mask(bc_matrix < threshold, 0)

# Remove columns that sum to zero and rows that are all zeros. The row mask is
# built from the thresholded frame itself, not from `bc_matrix`.
bc_matrix_thresholded = bc_matrix_thresholded.loc[:, bc_matrix_thresholded.sum(axis=0) != 0]
bc_matrix_thresholded = bc_matrix_thresholded.loc[~(bc_matrix_thresholded == 0).all(axis=1)]

# Heatmap of the thresholded region-by-barcode matrix on a logarithmic colour scale.
fig, ax = plt.subplots(figsize=(60, 10))
sb.heatmap(bc_matrix_thresholded, norm=LogNorm(), ax=ax)
plt.show()

Looking at barcode distribution across visual areas only

In [None]:
# Before selecting a subset of areas, scale every barcode (row) so that its
# strongest projection equals 1; this preserves the relative projection
# strengths within each barcode.
# One vectorised division replaces the row-by-row `pd.concat` onto an empty
# frame, which is quadratic in the number of barcodes.
newbcmatrix = bc_matrix.div(bc_matrix.max(axis=1), axis=0)

In [None]:
# Keep only the visual-area columns: names starting with 'VIS' (but not 'VISC'),
# followed by names starting with 'Contra-VIS' (but not 'Contra-VISC').
# NOTE(review): if the "Contra" columns were already merged into a single
# 'contra' column when the areas table was grouped, the second list is empty.
ipsi_visual = [col for col in newbcmatrix if col.startswith('VIS') and not col.startswith('VISC')]
contra_visual = [col for col in newbcmatrix if col.startswith('Contra-VIS') and not col.startswith('Contra-VISC')]
visareas = ipsi_visual + contra_visual
reg = newbcmatrix[visareas]




In [None]:
# Drop every column of `reg` whose values sum to zero, then every row made up only of zeros.
reg_column_sums = reg.sum(axis=0)
reg = reg.loc[:, reg_column_sums != 0]
reg_all_zero_rows = (reg == 0).all(axis=1)
reg = reg.loc[~reg_all_zero_rows]

In [None]:
# Hierarchical clustering (Euclidean distance) of the barcodes over the visual areas only.
sb.clustermap(
    reg,
    metric='euclidean',
    cmap="Blues",
    figsize=(30, 10),
)

(Can be ignored) Looking at qPCR as a potential QC check

In [None]:
# Plot the qPCR beta-actin signal against sample volume, as a possible QC of sample quality.
qPCR = pd.read_csv('/camp/lab/znamenskiyp/home/shared/projects/turnerb_MAPseq/A1_MAPseq/FIAA32.6a/qPCR/qPCR_FIAA326a.csv')
# Convert Ct values to a relative amount: 1.585 ** (-Ct).
# NOTE(review): 1.585 is presumably the per-cycle amplification factor — confirm.
qPCR['B-act_amount'] = np.power(1.585, -qPCR['B-actin Ct'])

# Look up each sample's total volume through its RT primer. `total['sum']` is
# row-aligned with `areas`; the first match wins when a primer appears more than once.
# This replaces a per-row `.at` assignment that wrote a one-element Series
# (`total.iloc[...]`) into a scalar cell of an integer column.
volume_by_primer = pd.Series(total['sum'].to_numpy(), index=areas['RT_primer'].to_numpy())
volume_by_primer = volume_by_primer[~volume_by_primer.index.duplicated(keep='first')]
qPCR['vol'] = qPCR['RT primer'].map(volume_by_primer)

unmatched = qPCR.loc[qPCR['vol'].isna(), 'RT primer'].tolist()
if unmatched:
    raise KeyError(f"RT primers missing from the LCM registration table: {unmatched}")

# NOTE(review): a sample with zero volume gives log(0) = -inf below; drop such rows first if present.
qPCR['logVol'] = np.log(qPCR['vol'])
qPCR['logBetaAct'] = np.log(qPCR['B-act_amount'])
sb.lmplot(data=qPCR, x='logVol', y='logBetaAct')
corr, _ = spearmanr(qPCR["vol"], qPCR["B-act_amount"])
print('Spearmans correlation: %.3f' % corr)