In [None]:
import numpy as np
np.__version__

In [None]:
import pandas as pd
pd.__version__

In [None]:
import matplotlib
matplotlib.__version__

In [None]:
import matplotlib.pyplot as plt

In [None]:
from matplotlib.colors import LogNorm

In [None]:
import scipy
scipy.__version__

In [None]:
from scipy import stats


In [None]:
# Default figure size for every matplotlib plot drawn below.
# Kept in a cell of its own: the author observed that the setting does not take
# effect when it shares a cell with the matplotlib import.
plt.rcParams.update({"figure.figsize": [18.0, 8.0]})

In [None]:
import seaborn as sns
sns.__version__

In [None]:
from datetime import datetime


In [None]:
import sklearn
sklearn.__version__


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Snapshot the current pandas display settings before changing any of them
# (presumably so they can be restored later -- nothing in view reads them back).
default_max_columns, default_max_rows, default_precision = (
    pd.get_option('display.' + option) for option in ('max_columns', 'max_rows', 'precision')
)

# Render floats with a display precision of 2.
pd.options.display.precision = 2

In [None]:
read_counts = pd.read_csv('barcode_read_counts.csv',sep=",",header=None,index_col=0,names=['barcode','count'])
read_counts.head()

In [None]:
read_counts.shape

In [None]:
metadata = pd.read_csv('mergedAllCells_withCellTypeIdents_CLEAN.csv',sep=",",index_col=0)
metadata.head()

In [None]:
metadata.shape

In [None]:
metadata=metadata[['SOC_indiv_ID','SOC_infection_status','SOC_genetic_ancestry','celltype']]
metadata.head()

In [None]:
metadata.shape

In [None]:
#feature_counts = pd.read_csv('featureCounts.B1.19.transcript.gene_name.csv.gz',sep="\t",
#feature_counts = pd.read_csv('featureCounts.B1.LTR.feature.feature_name.csv.gz',sep="\t",
#feature_counts = pd.read_csv('featureCounts.B1.Other.feature.feature_name.csv.gz',sep="\t",
feature_counts = pd.read_csv('featureCounts.B1.19.exon.gene_name.csv.gz',sep="\t",
                    dtype = {'Chr': str, 'Start': str, 'End': str}, index_col=0,header=1)
feature_counts.head()

In [None]:
feature_counts.dtypes

In [None]:
feature_counts.shape

In [None]:
feature_counts.drop(['Chr','Start','End','Length','Strand'],axis='columns',inplace=True)
feature_counts.head()

In [None]:
feature_counts.sum(axis='columns')

In [None]:
# Remove the literal ".bam" suffix from every column name.
# str.rstrip('.bam') is NOT a suffix strip: it removes any trailing run of the
# characters '.', 'b', 'a' and 'm', so a name such as "samba.bam" would be cut
# down to "s".  An anchored regex removes exactly one trailing ".bam" and
# leaves every other name untouched.
feature_counts.columns = feature_counts.columns.str.replace(r'\.bam$', '', regex=True)
feature_counts

# Normalize

In [None]:
#feature_counts.drop(feature_counts[feature_counts.sum(axis='columns')<=1000].index,inplace=True)
feature_counts=feature_counts[feature_counts.sum(axis='columns')>100]
feature_counts

In [None]:
feature_counts=feature_counts.T.merge(metadata,left_index=True,right_index=True)
feature_counts

In [None]:
feature_counts=feature_counts.reset_index()
feature_counts=feature_counts.rename(columns={"index": "barcode"})
feature_counts=feature_counts.set_index(
    ['barcode','SOC_indiv_ID','SOC_infection_status','SOC_genetic_ancestry','celltype'])
feature_counts

In [None]:
feature_counts.shape

In [None]:
data_columns=feature_counts.columns

In [None]:
def _draw_group_clustermap(matrix, column_colors, group_lut, title, col_cluster):
    """Draw one log-normalised clustermap with a group colour bar and legend.

    matrix        -- DataFrame passed to ``sns.clustermap`` as-is.
    column_colors -- colours drawn above the heatmap, aligned to ``matrix`` columns.
    group_lut     -- mapping of group label -> colour, used to build the legend.
    title         -- text used as the figure suptitle.
    col_cluster   -- whether the columns are clustered as well as the rows.
    """
    g = sns.clustermap(matrix, col_cluster=col_cluster, figsize=(15, 20), cmap="Spectral",
                       col_colors=column_colors, xticklabels=True, norm=LogNorm())
    g.fig.suptitle(title)
    plt.setp(g.ax_heatmap.get_xticklabels(), rotation=75, horizontalalignment='right')

    # Zero-sized bars are drawn only to create one legend entry per group.
    for label, color in group_lut.items():
        g.ax_col_dendrogram.bar(0, 0, color=color, label=label, linewidth=0)
    g.ax_col_dendrogram.legend(loc="center", ncol=4)

    # Move the main colour bar: [left, bottom, width, height]
    g.cax.set_position([0, .15, .005, .6])
    plt.show()


def _plot_group_pca(localdf, selected, group_column, heat_p):
    """Scatter the first two principal components of the selected columns, coloured by group."""
    pcadf = localdf[selected].set_index(['barcode', group_column])
    elements = pcadf.columns
    pcadf = pcadf.reset_index()

    scaled_data = StandardScaler().fit_transform(pcadf[elements])

    # PCA requires n_components <= min(n_samples, n_features); cap at 8.
    n = min(len(pcadf), len(elements), 8)
    if n < 2:
        # The plot below needs a second component to put on the y axis.
        print("Not enough data to produce a PCA plot")
        return

    pca = PCA(n_components=n)
    pca.fit(scaled_data)
    x_pca = pca.transform(scaled_data)

    groups = pcadf[group_column].unique().tolist()

    # Up to seven groups keep the original fixed colours.  With more groups a
    # palette of the right length is generated, so that no group is silently
    # left off the plot (zip() would otherwise stop after the seventh colour).
    base_colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
    if len(groups) <= len(base_colors):
        colors = base_colors[:len(groups)]
    else:
        colors = sns.color_palette("husl", len(groups))

    fig = plt.figure(figsize=(18, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('' + str(n) + ' component PCA ' + group_column + " : select p < " + str(heat_p),
                 fontsize=20)

    handles = []
    for target, color in zip(groups, colors):
        keep = (pcadf[group_column] == target).to_numpy()
        handles.append(ax.scatter(x_pca[keep, 0], x_pca[keep, 1], color=color, s=50))
    # Explicit handles keep every legend label tied to its own scatter artist.
    ax.legend(handles, groups)
    ax.grid()
    plt.show()


def ttests_boxplots_and_heatmaps(localdf,columns,group_column,box_p=0.05,heat_p=0.15):
    """Pairwise t-test columns between groups, then plot the significant ones.

    For every name in ``columns`` that exists in ``localdf``, an independent
    two-sample t-test (``scipy.stats.ttest_ind``) is run between each pair of
    values found in ``localdf[group_column]``.  Results with ``0 < p < heat_p``
    are kept.  Of those, results with ``p < box_p`` get a boxplot grouped by
    ``group_column``.  If at least two kept columns remain, two clustermaps
    (columns unclustered, then clustered) and a PCA scatter plot are drawn.

    Parameters
    ----------
    localdf : DataFrame
        Must contain a 'barcode' column, ``group_column`` and the data columns
        as ordinary columns (not index levels).  It is not modified.
    columns : iterable of column names
        Candidate columns to test; names missing from ``localdf`` are skipped.
    group_column : str
        Column whose distinct values define the groups being compared.
    box_p : float
        p-value threshold for drawing a boxplot.
    heat_p : float
        p-value threshold for inclusion in the clustermaps and PCA.

    Returns
    -------
    None.  Progress is printed and figures are shown with ``plt.show()``.

    NOTE(review): the p-values are used uncorrected for multiple testing --
    confirm that this is intended for large column sets.
    """
    from itertools import combinations

    groups = localdf[group_column].unique().tolist()
    print("T-Testing")
    print(groups)

    # Build each group's row mask once, instead of re-filtering the whole
    # frame for every column and every pair.
    group_masks = [localdf[group_column] == group for group in groups]

    select_p_values = []
    for col in columns:
        if col not in localdf.columns:
            continue
        samples = [localdf.loc[mask, col] for mask in group_masks]
        # |t| and p are symmetric in the two samples, so each unordered pair
        # only has to be tested once.
        for first, second in combinations(samples, 2):
            t, p = stats.ttest_ind(first, second)
            # NaN p-values (e.g. from empty or constant samples) fail both comparisons.
            if 0 < p < heat_p:
                select_p_values.append([abs(t), p, col])

    pdf = pd.DataFrame(select_p_values, columns=['t', 'p', 'col'])
    pdf = pdf.drop_duplicates().sort_values('p')

    print("Boxplotting")
    for _, row in pdf.iterrows():
        if 0 < row['p'] < box_p:
            print(" p : "+str(row['p'])+"  ( t : "+str(row['t'])+" ) :  "+str(row['col']))

            localdf.boxplot(column=row['col'], by=[group_column])
            plt.title(str(row['col']))
            plt.xticks(rotation=75, ha='right')
            plt.show()

    print("Clustermapping")

    # 'barcode' and the group column plus every column that passed heat_p,
    # de-duplicated (np.unique also sorts the names).
    selected = np.unique(np.append(['barcode', group_column], pdf['col']))
    print(selected)

    # Two of the selected names are identifiers, so more than 3 entries means
    # at least two data columns survived the heat_p filter.
    if len(selected) <= 3:
        print("Not enough data to produce cluster maps")
        return

    # Fill missing values in the data columns only (the identifiers sit in the
    # index while fillna runs), order the rows by group, and transpose so that
    # features are rows and barcodes are columns.
    heat = (
        localdf[selected]
        .set_index(['barcode', group_column])
        .fillna(0)
        .reset_index()
        .sort_values([group_column])
        .set_index(['barcode'])
    )
    diagnoses = heat.pop(group_column)
    heat = heat.T

    # One colour per group, mapped onto the heatmap columns for the side bar.
    group_labels = diagnoses.unique()
    network_pal = sns.cubehelix_palette(len(group_labels),
                                        light=.9, dark=.1, reverse=True,
                                        start=1, rot=-2)
    network_lut = dict(zip(group_labels, network_pal))
    network_colors = pd.Series(diagnoses, index=heat.columns).map(network_lut)

    # First map keeps the columns in group order; the second clusters them too.
    _draw_group_clustermap(
        heat, network_colors, network_lut,
        "Clustermap 1 : " + group_column + " : select p < " + str(heat_p),
        col_cluster=False)
    _draw_group_clustermap(
        heat, network_colors, network_lut,
        "Clustermap 2 : " + group_column + " : select p < " + str(heat_p),
        col_cluster=True)

    _plot_group_pca(localdf, selected, group_column, heat_p)
    
                

In [None]:
# Compare every pair of cell types across all feature columns, using stricter
# thresholds than the function defaults.
# Earlier exploratory runs of this cell used column subsets (data_columns[0:100],
# data_columns[0:10], ["ISG15","NOC2L","HES4"]), a row subset
# (feature_counts.reset_index()[0:5000]) and the other grouping columns
# ('SOC_infection_status', 'SOC_genetic_ancestry') with looser thresholds.
ttests_boxplots_and_heatmaps(
    localdf=feature_counts.reset_index(),
    columns=data_columns,
    group_column='celltype',
    box_p=0.0001,
    heat_p=0.005,
)
