In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import scipy.stats as sst

from statannot import add_stat_annotation

import functools

import itertools

In [2]:
def get_statistics(path,col):
    count_data=pd.read_csv(path,index_col=0)
    count_data['Group']=count_data['sample'].str.endswith('surgery')*1
    count_data_dict=count_data.groupby('sample').sum()['count'].to_dict()
    count_data['percentage']=count_data.apply(lambda x:x['count']/count_data_dict[x[0]],axis=1)
    count_data['group class']=count_data.apply(lambda x:'Biopsy' if x['Group']==0 else 'Surgery',axis=1)
    cell_type_order=list(count_data[col].value_counts().index)
    treatment=['Biopsy','Surgery']
    box_pairs=get_pairs(cell_type_order,treatment)
    return count_data,box_pairs,cell_type_order
def get_pairs(order,treatment):
    box_pairs=[]
    for group in order:
        box_pairs.append([(group,x) for x in treatment])
    return box_pairs

In [3]:
def get_ratio_df(T_cell_subtypes,colnames):
    T_cell_subtypes_pre=[]
    T_cell_subtypes_post=[]
    for index,value in T_cell_subtypes.groupby(colnames):
        key=','.join(index)
        if 'biopsy' in key:
            T_cell_subtypes_pre.append(key)
        elif 'surgery' in key:
            T_cell_subtypes_post.append(key)

    patient_T_Cell_type={}
    for index,value in T_cell_subtypes.groupby(colnames):
        key=','.join(index)
        patient_T_Cell_type[key]=value['percentage'].sum()

    T_cell_type_ratio={}
    tumor_change_IM={}
    for key1,key2 in zip(T_cell_subtypes_pre,T_cell_subtypes_post):
        pateint_val=key2.split(',')[0]
        T_cell_type_ratio[key2]=patient_T_Cell_type[key2]/patient_T_Cell_type[key1]
        tumor_change_IM[key2]=tumor_change[pateint_val]

    T_cell_type_ratio_df=pd.DataFrame.from_dict(T_cell_type_ratio,orient='index')
    T_cell_type_ratio_df.columns=['ratio']
    tumor_changeIM_df=pd.DataFrame.from_dict(tumor_change_IM,orient='index')
    tumor_changeIM_df.columns=['tumor change']


    T_cell_type_ratio_df['tumor change'] = tumor_changeIM_df[['tumor change']]

    T_cell_type_ratio_df['cell_types']=T_cell_type_ratio_df.index.str.split(',')

    T_cell_type_ratio_df['cell_type']=T_cell_type_ratio_df.apply(lambda x:x['cell_types'][1],axis=1)

    T_cell_type_ratio_df['log ratio'] =np.log10(T_cell_type_ratio_df['ratio'])
    return T_cell_type_ratio_df

In [4]:
def get_baseline_df(T_cell_subtypes,colnames):
    T_cell_subtypes_pre=[]
    T_cell_subtypes_post=[]
    for index,value in T_cell_subtypes.groupby(colnames):
        key=','.join(index)
        if 'biopsy' in key:
            T_cell_subtypes_pre.append(key)
        elif 'surgery' in key:
            T_cell_subtypes_post.append(key)

    patient_T_Cell_type={}
    for index,value in T_cell_subtypes.groupby(colnames):
        key=','.join(index)
        patient_T_Cell_type[key]=value['percentage'].sum()

    T_cell_type_ratio={}
    tumor_change_IM={}
    for key1,key2 in zip(T_cell_subtypes_pre,T_cell_subtypes_post):
        pateint_val=key2.split(',')[0]
        T_cell_type_ratio[key2]=patient_T_Cell_type[key1]
        tumor_change_IM[key2]=tumor_change[pateint_val]

    T_cell_type_ratio_df=pd.DataFrame.from_dict(T_cell_type_ratio,orient='index')
    T_cell_type_ratio_df.columns=['ratio']
    tumor_changeIM_df=pd.DataFrame.from_dict(tumor_change_IM,orient='index')
    tumor_changeIM_df.columns=['tumor change']


    T_cell_type_ratio_df['tumor change'] = tumor_changeIM_df[['tumor change']]

    T_cell_type_ratio_df['cell_types']=T_cell_type_ratio_df.index.str.split(',')

    T_cell_type_ratio_df['cell_type']=T_cell_type_ratio_df.apply(lambda x:x['cell_types'][1],axis=1)

    #T_cell_type_ratio_df['log ratio'] =np.log10(T_cell_type_ratio_df['ratio'])
    return T_cell_type_ratio_df

In [21]:
all_subtypes,box_pairs_all,cell_type_orderall=get_statistics("statistics/celltype_counts.csv",'cell_type')

In [22]:
tumor_change={'guozhixing_surgery':-45.57,'zhuanglili_surgery':-31.26,'sunzhengyun_surgery':-42.0,'nianshizhu_surgery':-20.52,'lilanying_surgery':-14.84,'wangzhu_surgery':-65.63,'zhouyunmei_surgery':-20.45,'wandeyuan_surgery':-31.7}

In [23]:
all_tumor_change=get_ratio_df(all_subtypes,['sample','cell_type'])

  T_cell_type_ratio[key2]=patient_T_Cell_type[key2]/patient_T_Cell_type[key1]
  T_cell_type_ratio[key2]=patient_T_Cell_type[key2]/patient_T_Cell_type[key1]
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [24]:
all_tumor_baseline=get_baseline_df(all_subtypes,['sample','cell_type'])

In [25]:
def Ti_pi(slope,r_value):
    ti=-slope/abs(slope)*r_value*r_value
    return ti

In [26]:
comprehensive_cell_type={}
for index,value in all_tumor_change.groupby('cell_type'):
    value1=value.loc[~value['log ratio'].isin([np.inf,-np.inf,np.nan])].copy()
    number=len(value1)
    try:
        slope, intercept, r_value, p_value, std_err=sst.linregress(value1['log ratio'],value1['tumor change'])
    except ValueError:
        print(index)
        continue
    ti_score=Ti_pi(slope,r_value)
    comprehensive_cell_type[index]=[ti_score,slope,r_value,p_value,number]
    #print(index,slope,intercept,r_value,p_value,sep="\t")

In [27]:
def get_pi(all_tumor_change,col):
    comprehensive_cell_type={}
    for index,value in all_tumor_change.groupby(col):
        value1=value.loc[~value['ratio'].isin([np.inf,-np.inf,np.nan])].copy()
        number=len(value1)
        try:
            slope, intercept, r_value, p_value, std_err=sst.linregress(value1['ratio'],value1['tumor change'])
        except ValueError:
            print(index)
            continue
        ti_score=Ti_pi(slope,r_value)
        comprehensive_cell_type[index]=[ti_score,slope,r_value,p_value,number]
    comprehensive_cell_type_ti=pd.DataFrame.from_dict(comprehensive_cell_type,orient='index')
    comprehensive_cell_type_ti.columns=['Ti','slop','r value','p value','number']
    comprehensive_cell_type_ti['log pvalue']=-np.log10(comprehensive_cell_type_ti['p value'])
    return comprehensive_cell_type_ti

In [28]:
all_type_base_line_ti=get_pi(all_tumor_baseline,'cell_type')

In [29]:
all_type_base_line_ti.to_csv("analysis_result/baseline_results/all_type_pi_update.txt",sep="\t")

In [33]:
comprehensive_cell_type_ti=pd.DataFrame.from_dict(comprehensive_cell_type,orient='index')
comprehensive_cell_type_ti.columns=['Ti','slop','r value','p value','number']

In [34]:
comprehensive_cell_type_ti['log pvalue']=-np.log10(comprehensive_cell_type_ti['p value'])

In [35]:
comprehensive_cell_type_ti.to_csv("analysis_result/comprehensive_cell_types_clinical/comprehensive_type_ti_update.txt",sep="\t")

In [36]:
comprehensive_cell_type_ti

Unnamed: 0,Ti,slop,r value,p value,number,log pvalue
B cell,0.057213,-8.368844,-0.239192,0.568327,8,0.245401
Endothelial,0.086281,-7.229001,-0.293736,0.480104,8,0.318664
Epithelial,0.106461,-6.316938,-0.326283,0.430253,8,0.366276
Fibroblast,0.060174,-4.36857,-0.245305,0.558172,8,0.253232
Malignant cell,-0.172681,147.944645,0.415549,0.727179,3,0.138359
Mast cell,-0.128577,13.541463,0.358577,0.429629,7,0.366906
Myeloid,0.011621,-5.540251,-0.1078,0.799436,8,0.097216
NK cell,0.05609,-10.910588,-0.236833,0.572263,8,0.242404
Neutrophil,-0.038316,3.43628,0.195744,0.642248,8,0.192297
Plasma,-0.042033,5.398257,0.205019,0.626226,8,0.203269


In [None]:
##T_cell types


In [178]:
T_cell_types,box_pairs,cell_type_order=get_statistics("statistics/T_cell_subtype.csv",'T_cell_subtype')

In [180]:
T_type_tumor_change=get_ratio_df(T_cell_types,['sample','T_cell_subtype'])
T_type_base=get_baseline_df(T_cell_types,['sample','T_cell_subtype'])

In [12]:
def get_ti_pi(all_tumor_change,col):
    comprehensive_cell_type={}
    for index,value in all_tumor_change.groupby(col):
        value1=value.loc[~value['log ratio'].isin([np.inf,-np.inf,np.nan])].copy()
        number=len(value1)
        try:
            slope, intercept, r_value, p_value, std_err=sst.linregress(value1['log ratio'],value1['tumor change'])
        except ValueError:
            print(index)
            continue
        ti_score=Ti_pi(slope,r_value)
        comprehensive_cell_type[index]=[ti_score,slope,r_value,p_value,number]
    comprehensive_cell_type_ti=pd.DataFrame.from_dict(comprehensive_cell_type,orient='index')
    comprehensive_cell_type_ti.columns=['Ti','slop','r value','p value','number']
    comprehensive_cell_type_ti['log pvalue']=-np.log10(comprehensive_cell_type_ti['p value'])
    return comprehensive_cell_type_ti

In [181]:
#T_cell_type_baseline=get_baseline_df(T_cell_types,['sample','T_cell_subtype'])
T_cell_type_baseline_line_pi=get_pi(T_cell_type_baseline,'cell_type')

In [188]:
T_cell_type_ti=get_ti_pi(T_type_tumor_change,'cell_type')

In [190]:
T_cell_type_ti.to_csv("analysis_result/T_cell_type_ti.txt",sep="\t")

In [183]:
T_cell_type_baseline_line_pi.to_csv("analysis_result/baseline_results/T_cell_type_baseline_pi.txt",sep="\t")

In [6]:
T_cell_subtypes,box_pairs,cell_type_order_subtypes=get_statistics("statistics/T_further_subtypes_counts.csv",'T_cell_further_subtype')

In [9]:
T_subtype_baseline=get_baseline_df(T_cell_subtypes,['sample','T_cell_further_subtype'])

In [10]:
T_subtype_tumor_ratio=get_ratio_df(T_cell_subtypes,['sample','T_cell_further_subtype'])

  T_cell_type_ratio[key2]=patient_T_Cell_type[key2]/patient_T_Cell_type[key1]


In [16]:
T_subtype_pi=get_pi(T_subtype_baseline,'cell_type')

In [17]:
T_subtype_pi

Unnamed: 0,Ti,slop,r value,p value,number,log pvalue
CD4 SOCS3,-0.147614,134.946607,0.384206,0.347367,8,0.459211
CD4 Treg_1,0.151142,-84.458359,-0.38877,0.341175,8,0.467023
CD4 Treg_2,0.508055,-1311.317722,-0.71278,0.047209,8,1.325979
CD4 naive,-0.001793,8.773146,0.042339,0.92071,8,0.035877
CD8 GZMB,0.646105,-84.502446,-0.803807,0.016211,8,1.790201
CD8 GZMK_1,0.009062,-11.495496,-0.095195,0.822585,8,0.084819
CD8 GZMK_2,-0.406283,132.9298,0.637404,0.089121,8,1.050019
CD8 KLRB1,-0.190221,155.151694,0.436143,0.280018,8,0.552813
CD8 Trm,-0.106345,316.723484,0.326106,0.430518,8,0.366009
CD8 proliferating,0.880695,-675.477357,-0.938454,0.000556,8,3.254718


In [195]:
T_subtype_pi

Unnamed: 0,Ti,slop,r value,p value,number,log pvalue
CD4 SOCS3,-0.147614,134.946607,0.384206,0.347367,8,0.459211
CD4 Treg_1,0.151142,-84.458359,-0.38877,0.341175,8,0.467023
CD4 Treg_2,0.508055,-1311.317722,-0.71278,0.047209,8,1.325979
CD4 naive,-0.001793,8.773146,0.042339,0.92071,8,0.035877
CD8 GZMB,0.646105,-84.502446,-0.803807,0.016211,8,1.790201
CD8 GZMK_12,-0.123196,54.317952,0.350993,0.393942,8,0.404568
CD8 KLRB1,-0.190221,155.151694,0.436143,0.280018,8,0.552813
CD8 Trm,-0.106345,316.723484,0.326106,0.430518,8,0.366009
CD8 proliferating,0.880695,-675.477357,-0.938454,0.000556,8,3.254718
NK_1,-0.662576,140.521407,0.813988,0.013929,8,1.856077


In [196]:
T_subtype_pi.to_csv("analysis_result/T_cell_class/T_sub_type_pi.txt",sep="\t")

In [18]:
T_subtype_pi.to_csv("analysis_result/T_cell_class/T_sub_type_pi_12.txt",sep="\t")

In [19]:
T_cell_subtype_ti=get_ti_pi(T_subtype_tumor_ratio,'cell_type')

In [21]:
T_cell_subtype_ti.to_csv("analysis_result/T_cell_class/T_sub_type_ti_12.txt",sep="\t")

In [197]:
#T_subtype_tumor_change=get_ratio_df(T_cell_subtypes,['sample','T_cell_further_subtype'])

In [198]:
T_cell_subtype_ti=get_ti_pi(T_subtype_tumor_ratio,'cell_type')

In [199]:
T_cell_subtype_ti

Unnamed: 0,Ti,slop,r value,p value,number,log pvalue
CD4 SOCS3,-0.044357,5.446307,0.210611,0.616627,8,0.209977
CD4 Treg_1,-0.557242,30.012838,0.746487,0.033381,8,1.476506
CD4 Treg_2,-0.195149,15.965329,0.441757,0.273157,8,0.563587
CD4 naive,-0.268446,52.40029,0.518118,0.188386,8,0.724952
CD8 GZMB,-0.224253,15.629703,0.473554,0.235901,8,0.62727
CD8 GZMK_12,0.855544,-81.327724,-0.924956,0.000998,8,3.000889
CD8 KLRB1,0.136929,-13.341045,-0.370039,0.366911,8,0.435439
CD8 Trm,-0.281619,21.225059,0.530677,0.278708,6,0.55485
CD8 proliferating,-0.550123,16.306954,0.741702,0.035168,8,1.453858
NK_1,0.00312,-2.549611,-0.055853,0.895494,8,0.047937


In [200]:
T_cell_subtype_ti.to_csv("analysis_result/T_cell_sub_class/T_cell_subtype_ti.txt",sep="\t")


In [104]:
#T_cell_subtypes12,box_pairs,cell_type_order_subtypes=get_statistics("statistics/T_further_subtypes_counts_GZMK12.csv",'T_cell_further_subtype')

In [105]:
#T_subtype_tumor_changeGZMK12=get_ratio_df(T_cell_subtypes12,['sample','T_cell_further_subtype'])

  T_cell_type_ratio[key2]=patient_T_Cell_type[key2]/patient_T_Cell_type[key1]


In [106]:
#T_cell_subtype12_ti=get_ti_pi(T_subtype_tumor_changeGZMK12,'cell_type')

In [108]:
#T_cell_subtype12_ti.to_csv("analysis_result/T_cell_sub_class_Treg_GZMK12/T_cellsubtype12ti.txt",sep="\t")

In [109]:
myeloid_subtypes_counts,box_pairs,cell_type_order_subtypes=get_statistics("statistics/myeloid_subtypes_counts.csv",'myeloid_type')

In [203]:
myeloid_tumor_change=get_ratio_df(myeloid_subtypes_counts,['sample','myeloid_type'])

  T_cell_type_ratio[key2]=patient_T_Cell_type[key2]/patient_T_Cell_type[key1]
  T_cell_type_ratio[key2]=patient_T_Cell_type[key2]/patient_T_Cell_type[key1]
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [204]:
myeloid_baseline=get_baseline_df(myeloid_subtypes_counts,['sample','myeloid_type'])

In [205]:
myeloid_ti=get_ti_pi(myeloid_tumor_change,'cell_type')

Macro MRC1


In [207]:
myeloid_pi=get_pi(myeloid_baseline,'cell_type')

In [208]:
myeloid_pi

Unnamed: 0,Ti,slop,r value,p value,number,log pvalue
DC CD1C,-0.442155,108.770208,0.664948,0.071987,8,1.142749
DC IDO1,0.078703,-214.036495,-0.28054,0.500935,8,0.300219
DC TYMS,-0.026732,95.325142,0.1635,0.698857,8,0.155612
Macro CD3,0.132131,-61.047789,-0.363498,0.376098,8,0.424699
Macro CD5L,-0.511981,266.930462,0.715528,0.045971,8,1.337513
Macro FBP1,0.000146,-9.614122,-0.012096,0.977323,8,0.009962
Macro GPNMB,0.004073,-7.988316,-0.063823,0.880657,8,0.055193
Macro IGLC2,0.611129,-256.225583,-0.781748,0.021922,8,1.659121
Macro MRC1,-0.004403,4315.457143,0.066352,0.875954,8,0.057519
Macro SPINK1,-0.195222,389.335377,0.441839,0.273058,8,0.563745


In [209]:
myeloid_pi.to_csv("analysis_result/myeloid_sub_class/myeloid_pi.txt",sep="\t")

In [210]:
myeloid_ti.to_csv("analysis_result/myeloid_sub_class/myeloid_ti.txt",sep="\t")

In [213]:
myeloid_ti

Unnamed: 0,Ti,slop,r value,p value,number,log pvalue
DC CD1C,0.158048,-12.197669,-0.397553,0.329405,8,0.48227
DC IDO1,-0.16412,21.894303,0.405117,0.319423,8,0.495634
DC TYMS,-0.01909,5.499051,0.138168,0.767672,7,0.114824
Macro CD3,-0.136988,15.170067,0.370118,0.3668,8,0.43557
Macro CD5L,0.558781,-24.100469,-0.747517,0.252483,4,0.597768
Macro FBP1,0.314984,-9.935398,-0.561235,0.246538,6,0.608116
Macro GPNMB,0.006413,-2.668763,-0.080081,0.85049,8,0.070331
Macro IGLC2,0.87299,-21.108786,-0.93434,0.06566,4,1.182696
Macro SPINK1,0.99971,-7.509431,-0.999855,0.010839,3,1.964995
Mono CCL20,-0.001642,0.477479,0.040523,0.974196,3,0.011354


In [119]:
T_subtype_tumor_myeloid_ti.to_csv("statistics/myloid_ti.txt",sep="\t")

In [120]:
T_subtype_tumor_myeloid_ti

Unnamed: 0,Ti,slop,r value,p value,number,log pvalue
DC CD1C,0.158048,-12.197669,-0.397553,0.329405,8,0.48227
DC IDO1,-0.16412,21.894303,0.405117,0.319423,8,0.495634
DC TYMS,-0.01909,5.499051,0.138168,0.767672,7,0.114824
Macro CD3,-0.136988,15.170067,0.370118,0.3668,8,0.43557
Macro CD5L,0.558781,-24.100469,-0.747517,0.252483,4,0.597768
Macro FBP1,0.314984,-9.935398,-0.561235,0.246538,6,0.608116
Macro GPNMB,0.006413,-2.668763,-0.080081,0.85049,8,0.070331
Macro IGLC2,0.87299,-21.108786,-0.93434,0.06566,4,1.182696
Macro SPINK1,0.99971,-7.509431,-0.999855,0.010839,3,1.964995
Mono CCL20,-0.001642,0.477479,0.040523,0.974196,3,0.011354


In [47]:
all_tumor_change_nona=all_tumor_change.loc[~all_tumor_change['ratio'].isin([np.inf,-np.inf,np.nan,0])].copy()