In [1]:
import pandas as pd
from pandasai import PandasAI

## ## Validate CNV using the 76-genome data

In [16]:
## load the gene copy counts for the 76 genomes
# The file is tab-separated and has no header row, so the four column
# labels are attached right after parsing; 'V3' is then stripped of
# leading/trailing whitespace so it can be used as a clean key.
df_pan = (
    pd.read_csv("merged_gemoma_output_phenology.count", sep='\t', header=None)
    .set_axis(['V3', 'count', 'var', 'type'], axis=1)
    .assign(V3=lambda frame: frame['V3'].str.strip())
)
# Disabled manual remap of one transcript ID (kept for reference):
#df_pan['V3']=df_pan['V3'].str.replace("HORVU.MOREX.r3.2HG0130900.3","HORVU.MOREX.r3.2HG0130900.1")
df_pan.head()

Unnamed: 0,V3,count,var,type
0,HORVU.MOREX.r3.1HG0024860.1,1,10TJ18,D
1,HORVU.MOREX.r3.1HG0031260.1,1,10TJ18,D
2,HORVU.MOREX.r3.1HG0031480.1,1,10TJ18,D
3,HORVU.MOREX.r3.1HG0036390.1,1,10TJ18,D
4,HORVU.MOREX.r3.2HG0119220.1,1,10TJ18,D


In [17]:
## calculate mean, min, max and std of the copy count for each ID in 'V3'
# Selecting the 'count' column before aggregating yields flat column names
# (mean, min, max, std) directly, so no MultiIndex flattening is needed.
# pandas' default std is the sample std, so an ID with a single row gets NaN.
df_pan_group = (
    df_pan
    .groupby('V3')['count']
    .agg(['mean', 'min', 'max', 'std'])
    .reset_index()
)
# Remove the trailing transcript suffix (".<digits>") so that 'V3' holds the
# bare gene ID, which is what the later merge on 'V3' matches against.
# An anchored regex is used instead of slicing off the last two characters:
# slicing would mangle IDs whose suffix has more than one digit (e.g. ".10")
# or IDs that carry no suffix at all.
# NOTE(review): grouping is done on the full ID *before* the suffix is removed,
# so two transcripts of one gene would produce two rows with the same 'V3'
# here — confirm the input only ever contains one transcript per gene.
df_pan_group['V3'] = df_pan_group['V3'].str.replace(r'\.\d+$', '', regex=True)
df_pan_group.head()

Unnamed: 0,V3,mean,min,max,std
0,HORVU.MOREX.r3.1HG0024860,1.0,1,1,0.0
1,HORVU.MOREX.r3.1HG0031260,1.0,1,1,0.0
2,HORVU.MOREX.r3.1HG0031480,1.0,1,1,0.0
3,HORVU.MOREX.r3.1HG0036390,1.0,0,2,0.163299
4,HORVU.MOREX.r3.1HG0054220,1.0,1,1,0.0


In [18]:
## label each gene as having copy-number variation or not
# A strictly positive std means the counts in that group are not all equal.
# A NaN std compares as False and is therefore labelled 'No'.
df_pan_group['CNV'] = df_pan_group['std'].gt(0).map({True: 'Yes', False: 'No'})
df_pan_group.head()

Unnamed: 0,V3,mean,min,max,std,CNV
0,HORVU.MOREX.r3.1HG0024860,1.0,1,1,0.0,No
1,HORVU.MOREX.r3.1HG0031260,1.0,1,1,0.0,No
2,HORVU.MOREX.r3.1HG0031480,1.0,1,1,0.0,No
3,HORVU.MOREX.r3.1HG0036390,1.0,0,2,0.163299,Yes
4,HORVU.MOREX.r3.1HG0054220,1.0,1,1,0.0,No


In [19]:
## load the normalized mosdepth coverage matrix
# The CSV is read with its first column as the index and its first row as
# the header, then transposed so the former column labels become rows.
# Those labels are moved into a regular column named 'V3' so the table can
# be joined on that key.
df_mos = (
    pd.read_csv("processed_coverage_matrix.csv", header=0, index_col=0)
    .T
    .reset_index()
    .rename(columns={'index': 'V3'})
)
df_mos.head()

Unnamed: 0,V3,ERS2903440,ERS2903441,ERS2903442,ERS2903444,ERS2903445,ERS2903446,ERS2903447,ERS2903450,ERS2903453,...,ERS2904375,ERS2904376,ERS2904377,ERS2904378,ERS2904381,ERS2904382,ERS2904383,ERS2904384,ERS2904385,ERS2904386
0,HORVU.MOREX.r3.1HG0024860,0.935491,0.952896,0.922124,0.972762,0.812351,0.89722,0.887102,0.855745,0.836628,...,0.842934,0.839529,0.858549,0.852662,0.962207,0.872503,0.954897,0.990851,0.924117,0.933392
1,HORVU.MOREX.r3.1HG0031260,0.901748,0.904623,0.926726,0.95413,0.799304,0.969633,0.940704,0.888658,0.922885,...,0.90638,0.848908,0.850631,0.981797,0.986844,0.948527,0.939259,1.041309,0.996056,0.980625
2,HORVU.MOREX.r3.1HG0031480,0.001374,0.00117,0.003186,0.001597,0.001957,0.000934,0.001508,0.003855,0.001406,...,0.00644,0.005948,0.000856,0.002061,0.000394,0.003048,0.00214,0.0,0.0,0.001574
3,HORVU.MOREX.r3.1HG0036390,0.107184,0.123464,0.115221,0.120309,0.131333,0.128475,0.112563,0.133729,0.11579,...,0.118068,0.131991,0.10978,0.113337,0.120522,0.113613,0.114239,0.121615,0.102179,0.116073
4,HORVU.MOREX.r3.1HG0054220,0.011146,0.014043,0.010088,0.011357,0.010872,0.014483,0.007538,0.014826,0.011876,...,0.014311,0.015098,0.016692,0.011677,0.009953,0.006603,0.010041,0.012199,0.010518,0.011196


In [20]:
## attach the per-gene copy-count summary (incl. CNV flag) to the mosdepth table
# Inner join on 'V3': only IDs present in both tables are kept.
df_merge = pd.merge(df_pan_group, df_mos, how='inner', on='V3')
df_merge.head()

Unnamed: 0,V3,mean,min,max,std,CNV,ERS2903440,ERS2903441,ERS2903442,ERS2903444,...,ERS2904375,ERS2904376,ERS2904377,ERS2904378,ERS2904381,ERS2904382,ERS2904383,ERS2904384,ERS2904385,ERS2904386
0,HORVU.MOREX.r3.1HG0024860,1.0,1,1,0.0,No,0.935491,0.952896,0.922124,0.972762,...,0.842934,0.839529,0.858549,0.852662,0.962207,0.872503,0.954897,0.990851,0.924117,0.933392
1,HORVU.MOREX.r3.1HG0031260,1.0,1,1,0.0,No,0.901748,0.904623,0.926726,0.95413,...,0.90638,0.848908,0.850631,0.981797,0.986844,0.948527,0.939259,1.041309,0.996056,0.980625
2,HORVU.MOREX.r3.1HG0031480,1.0,1,1,0.0,No,0.001374,0.00117,0.003186,0.001597,...,0.00644,0.005948,0.000856,0.002061,0.000394,0.003048,0.00214,0.0,0.0,0.001574
3,HORVU.MOREX.r3.1HG0036390,1.0,0,2,0.163299,Yes,0.107184,0.123464,0.115221,0.120309,...,0.118068,0.131991,0.10978,0.113337,0.120522,0.113613,0.114239,0.121615,0.102179,0.116073
4,HORVU.MOREX.r3.1HG0054220,1.0,1,1,0.0,No,0.011146,0.014043,0.010088,0.011357,...,0.014311,0.015098,0.016692,0.011677,0.009953,0.006603,0.010041,0.012199,0.010518,0.011196


In [21]:
# Show (rows, columns) of the merged table as a quick check of how many IDs survived the inner join.
df_merge.shape

(160, 508)