In [1]:
'''
load original df
'''

import pandas as pd
import os
import numpy as np
from functions import *
from scipy.stats import mannwhitneyu
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.cbook import boxplot_stats
import re
import warnings
# NOTE(review): ignoring every warning also hides pandas deprecation and
# chained-assignment warnings that later cells may trigger.
warnings.filterwarnings('ignore') #suppresses warnings for now
#warnings.filterwarnings(action='once') #shows warnings once
from statannotations.Annotator import Annotator

# Input tables are read from a "Data" folder under the current working directory.
dataFilePath = os.path.join(os.getcwd(),'Data')

# The three tab-separated tables share the same read options, so load them in one pass
# (same order as before: KiCS TE calls, LFS TE calls, LFS clinical sheet).
kicsTEdf, lfsTEdf, lfsClinicdf = (
    pd.read_csv(os.path.join(dataFilePath, tableName), sep='\t', header=0)
    for tableName in ('kics_transposable_elements.txt',
                      'lfs_transposable_elements.txt',
                      'lfs_clinical_main.tsv')
)
# The KiCS clinical sheet is comma-separated and is decoded as latin-1.
kicsClinicdf = pd.read_csv(os.path.join(dataFilePath,'kics_clinical_edited.csv'), encoding='latin-1')

In [2]:
#setting the font of the graphs
# font_scale=1.5 scales seaborn's font elements up relative to its defaults.
sns.set(font_scale=1.5)

In [3]:
"""
KiCS clinical table clean-up
"""
# KiCS IDs that are removed from the clinical table
listValSearch = ['63', '83', '156', '171', '219', '120', '141', '232']

# compare IDs as strings with all spaces removed
kicsClinicdf['KiCS ID'] = kicsClinicdf['KiCS ID'].astype(str).str.replace(" ", "")

excludedRows = kicsClinicdf.index[kicsClinicdf['KiCS ID'].isin(listValSearch)]
kicsClinicdf.drop(excludedRows, inplace=True)

# the next two filters are what take us from 140 -> 90-ish samples:
# drop ROWS whose tissue type is '?' and rows with a missing cancer diagnosis
unknownTissueRows = kicsClinicdf.index[kicsClinicdf['tissue_type'] == '?']
kicsClinicdf.drop(unknownTissueRows, inplace=True)
missingDiagnosisRows = kicsClinicdf.index[kicsClinicdf['lfs_cancer_type_diagnosis'].isna()]
kicsClinicdf.drop(missingDiagnosisRows, inplace=True)


In [4]:
"""
LFS clinical table clean-up
"""
# compare sample ids as strings with all spaces removed
lfsClinicdf['sample'] = lfsClinicdf['sample'].astype(str).str.replace(" ", "")

# NOTE(review): listValSearch is the ID list defined in the KiCS clinic cleaning cell;
# confirm those values are really meant to be matched against LFS sample ids.
excludedRows = lfsClinicdf.index[lfsClinicdf['sample'].isin(listValSearch)]
lfsClinicdf.drop(excludedRows, inplace=True)

# drop rows with an unknown ('?') tissue type, then rows without a cancer diagnosis
unknownTissueRows = lfsClinicdf.index[lfsClinicdf['tissue_type'] == '?']
lfsClinicdf.drop(unknownTissueRows, inplace=True)
missingDiagnosisRows = lfsClinicdf.index[lfsClinicdf['cancer_diagnosis'].isna()]
lfsClinicdf.drop(missingDiagnosisRows, inplace=True)

In [5]:
"""
Reduce the 'ALT' column to the bare TE class name
"""

# Remove the "<INS:ME:" and ">" fragments from ALT in both TE tables.
for teTable in (kicsTEdf, lfsTEdf):
    altValues = teTable['ALT']
    for fragment in ("<INS:ME:", ">"):
        altValues = altValues.str.replace(fragment, "")
    teTable['ALT'] = altValues

print(lfsTEdf['ALT'].value_counts())

ALU      90163
LINE1     4342
SVA       2712
Name: ALT, dtype: int64


In [6]:
"""
Filtering and cleaning Samples_ID
"""
# Keep only the rows annotated in 'full' mode. .copy() makes each result an independent
# frame, so the column assignments below do not write into a slice of the raw table.
kicsTEF = kicsTEdf.loc[kicsTEdf['Annotation_mode'] == 'full'].copy()
lfsTEF = lfsTEdf.loc[lfsTEdf['Annotation_mode'] == 'full'].copy()

# Drop rows without a sample id BEFORE casting to str: astype(str) turns NaN into the
# literal string 'nan', after which an isna() filter can no longer find those rows.
kicsTEF.dropna(subset=['Samples_ID'], inplace=True)
lfsTEF.dropna(subset=['Samples_ID'], inplace=True)

kicsTEF['SV_chrom'] = kicsTEF['SV_chrom'].astype(str)
lfsTEF['SV_chrom'] = lfsTEF['SV_chrom'].astype(str)

# Strip the pipeline suffix from the sample ids. regex=False keeps the leading '.'
# literal (as a regex it would match any character). Both cohorts are cleaned from
# their own filtered frame.
kicsTEF['Samples_ID'] = kicsTEF['Samples_ID'].astype(str).str.replace('.realigned-recalibrated', '', regex=False)
lfsTEF['Samples_ID'] = lfsTEF['Samples_ID'].astype(str).str.replace('.realigned-recalibrated', '', regex=False)

# Row counts per cohort.
# NOTE(review): these are taken before the KiCS exclusions below — confirm that is intended.
kicsTEnum = kicsTEF['Samples_ID'].count()
lfsTEnum = lfsTEF['Samples_ID'].count()

# TE sample ids to exclude. Kept under its own name so the KiCS-ID list used by the
# clinical cleaning cells is not overwritten when this cell runs.
excludedTESampleIds = ['291775', '298313', '315666', '320118', '18_1907']
# searched for kID of 63, 83, 156, 171, 219 > lynch ids
# kID 120 141 and 232 do not have any matches...
for sampleTag in excludedTESampleIds:
    matchingRows = kicsTEF.index[kicsTEF['Samples_ID'].str.contains(sampleTag, regex=False)]
    kicsTEF.drop(matchingRows, inplace=True)

# NOTE(review): the KiCS and LFS columns are aligned only on the row index of their
# source tables, so a row does not pair related records — confirm this frame is needed.
mergedTEdf = pd.DataFrame(data = {'kics':kicsTEF['SV_type'], 'kChrom':kicsTEF['SV_chrom'],
                                  'kAlt':kicsTEF['ALT'], 'lAlt':lfsTEF['ALT'],
                                  'kID':kicsTEF['Samples_ID'], 'lID': lfsTEF['Samples_ID'],
                                  'lfs':lfsTEF['SV_type'], 'lChrom':lfsTEF['SV_chrom']})

In [7]:
"""
Make big k df
"""

# kIdAbbv comes from the functions module; it is given the clinical table, the TE table
# and the two id column names (presumably it joins them on an abbreviated id — confirm).
UberbigKdf = kIdAbbv(kicsClinicdf, kicsTEF, 'CCP germline', 'Samples_ID')

# 'abbv_id' is listed once only: selecting it twice produced two columns with the same label.
bigKdf = UberbigKdf[['CCP germline', 'Samples_ID', 'tumour_class','tissue_type',
                    'lfs_cancer_type_diagnosis', 'abbv_id','diagnosis_age (days)', 'SV_type','ALT']]

79


In [8]:
"""
Build the combined LFS frame and align the KiCS column names with it
"""
# inner join: keep only TE rows whose Samples_ID has a matching clinical 'sample'
UberbigLdf = lfsTEF[['Samples_ID','SV_type','ALT']].merge(
    lfsClinicdf,
    how='inner',
    left_on='Samples_ID',
    right_on='sample',
)

bigLdf = UberbigLdf[['Samples_ID','SV_type','ALT','sample','tissue_type','cancer_diagnosis','ageofonset']]

# give the KiCS frame the same column names as the LFS frame
kicsToLfsNames = {
    'CCP germline': 'sample',
    'lfs_cancer_type_diagnosis': 'cancer_diagnosis',
    'diagnosis_age (days)': 'ageofonset',
}
bigKdf = bigKdf.rename(columns=kicsToLfsNames)

# RMS and Soft Tissue

In [9]:
#RMS and soft-tissue breakdown: which detailed diagnoses sit behind each label
kDf = UberbigKdf[[
    'Samples_ID', 'ALT', 'abbv_id', 'KiCS ID', 'ICDO-diagnosis',
    'tissue_type', 'lfs_cancer_type_diagnosis', 'Previous cancers', 'Germline Sample from',
    'Germline sample type', 'CCP germline.1', 'WGS DNA germline',
    'WGS germline', 'pathology_id', 'Sample type', 'CCP tumor', 'CCP PPID',
    'CCP', 'WGS tumor ID', 'WGS Tumor', 'RNA ID', 'RNA',
    'single_somatic_specimen', 'disease_state', 'sample_type',
    'post_treatment', 'sample site', 'tumour_class', 'Sex',
    'diagnosis_age (days)', 'no_chemo_or_xrt',
]]
#KiCS: ICDO-diagnosis is used as the detailed diagnosis

lDf = UberbigLdf[[
    'Samples_ID', 'ALT', 'sample',
    'tm_donor', 'tissue_type', 'cancer_diagnosis', 'active_cancer',
    'gender', 'systemic_treatment_atdraw', 'ageofonset', 'agesamplecollection',
    'cancer_num', 'cancer1_age_diff', 'cancer1', 'cancer1_ageofonset',
    'cancer2', 'cancer2_ageofonset', 'cancer3', 'cancer3_ageofonset',
    'cancer4', 'cancer4_ageofonset', 'cancer5', 'cancer5_ageofonset',
    'cancer6', 'cancer6_ageofonset', 'cancer7', 'cancer7_ageofonset',
    'cancer8', 'cancer8_ageofonset',
]]
#LFS: cancer1 is used as the detailed cancer diagnosis

#rows whose tissue type is Soft Tissue, per cohort
kST = kDf.loc[kDf['tissue_type']=='Soft Tissue',
              ['Samples_ID', 'ALT', 'ICDO-diagnosis', 'lfs_cancer_type_diagnosis']]
lST = lDf.loc[lDf['tissue_type']=='Soft Tissue',
              ['Samples_ID', 'ALT', 'cancer1', 'cancer_diagnosis']]

group = kST.drop(columns=['ALT']).groupby(['Samples_ID','ICDO-diagnosis']).value_counts().to_frame()
print('Overall KiCS Soft Tissue Mappings', group, '', sep='\n')

group1 = lST.drop(columns=['ALT']).groupby(['Samples_ID','cancer1']).value_counts().to_frame()
print('Overall LFS Soft Tissue Mappings', group1, '', sep='\n')

#rows whose overall diagnosis is RMS, per cohort
krms = kDf.loc[kDf['lfs_cancer_type_diagnosis']=='RMS',
               ['Samples_ID', 'ALT', 'ICDO-diagnosis', 'tissue_type']]
lrms = lDf.loc[lDf['cancer_diagnosis']=='RMS',
               ['Samples_ID', 'ALT', 'cancer1', 'tissue_type']]

group2 = krms.drop(columns=['ALT']).groupby(['Samples_ID','ICDO-diagnosis']).value_counts().to_frame()
print('Overall KiCS RMS Mappings', group2, '', sep='\n')

#group2 is reused for the LFS table, as in the original cell
group2 = lrms.drop(columns=['ALT']).groupby(['Samples_ID','cancer1']).value_counts().to_frame()
print('Overall LFS RMS Mappings', group2, '', sep='\n')

Overall KiCS Soft Tissue Mappings
                                                                                  0
Samples_ID ICDO-diagnosis                           lfs_cancer_type_diagnosis      
273308     8920/3 - Alveolar rhabdomyosarcoma       RMS                        1948
284386     8910/3 - Embryonal Rhabdomyosarcoma, NOS RMS                        4516
297043     8910/3 - Embryonal rhabdomyosarcoma, NOS RMS                        1073
298866     8910/3 - Embryonal Rhabdomyosarcoma, NOS RMS                        1096
305082     8910/3 - Embryonal rhabdomyosarcoma, NOS RMS                        2178

Overall LFS Soft Tissue Mappings
                                                                 0
Samples_ID cancer1                        cancer_diagnosis        
1092       Malignant fibrous histiocytoma MFH                 1079
1355       Anaplastic ERMS                RMS                 1058
1843       High Grade Sarcoma             High Grade Sarcoma  1080
2085     

In [10]:
# Cohort labels; later passed as the first two arguments of boxplotPoints.
titles = ['kics','lfs']
# NOTE: columns refers to the same list object as titles (an alias, not a copy).
columns = titles

In [11]:
"""
Label lists for the graphs:

1. SV type labels for the SV frequency graph
2. Chromosome labels for the chromosome specific graphs
"""

xLabels = ['DEL', 'DUP', 'INV']
# chromosomes '1'..'22' followed by the two sex chromosomes
uniqueLabels = [str(autosome) for autosome in range(1, 23)] + ['X', 'Y']


In [24]:
"""
Per-sample SV type tables used by the SV specific graphs
"""
# two-column views of each cohort: SV type and sample id (column order kept as before)
kicsSVTypedf = kicsTEF[['SV_type', 'Samples_ID']].rename(
    columns={'SV_type': 'kics', 'Samples_ID': 'kId'})
lfsSVTypedf = lfsTEF[['Samples_ID', 'SV_type']].rename(
    columns={'Samples_ID': 'lId', 'SV_type': 'lfs'})

# number of rows of each SV type per sample
ktemp = kicsSVTypedf.groupby(['kId']).value_counts()   ##kics
ltemp = lfsSVTypedf.groupby(['lId']).value_counts()    ##lfs

# distinct sample ids of each cohort
uniqueK = kicsSVTypedf['kId'].unique()
uniqueL = lfsSVTypedf['lId'].unique()

kId       kics
19_18442  INS     1067
245676    INS     1030
271113    INS     1095
271114    INS     1086
271115    INS     1120
                  ... 
323274    INS     1085
328067    INS     1181
5510      INS     1081
5511      INS     1107
5547      INS     1087
Length: 79, dtype: int64
[1067, 1030, 1095, 1086, 1120, 1064, 1091, 974, 1082, 1059, 1336, 948, 1052, 1129, 1091, 1071, 1087, 1080, 1059, 1072, 1092, 1120, 1118, 1126, 1108, 1083, 1148, 1114, 1026, 1130, 1087, 1074, 1107, 1273, 1099, 1089, 1061, 1093, 1114, 1097, 1073, 1083, 1087, 1182, 1132, 1079, 1096, 1093, 1183, 1076, 1064, 1140, 1068, 1073, 1298, 1085, 1303, 1069, 1083, 1053, 1058, 1036, 1105, 1069, 1078, 1102, 1089, 1073, 1131, 1080, 1117, 1073, 1092, 1116, 1085, 1181, 1081, 1107, 1087]


# SV Type

In [22]:
# Build the 'INS' data for each cohort from its sample ids and its per-sample SV counts.
# makeDataSet and makeUnequalDF come from the functions module; their exact behaviour
# is not visible in this notebook.
kINSData = makeDataSet(uniqueK, ktemp, 'INS')
lINSData = makeDataSet(uniqueL, ltemp, 'INS')
# presumably combines two lists of different length into one frame with 'kics' and 'lfs'
# columns (the shorter one padded with NaN) — confirm in functions
tempINSDF = makeUnequalDF(kINSData, lINSData)

[1116, 1073, 1117, 1091, 1120, 1030, 1108, 1080, 1114, 1107, 1102, 1132, 1064, 1131, 1074, 1303, 1083, 974, 1089, 1181, 1061, 1092, 1273, 1298, 1089, 1052, 1069, 1091, 1095, 1058, 1118, 1336, 1069, 1087, 1120, 1087, 948, 1126, 1073, 1093, 1087, 1059, 1093, 1078, 1081, 1099, 1092, 1036, 1085, 1097, 1140, 1068, 1130, 1085, 1083, 1182, 1076, 1114, 1183, 1064, 1105, 1087, 1086, 1072, 1073, 1073, 1083, 1026, 1067, 1148, 1059, 1079, 1129, 1082, 1096, 1053, 1080, 1071, 1107]
    kics     lfs
0   1116  1047.0
1   1073   846.0
2   1117  1077.0
3   1091  1082.0
4   1120  1079.0
5   1030  1053.0
6   1108  1080.0
7   1080  1092.0
8   1114  1081.0
9   1107  1057.0
10  1102  1029.0
11  1132  1086.0
12  1064  1096.0
13  1131  1325.0
14  1074  1091.0
15  1303  1032.0
16  1083  1064.0
17   974  1021.0
18  1089  1092.0
19  1181  1099.0
20  1061  1098.0
21  1092  1050.0
22  1273  1049.0
23  1298  1083.0
24  1089  1076.0
25  1052  1059.0
26  1069  1142.0
27  1091  1055.0
28  1095  1094.0
29  1058  1028.0


## INS

In [None]:
# Boxplot of the INS counts per cohort (boxplotPoints is a helper from functions;
# the two trailing 10s are presumably the figure size — confirm).
boxplotPoints(titles, columns, tempINSDF, 10, 10)

# Inspection output: the column names and any LFS rows with fewer than 200 insertions.
print(tempINSDF.columns)
print(tempINSDF.loc[tempINSDF['lfs']<200])

In [None]:
"""
Set-up for the chromosome specific graphs
"""
# plotting order of the chromosomes: '1'..'22', then 'X' and 'Y'
orderList = [str(autosome) for autosome in range(1, 23)] + ['X', 'Y']

# SV type, chromosome and sample id per cohort (column order kept as before)
kicsMore = kicsTEF[['SV_type', 'SV_chrom', 'Samples_ID']].rename(
    columns={'SV_type': 'SV', 'SV_chrom': 'chrom', 'Samples_ID': 'id'})
lfsMore = lfsTEF[['Samples_ID', 'SV_chrom', 'SV_type']].rename(
    columns={'Samples_ID': 'id', 'SV_chrom': 'chrom', 'SV_type': 'SV'})

# One value per entry of orderList, passed to formatDataFrame for normalisation.
# presumably chromosome lengths in base pairs (values look like GRCh37/hg19) — TODO confirm
numBPChrom = [
    249250621, 243199373, 198022430, 191154276,
    180915260, 171115067, 159138663, 146364022,
    141213431, 135534747, 135006516, 133851895,
    115169878, 107349540, 102531392, 90354753,
    81195210, 78077248, 59128983, 63025520,
    48129895, 51304566, 155270560, 59373566,
]

grouping = ['id', 'SV', 'chrom']

In [None]:
"""
Overall ALU / LINE1 / SVA counts per sample, KiCS vs LFS
"""

def _teCountsPerSample(teTable, cohort):
    """Return one row per sample id and one column per ALT value holding the row count, plus a 'dataset' column set to cohort."""
    counts = (teTable[['ALT', 'Samples_ID']]
              .rename(columns={'Samples_ID': 'id'})
              .groupby('id')
              .value_counts()
              .to_frame()
              .unstack(1))
    # keep only the ALT level of the column index
    counts.columns = counts.columns.droplevel(0)
    counts['dataset'] = cohort
    return counts

kicsALT = _teCountsPerSample(kicsTEF, 'kics')
lfsALT = _teCountsPerSample(lfsTEF, 'lfs')

mergedAlt = pd.concat([kicsALT, lfsALT]).reset_index()

#one "overall" boxplot and one Mann-Whitney U test per TE class
for teClass in ('ALU', 'LINE1', 'SVA'):
    sns.boxplot(data=mergedAlt, x='dataset', y=teClass, medianprops={"linewidth": 4, "color": 'black'})
    plt.show()

    print(mannwhitneyu(kicsALT[teClass].dropna(), lfsALT[teClass].dropna()))

In [None]:
"""
ALU: chromosome specific boxplots, raw counts first and then the normalised values
(formatDataFrame is given numBPChrom for the normalisation)
"""

# TE class, sample id and chromosome per cohort; reused by the LINE1 and SVA cells
kicsChromAlt = kicsTEF[['ALT', 'Samples_ID', 'SV_chrom']].rename(
    columns={'Samples_ID': 'id', 'SV_chrom': 'chrom'})
lfsChromAlt = lfsTEF[['ALT', 'Samples_ID', 'SV_chrom']].rename(
    columns={'Samples_ID': 'id', 'SV_chrom': 'chrom'})

AltKdf, normAltKdf = formatDataFrame(kicsChromAlt, ['id','ALT','chrom'], numBPChrom, uniqueK, uniqueLabels, 'ALU')
AltLdf, normAltLdf = formatDataFrame(lfsChromAlt, ['id','ALT','chrom'], numBPChrom, uniqueL, uniqueLabels, 'ALU')

# tag every frame with its cohort
for cohortFrame, cohort in ((AltKdf, 'kics'), (AltLdf, 'lfs'),
                            (normAltKdf, 'kics'), (normAltLdf, 'lfs')):
    cohortFrame['dataset'] = cohort

AltMergedDf = (pd.concat([AltKdf, AltLdf])
               .rename(columns={0: 'occ'})
               .reset_index()
               .rename(columns={'index': 'chrom'}))

##ALU

plt.rcParams["figure.figsize"] = [10, 10]
BoxGraphMulti(AltMergedDf, 'chrom', 'occ', 'dataset',orderList)

normAltDf = pd.concat([normAltKdf, normAltLdf]).reset_index()

BoxGraphMulti(normAltDf, 'chrom', 'normalized', 'dataset',orderList)
    

In [None]:
"""
LINE1: chromosome specific boxplots, raw counts first and then the normalised values
"""

AltKdf, normAltKdf = formatDataFrame(kicsChromAlt, ['id','ALT','chrom'], numBPChrom, uniqueK, uniqueLabels, 'LINE1')
AltLdf, normAltLdf = formatDataFrame(lfsChromAlt, ['id','ALT','chrom'], numBPChrom, uniqueL, uniqueLabels, 'LINE1')

# tag every frame with its cohort
for cohortFrame, cohort in ((AltKdf, 'kics'), (AltLdf, 'lfs'),
                            (normAltKdf, 'kics'), (normAltLdf, 'lfs')):
    cohortFrame['dataset'] = cohort

AltMergedDf = (pd.concat([AltKdf, AltLdf])
               .rename(columns={0: 'occ'})
               .reset_index()
               .rename(columns={'index': 'chrom'}))

###LINE1

BoxGraphMulti(AltMergedDf, 'chrom', 'occ', 'dataset',orderList)

normAltDf = pd.concat([normAltKdf, normAltLdf]).reset_index()

BoxGraphMulti(normAltDf, 'chrom', 'normalized', 'dataset',orderList)

In [None]:
"""
SVA: chromosome specific boxplots, raw counts first and then the normalised values
"""

AltKdf, normAltKdf = formatDataFrame(kicsChromAlt, ['id','ALT','chrom'], numBPChrom, uniqueK, uniqueLabels, 'SVA')
AltLdf, normAltLdf = formatDataFrame(lfsChromAlt, ['id','ALT','chrom'], numBPChrom, uniqueL, uniqueLabels, 'SVA')

# tag every frame with its cohort
for cohortFrame, cohort in ((AltKdf, 'kics'), (AltLdf, 'lfs'),
                            (normAltKdf, 'kics'), (normAltLdf, 'lfs')):
    cohortFrame['dataset'] = cohort

AltMergedDf = (pd.concat([AltKdf, AltLdf])
               .rename(columns={0: 'occ'})
               .reset_index()
               .rename(columns={'index': 'chrom'}))

BoxGraphMulti(AltMergedDf, 'chrom', 'occ', 'dataset',orderList)

normAltDf = pd.concat([normAltKdf, normAltLdf]).reset_index()

BoxGraphMulti(normAltDf, 'chrom', 'normalized', 'dataset',orderList)

In [None]:
"""
INS: chromosome specific boxplots, raw counts first and then the normalised values
"""
newKdf, normKdf = formatDataFrame(kicsMore, grouping, numBPChrom, uniqueK, uniqueLabels, 'INS')
newLdf, normLdf = formatDataFrame(lfsMore, grouping, numBPChrom, uniqueL, uniqueLabels, 'INS')

# tag every frame with its cohort
for cohortFrame, cohort in ((newKdf, 'kics'), (newLdf, 'lfs'),
                            (normKdf, 'kics'), (normLdf, 'lfs')):
    cohortFrame['dataset'] = cohort

newMergedDf = (pd.concat([newKdf, newLdf])
               .rename(columns={0: 'occ'})
               .reset_index()
               .rename(columns={'index': 'chrom'}))

BoxGraphMulti(newMergedDf, 'chrom', 'occ', 'dataset',orderList)

normMergedDf = pd.concat([normKdf, normLdf]).reset_index()

BoxGraphMulti(normMergedDf, 'chrom', 'normalized', 'dataset',orderList)

# Copied and pasted from Cancer Graphs

In [None]:
"""
Default figure size for the boxplots that follow
"""
plt.rcParams.update({"figure.figsize": [20, 15]})


In [None]:
"""
Age of onset: LFS cancer vs LFS unaffected vs KiCS cancer
"""
# split the LFS rows on tissue type and drop the columns not needed here
isUnaffected = bigLdf['tissue_type'] == 'Unaffected'
lfsUnusedCols = ['sample', 'tissue_type', 'cancer_diagnosis', 'SV_type']
lfsCancer = bigLdf[~isUnaffected].drop(columns=lfsUnusedCols)
lfsUnaffected = bigLdf[isUnaffected].drop(columns=lfsUnusedCols)

kicsCancer = bigKdf.drop(columns=['abbv_id', 'SV_type', 'sample', 'tumour_class',
                                  'tissue_type', 'cancer_diagnosis'])
kicsCancer['ageofonset'] = kicsCancer['ageofonset'].astype('float')

# 'age6' is computed row by row by the cohort-specific helpers from functions;
# unaffected rows get the fixed label 'Unaffected'
lfsCancer['age6'] = lfsCancer.apply(ageOfOnsetLfs, axis=1)
kicsCancer['age6'] = kicsCancer.apply(ageOfOnsetKics, axis=1)
lfsUnaffected['age6'] = 'Unaffected'

# per TE class subsets of each group
teClasses = ('ALU', 'LINE1', 'SVA')
LCAlu, LCLine1, LCSva = (lfsCancer.loc[lfsCancer['ALT'] == teClass] for teClass in teClasses)
LUAlu, LULine1, LUSva = (lfsUnaffected.loc[lfsUnaffected['ALT'] == teClass] for teClass in teClasses)
KCAlu, KCLine1, KCSva = (kicsCancer.loc[kicsCancer['ALT'] == teClass] for teClass in teClasses)

#boxplot for "overall" cancer vs unaffected (all TE classes together)
mergeDFs(lfsCancer,lfsUnaffected, kicsCancer)

# Alu

In [None]:
# Same age-of-onset comparison as the overall graph, restricted to the ALU rows of
# LFS cancer, LFS unaffected and KiCS cancer (mergeDFs is a helper from functions).
mergeDFs(LCAlu, LUAlu, KCAlu)

# Line1

In [None]:
# Same age-of-onset comparison as the overall graph, restricted to the LINE1 rows of
# LFS cancer, LFS unaffected and KiCS cancer (mergeDFs is a helper from functions).
mergeDFs(LCLine1, LULine1, KCLine1)

# Sva

In [None]:
# Same age-of-onset comparison as the overall graph, restricted to the SVA rows of
# LFS cancer, LFS unaffected and KiCS cancer (mergeDFs is a helper from functions).
mergeDFs(LCSva, LUSva, KCSva)

In [None]:
"""
Intracohort comparisons by tissue type and by diagnosis
"""
#Overall: working copies without the columns that are not compared here
ldf = bigLdf.drop(columns=['sample', 'SV_type', 'ageofonset'])
kdf = bigKdf.drop(columns=['sample', 'SV_type', 'abbv_id', 'ageofonset'])

# LFS: number of rows per remaining column combination once ALT and diagnosis are removed
ldfGenTissue = (ldf.drop(columns=['ALT', 'cancer_diagnosis'])
                .value_counts()
                .to_frame()
                .reset_index())

graphBoxGen('tissue_type', ldfGenTissue)

In [None]:
# LFS: number of rows per remaining column combination once ALT and tissue type are removed
ldfGenDiag = (ldf.drop(columns=['ALT', 'tissue_type'])
              .value_counts()
              .to_frame()
              .reset_index())

graphBoxGen('cancer_diagnosis', ldfGenDiag)

In [None]:
# KiCS: number of rows per remaining column combination once ALT and diagnosis are removed
kdfGenTissue = (kdf.drop(columns=['ALT', 'cancer_diagnosis'])
                .value_counts()
                .to_frame()
                .reset_index())
graphBoxGen('tissue_type', kdfGenTissue)

In [None]:
# KiCS: number of rows per remaining column combination once ALT and tissue type are removed
kdfGenDiag = (kdf.drop(columns=['ALT', 'tissue_type'])
              .value_counts()
              .to_frame()
              .reset_index())
graphBoxGen('cancer_diagnosis', kdfGenDiag)

In [None]:
"""
TE class specific (ALU, LINE1, SVA) tables, KiCS then LFS.
Tissue type graphs come first, diagnosis graphs after.
"""
# rows counted per (sample, TE class, tissue type); unstack(1) turns the TE classes into columns
tissueKeys = ['Samples_ID', 'ALT', 'tissue_type']
kT = kdf.drop(columns=['cancer_diagnosis']).groupby(tissueKeys).size().unstack(1)
lT = ldf.drop(columns=['cancer_diagnosis']).groupby(tissueKeys).size().unstack(1)

# same layout keyed on the diagnosis instead of the tissue type
diagnosisKeys = ['Samples_ID', 'ALT', 'cancer_diagnosis']
kD = kdf.drop(columns=['tissue_type']).groupby(diagnosisKeys).size().unstack(1)
lD = ldf.drop(columns=['tissue_type']).groupby(diagnosisKeys).size().unstack(1)

# KiCS tissue types, ALU column only
graphBoxGen('tissue_type', kT.drop(columns=['LINE1', 'SVA']).reset_index(), 'ALU')

In [None]:
# KiCS tissue types, LINE1 column only
graphBoxGen('tissue_type', kT.drop(columns=['ALU', 'SVA']).reset_index(), 'LINE1')

In [None]:
# KiCS tissue types, SVA column only
graphBoxGen('tissue_type', kT.drop(columns=['LINE1', 'ALU']).reset_index(), 'SVA')

In [None]:
#LFS tissue types, ALU column only
graphBoxGen('tissue_type', lT.drop(columns=['LINE1', 'SVA']).reset_index(), 'ALU')

In [None]:
# LFS tissue types, LINE1 column only
graphBoxGen('tissue_type', lT.drop(columns=['SVA', 'ALU']).reset_index(), 'LINE1')

In [None]:
# LFS tissue types, SVA column only
graphBoxGen('tissue_type', lT.drop(columns=['LINE1', 'ALU']).reset_index(), 'SVA')

In [None]:
#KiCS diagnoses, ALU column only
graphBoxGen('cancer_diagnosis', kD.drop(columns=['LINE1', 'SVA']).reset_index(), 'ALU')

In [None]:
# KiCS diagnoses, LINE1 column only
graphBoxGen('cancer_diagnosis', kD.drop(columns=['ALU', 'SVA']).reset_index(), 'LINE1')

In [None]:
# KiCS diagnoses, SVA column only
graphBoxGen('cancer_diagnosis', kD.drop(columns=['LINE1', 'ALU']).reset_index(), 'SVA')

In [None]:
# LFS diagnoses, ALU column only
graphBoxGen('cancer_diagnosis', lD.drop(columns=['LINE1', 'SVA']).reset_index(), 'ALU')

In [None]:
# LFS diagnoses, LINE1 column only
graphBoxGen('cancer_diagnosis', lD.drop(columns=['ALU', 'SVA']).reset_index(), 'LINE1')

In [None]:
# LFS diagnoses, SVA column only
graphBoxGen('cancer_diagnosis', lD.drop(columns=['LINE1', 'ALU']).reset_index(), 'SVA')

In [None]:
"""
LFS cancer vs unaffected
One "overall" comparison plus one per TE class (ALU, LINE1, SVA)
"""

def _freqPerSample(rows, datasetLabel):
    """Count the rows of each distinct column combination in rows.

    Returns a frame with the original columns, the count in 'freq' and a 'dataset'
    column set to datasetLabel. The count Series is named before reset_index so the
    column is called 'freq' whatever name value_counts gives it by default.
    """
    freq = rows.value_counts().rename('freq').reset_index()
    freq['dataset'] = datasetLabel
    return freq


def _sideBySide(unaffectedFreq, cancerFreq):
    """Put both 'freq' columns next to each other, aligned on row position (the shorter one ends in NaN)."""
    return pd.DataFrame(data={'unaffected': unaffectedFreq['freq'], 'cancer': cancerFreq['freq']})


#checked if unaffected tissue type is unaffected diag and vice versa -> they are
clinicalCols = ['sample', 'tissue_type', 'cancer_diagnosis', 'SV_type', 'ageofonset']
isUnaffected = bigLdf['tissue_type'] == 'Unaffected'
unaffectedDf = bigLdf[isUnaffected].drop(columns=clinicalCols)
cancerDf = bigLdf[~isUnaffected].drop(columns=clinicalCols)

# overall: every TE class counted together per sample
uAll = _freqPerSample(unaffectedDf.drop(columns=['ALT']), 'unaffected')
cAll = _freqPerSample(cancerDf.drop(columns=['ALT']), 'cancer')
mergedAll = _sideBySide(uAll, cAll)
print('ALL')
outliersInDf(pd.concat([uAll,cAll]),'dataset','freq')

# per TE class
uAlu = _freqPerSample(unaffectedDf[unaffectedDf['ALT'] == 'ALU'], 'unaffected')
cAlu = _freqPerSample(cancerDf[cancerDf['ALT'] == 'ALU'], 'cancer')
mergedAlu = _sideBySide(uAlu, cAlu)
print('ALU')
outliersInDf(pd.concat([uAlu,cAlu]),'dataset','freq')

uLine = _freqPerSample(unaffectedDf[unaffectedDf['ALT'] == 'LINE1'], 'unaffected')
cLine = _freqPerSample(cancerDf[cancerDf['ALT'] == 'LINE1'], 'cancer')
mergedLine = _sideBySide(uLine, cLine)
print('LINE1')
outliersInDf(pd.concat([uLine,cLine]),'dataset','freq')

uSva = _freqPerSample(unaffectedDf[unaffectedDf['ALT'] == 'SVA'], 'unaffected')
cSva = _freqPerSample(cancerDf[cancerDf['ALT'] == 'SVA'], 'cancer')
mergedSva = _sideBySide(uSva, cSva)
print('SVA')
outliersInDf(pd.concat([uSva,cSva]),'dataset','freq')

title = ['unaffected','cancer']

# Every plot is announced with its own label, so the ALU plot no longer appears
# under the 'SVA' outlier output printed just before it.
print('ALU')
boxplotPoints(title, title, mergedAlu)
print('LINE1')
boxplotPoints(title, title, mergedLine)
print('SVA')
boxplotPoints(title, title, mergedSva)

print('ALL')
boxplotPoints(title, title, mergedAll)


In [None]:
# diagnosis per row with its sample id, for each cohort
kDiagList = bigKdf[['cancer_diagnosis', 'Samples_ID']].rename(
    columns={'cancer_diagnosis': 'diag', 'Samples_ID': 'id'})
lDiagList = bigLdf[['cancer_diagnosis', 'Samples_ID']].rename(
    columns={'cancer_diagnosis': 'diag', 'Samples_ID': 'id'})

# tissue type per row with its sample id, for each cohort
ktList = bigKdf[['tissue_type', 'Samples_ID']].rename(columns={'Samples_ID': 'id'})
ltList = bigLdf[['tissue_type', 'Samples_ID']].rename(columns={'Samples_ID': 'id'})

In [None]:
# KiCS: keep only what is needed for the tissue-type and for the diagnosis comparison
kicsTissue = bigKdf.drop(columns=['SV_type', 'abbv_id', 'sample', 'tumour_class',
                                  'cancer_diagnosis', 'ageofonset'])
kicsDiag = bigKdf.drop(columns=['SV_type', 'abbv_id', 'sample', 'tumour_class',
                                'ageofonset', 'tissue_type'])

# LFS: same two views
lfsTissue = bigLdf.drop(columns=['cancer_diagnosis', 'sample', 'SV_type', 'ageofonset'])
lfsDiag = bigLdf.drop(columns=['tissue_type', 'sample', 'SV_type', 'ageofonset'])

# KiCS vs LFS by diagnosis; the last argument is False for the diagnosis frames
graphSVTissue(kicsDiag, lfsDiag, False)

In [None]:
# KiCS vs LFS by tissue type; the last argument is True for the tissue-type frames
# (graphSVTissue is a helper from functions).
graphSVTissue(kicsTissue, lfsTissue, True)

In [None]:
"""
Per TE class frames for the LFS vs KiCS tissue and diagnosis graphs
"""
# NOTE: kDF / lDF differ only by letter case from kDf / lDf and kdf / ldf used in other cells
kDF = bigKdf.drop(columns=['SV_type', 'abbv_id', 'sample', 'tumour_class', 'ageofonset'])
lDF = bigLdf.drop(columns=['SV_type', 'sample', 'ageofonset'])

# one subset per TE class and cohort
teClasses = ('ALU', 'LINE1', 'SVA')
kAlu, kLine, kSva = (kDF.loc[kDF['ALT'] == teClass] for teClass in teClasses)
lAlu, lLine, lSva = (lDF.loc[lDF['ALT'] == teClass] for teClass in teClasses)

In [None]:
# tissue type, ALU rows only (diagnosis column removed from both cohorts)
graphSVTissue(kAlu.drop(columns=['cancer_diagnosis']), lAlu.drop(columns=['cancer_diagnosis']), True)

In [None]:
# tissue type, LINE1 rows only (diagnosis column removed from both cohorts)
graphSVTissue(kLine.drop(columns=['cancer_diagnosis']), lLine.drop(columns=['cancer_diagnosis']), True)

In [None]:
# tissue type, SVA rows only (diagnosis column removed from both cohorts)
graphSVTissue(kSva.drop(columns=['cancer_diagnosis']), lSva.drop(columns=['cancer_diagnosis']), True)

In [None]:
#diagnosis, ALU rows only (tissue type column removed from both cohorts)
graphSVTissue(kAlu.drop(columns=['tissue_type']), lAlu.drop(columns=['tissue_type']), False)

In [None]:
# diagnosis, LINE1 rows only (tissue type column removed from both cohorts)
graphSVTissue(kLine.drop(columns=['tissue_type']), lLine.drop(columns=['tissue_type']), False)

In [None]:
# diagnosis, SVA rows only (tissue type column removed from both cohorts)
graphSVTissue(kSva.drop(columns=['tissue_type']), lSva.drop(columns=['tissue_type']), False)

In [None]:
"""
RMS subtype graphs
"""
#rms (cancer type) and soft tissue (tissue type) in kics has the same samples,
#so only the RMS filter is applied on the KiCS side
isKicsRms = UberbigKdf['lfs_cancer_type_diagnosis'] == 'RMS'
rmsKDf = (UberbigKdf.loc[isKicsRms, ['Samples_ID', 'ICDO-diagnosis']]
          .groupby(['Samples_ID'])
          .value_counts()
          .to_frame()
          .reset_index())
rmsKDf['dataset'] = 'kics'

rmsLDf = UberbigLdf[['Samples_ID', 'cancer1', 'tissue_type', 'cancer_diagnosis']]

#rms and soft tissue in lfs do not have the same samples,
#so the LFS soft tissue rows are first broken down by cancer diagnosis
stLDf = (rmsLDf.loc[rmsLDf['tissue_type'] == 'Soft Tissue']
         .drop(columns=['tissue_type'])
         .groupby(['Samples_ID', 'cancer_diagnosis'])
         .value_counts()
         .to_frame()
         .reset_index())

# graphing soft tissue mappings to cancer diagnosis column (lfs); the count column is named 0
ax = sns.boxplot(data=stLDf, x='cancer_diagnosis', y=0, medianprops={"linewidth": 4, 'color':'black'})

# every unordered pair of diagnoses gets a Mann-Whitney annotation
uniqueList = list(stLDf['cancer_diagnosis'].unique())
listPairing = [(first, second)
               for position, first in enumerate(uniqueList)
               for second in uniqueList[position + 1:]]

annot = Annotator(ax, listPairing, data=stLDf, x='cancer_diagnosis', y=0)
annot.configure(test='Mann-Whitney', text_format='star', loc='outside', verbose=2)
annot.apply_and_annotate()
plt.show()

# LFS rows diagnosed as RMS, counted per sample and detailed diagnosis
rmsLDf = (rmsLDf.loc[rmsLDf['cancer_diagnosis'] == 'RMS']
          .drop(columns=['tissue_type', 'cancer_diagnosis'])
          .groupby(['Samples_ID'])
          .value_counts()
          .to_frame()
          .reset_index())
rmsLDf['dataset'] = 'lfs'

# common column names for both cohorts before deriving the RMS subtype
rmsLDf = rmsLDf.rename(columns={'cancer1': 'type', 0: 'freq'})
rmsKDf = rmsKDf.rename(columns={'ICDO-diagnosis': 'type', 0: 'freq'})

# rmsType (from functions) is applied row by row to label each row's RMS subtype
rmsLDf['rms_type'] = rmsLDf.apply(rmsType, axis=1)
rmsKDf['rms_type'] = rmsKDf.apply(rmsType, axis=1)

mergedRMS = pd.concat([rmsKDf, rmsLDf])

#kics vs lfs, graphing RMS subtypes
bx = sns.boxplot(data=mergedRMS, x='rms_type', y='freq',hue = 'dataset', medianprops={"linewidth": 2, 'color':'black'})

# (subtype, cohort) pairs that get a Mann-Whitney annotation
listPairing = [
    (('ERMS','kics'), ('ERMS','lfs')),
    (('ERMS','kics'), ('ARMS','lfs')),
    (('ARMS','kics'), ('ERMS','lfs')),
    (('ARMS','kics'), ('ARMS','lfs')),
    (('ARMS','kics'), ('ERMS','kics')),
]

annot = Annotator(bx, listPairing, data=mergedRMS, x='rms_type', y='freq',hue = 'dataset')
annot.configure(test='Mann-Whitney', text_format='star', loc='outside', verbose=2)
annot.apply_and_annotate()
plt.show()