In [338]:
import pandas as pd 
import numpy as np

import sbmlcore
import pytest

import warnings
warnings.filterwarnings('ignore')

Load MIC (continous, but also binary available) phenotypic data from CRyPTIC table

In [339]:
PHENOTYPES = pd.read_pickle('data/UKMYC_PHENOTYPES.pkl.gz')
PHENOTYPES.reset_index(inplace = True)
PHENOTYPES.set_index('UNIQUEID', inplace = True)
PHENOTYPES = PHENOTYPES[(PHENOTYPES.DRUG == 'LEV') | (PHENOTYPES.DRUG == 'MXF')]
PHENOTYPES

Unnamed: 0_level_0,DRUG,PLATEDESIGN,BELONGS_GPI,SITEID,DILUTION,PHENOTYPE_QUALITY,READINGDAY,PRIMARY_DILUTION,PRIMARY_METHOD,AMYGDA_DILUTION,BASHTHEBUG_DILUTION,BASHTHEBUGPRO_DILUTION,PHENOTYPE_DESCRIPTION,BASHTHEBUG_NUMBER_CLASSIFICATIONS,BASHTHEBUGPRO_NUMBER_CLASSIFICATIONS,MIC,LOG2MIC,BINARY_PHENOTYPE
UNIQUEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
site.06.subj.06TB_1032.lab.06MIL2037.iso.1,MXF,UKMYC6,True,06,3.0,HIGH,14,3.0,VZ,3.0,8.0,,"VZ,IM AGREE",11.0,,0.25,-2.00,S
site.06.subj.06TB_0404.lab.06MIL1373.iso.1,MXF,UKMYC6,True,06,3.0,HIGH,21,3.0,VZ,3.0,8.0,,"VZ,IM AGREE",11.0,,0.25,-2.00,S
site.02.subj.1518.lab.2013102308.iso.1,LEV,UKMYC6,False,02,3.0,LOW,14,3.0,VZ,2.0,4.0,3.0,ALL DISAGREE,11.0,3.0,0.5,-1.00,S
site.06.subj.SSM_0197-R.lab.06MIL0117.iso.1,MXF,UKMYC5,True,06,1.0,HIGH,14,1.0,VZ,1.0,8.0,,"VZ,IM AGREE",11.0,,<=0.06,-4.06,S
site.06.subj.06TB_1545.lab.06MIL2824.iso.1,MXF,UKMYC6,True,06,2.0,LOW,14,2.0,VZ,4.0,8.0,4.0,ALL DISAGREE,11.0,3.0,0.12,-3.06,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
site.02.subj.0016.lab.2014231005.iso.1,LEV,UKMYC5,True,02,2.0,HIGH,14,2.0,VZ,2.0,2.0,,"VZ,IM AGREE",11.0,,0.25,-2.00,S
site.05.subj.CA-0547.lab.CO-11838-18.iso.1,LEV,UKMYC6,False,05,3.0,HIGH,14,3.0,VZ,3.0,3.0,,"VZ,IM AGREE",11.0,,0.5,-1.00,S
site.03.subj.GB-90840137.lab.IML-01497.iso.1,LEV,UKMYC6,False,03,2.0,LOW,14,2.0,VZ,1.0,5.0,2.0,ALL DISAGREE,11.0,3.0,0.25,-2.00,S
site.05.subj.LS-1114.lab.MA-00010-19.iso.1,LEV,UKMYC6,True,05,3.0,HIGH,14,3.0,VZ,3.0,4.0,,"VZ,IM AGREE",11.0,,0.5,-1.00,S


Load mutation data for gyrA/B genes

In [340]:
MUTATIONS = pd.read_pickle('data/MUTATIONS_SAMPLE.pkl.gz')
MUTATIONS = MUTATIONS.reset_index()
MUTATIONS.set_index('UNIQUEID', inplace = True)

#R463L is a known phylogenetic marker for Beijing lineage
MUTATIONS = MUTATIONS[(~MUTATIONS.IS_NULL) & (MUTATIONS.IS_FILTER_PASS) & (~MUTATIONS.IS_HET) & (MUTATIONS.IS_SNP) \
                    & (MUTATIONS.IN_CDS) & (MUTATIONS.GENE == 'katG') & (MUTATIONS.IS_NONSYNONYMOUS) & (~(MUTATIONS.MUTATION == 'R463L'))]
MUTATIONS

Unnamed: 0_level_0,GENE,MUTATION,POSITION,AMINO_ACID_NUMBER,GENOME_INDEX,NUCLEOTIDE_NUMBER,REF,ALT,IS_SNP,IS_INDEL,...,IS_HET,IS_NULL,IS_FILTER_PASS,ELEMENT_TYPE,MUTATION_TYPE,INDEL_LENGTH,INDEL_1,INDEL_2,SITEID,NUMBER_NUCLEOTIDE_CHANGES
UNIQUEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
site.02.subj.0122.lab.22A076.iso.1,katG,P232A,232.0,232.0,,,ccg,gcg,True,False,...,False,False,True,GENE,AAM,,,,02,1
site.05.subj.LR-3059.lab.CR-00865-16.iso.1,katG,S315T,315.0,315.0,,,agc,acc,True,False,...,False,False,True,GENE,AAM,,,,05,1
site.05.subj.LR-2342.lab.FN-01704-18.iso.1,katG,S315T,315.0,315.0,,,agc,acc,True,False,...,False,False,True,GENE,AAM,,,,05,1
site.05.subj.LI2163509.lab.14722_6_94.iso.1,katG,S315T,315.0,315.0,,,agc,acc,True,False,...,False,False,True,GENE,AAM,,,,05,1
site.05.subj.LI2149109.lab.14893_2_65.iso.1,katG,S315T,315.0,315.0,,,agc,acc,True,False,...,False,False,True,GENE,AAM,,,,05,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
site.10.subj.YA00127822.lab.YA00127822.iso.1,katG,S315T,315.0,315.0,,,agc,acc,True,False,...,False,False,True,GENE,AAM,,,,10,1
site.10.subj.PL00350106.lab.PL00350106.iso.1,katG,S315T,315.0,315.0,,,agc,acc,True,False,...,False,False,True,GENE,AAM,,,,10,1
site.10.subj.YA00024313.lab.YA00024313.iso.1,katG,S315T,315.0,315.0,,,agc,acc,True,False,...,False,False,True,GENE,AAM,,,,10,1
site.10.subj.YA00025292.lab.YA00025292.iso.1,katG,S315G,315.0,315.0,,,agc,ggc,True,False,...,False,False,True,GENE,AAM,,,,10,1


In [341]:
#read in mutations (pre-filtered for gyrA/B mutations due to size of mutations data file)
MUTATIONS = pd.read_csv('data/MUTATIONS-gyrA_B.csv.gz', compression='gzip', header = None)

#restore header through shortened mutations file
HEADER = pd.read_pickle('data/MUTATIONS_SAMPLE.pkl.gz', compression='gzip')

HEADER.reset_index(inplace = True)
HEADER = HEADER.columns.to_list()
MUTATIONS.columns = HEADER

#remove non-synonymous, promoter and mutations with inconclusive sequencing, as well as indels
MUTATIONS = MUTATIONS[(~MUTATIONS.IS_SYNONYMOUS) & (~MUTATIONS.IN_PROMOTER) & (MUTATIONS.IS_SNP) & (~MUTATIONS.IS_NULL) & (MUTATIONS.IS_FILTER_PASS)]
MUTATIONS['GENE_MUTATION'] = MUTATIONS['GENE'] + '_' + MUTATIONS['MUTATION']

#label lineage mutations
#ref for lineage mutations: Miotto ERJ 2017
LINEAGE_MUTATIONS=['gyrA_S95T','gyrA_E21Q','gyrA_T80A','gyrA_A90G','gyrA_G247S','gyrA_A384V',
             'gyrA_G668D','gyrA_L712V', 'gyrA_S250A','gyrA_R252L','gyrA_L398F','gyrA_A463S','gyrA_D639A','gyrA_V742L',
              'gyrB_V301L','gyrB_M291I','gyrB_A403S']
MUTATIONS['LINEAGE_MUTATION'] = MUTATIONS.GENE_MUTATION.isin(LINEAGE_MUTATIONS)

#record how many mutations we see for each sample per UNIQUEID -> we can only perform mutation based approach for samples with < 2 non-lineage mutations in pncA
MUTATIONS.set_index('UNIQUEID', inplace = True)

number_mutations = MUTATIONS[MUTATIONS.LINEAGE_MUTATION == False].index.value_counts()
number_mutations.name = 'MUTATION_COUNT'
MUTATIONS = MUTATIONS.join(number_mutations.to_frame())

print("Out of the", len(MUTATIONS[(MUTATIONS.LINEAGE_MUTATION == False)]), 
      "samples with non-lineage mutations in gyrA/B", 
      len(MUTATIONS[(MUTATIONS.LINEAGE_MUTATION == False) & (MUTATIONS.MUTATION_COUNT < 2)]), 
          "show only one such mutation per sample")

Out of the 5063 samples with non-lineage mutations in gyrA/B 3974 show only one such mutation per sample


In [342]:
#amount of eligible samples where phenotypic data is available
print("Good-quality Phenotypic data is available for", len(PHENOTYPES[PHENOTYPES.index.isin(MUTATIONS[(MUTATIONS.LINEAGE_MUTATION == False) 
                                                                                         & (MUTATIONS.MUTATION_COUNT < 2) ].index.to_list()) 
                                                                      & (PHENOTYPES.PHENOTYPE_QUALITY!='LOW')].index.unique()),
      "of samples that show only one non-lineage mutation per sample in gyrA/B")

Good-quality Phenotypic data is available for 2441 of samples that show only one non-lineage mutation per sample in gyrA/B


Remove low quality phenotypes and divide data by drug (LEV, MXF)

In [343]:
LEV_PHENOTYPES = PHENOTYPES[(PHENOTYPES.index.isin(MUTATIONS[(MUTATIONS.LINEAGE_MUTATION == False) 
                                           & (MUTATIONS.MUTATION_COUNT < 2)].index.to_list())) 
           & (PHENOTYPES.DRUG == 'LEV') & (PHENOTYPES.PHENOTYPE_QUALITY!='LOW')]
LEV_PHENOTYPES.BINARY_PHENOTYPE.value_counts()

BINARY_PHENOTYPE
R    1236
S     903
I       0
Name: count, dtype: int64

In [344]:
MXF_PHENOTYPES = PHENOTYPES[(PHENOTYPES.index.isin(MUTATIONS[(MUTATIONS.LINEAGE_MUTATION == False) 
                                           & (MUTATIONS.MUTATION_COUNT < 2)].index.to_list())) 
           & (PHENOTYPES.DRUG == 'MXF') & (PHENOTYPES.PHENOTYPE_QUALITY!='LOW')]
MXF_PHENOTYPES.BINARY_PHENOTYPE.value_counts()

BINARY_PHENOTYPE
R    1031
S     981
I       0
Name: count, dtype: int64

Join the mutations data with the phenotypic data frame for both drugs

In [345]:
MUTATIONS = MUTATIONS[(MUTATIONS.LINEAGE_MUTATION == False) & (MUTATIONS.MUTATION_COUNT < 2)]

LEV_DATASET = pd.merge(LEV_PHENOTYPES, MUTATIONS, how = 'left', on = 'UNIQUEID')
LEV_DATASET = LEV_DATASET[['DRUG', 'MIC', 'LOG2MIC', 'BINARY_PHENOTYPE', 'PHENOTYPE_QUALITY', 'GENE', 'MUTATION', 'POSITION', 'GENE_MUTATION']]
LEV_DATASET

Unnamed: 0_level_0,DRUG,MIC,LOG2MIC,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,GENE,MUTATION,POSITION,GENE_MUTATION
UNIQUEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
site.06.subj.06TB_0027.lab.06MIL0488.iso.1,LEV,2.0,1.0,R,MEDIUM,gyrA,A90V,90.0,gyrA_A90V
site.06.subj.06TB_0843.lab.06MIL1790.iso.1,LEV,>8,4.0,R,HIGH,gyrA,D94Y,94.0,gyrA_D94Y
site.04.subj.00624.lab.709302.iso.1,LEV,>8,4.0,R,HIGH,gyrA,D94N,94.0,gyrA_D94N
site.11.subj.XTB-18-196.lab.XTB-18-196.iso.1,LEV,2.0,1.0,R,MEDIUM,gyrB,N499D,499.0,gyrB_N499D
site.10.subj.YA00021495.lab.YA00021495.iso.1,LEV,0.5,-1.0,S,HIGH,gyrA,N193S,193.0,gyrA_N193S
...,...,...,...,...,...,...,...,...,...
site.10.subj.XD02435891.lab.XD02435891.iso.1,LEV,8.0,3.0,R,HIGH,gyrA,D94N,94.0,gyrA_D94N
site.02.subj.0482.lab.235081-14.iso.1,LEV,1.0,0.0,S,HIGH,gyrB,G512R,512.0,gyrB_G512R
site.05.subj.LR-2321.lab.FN-01077-18.iso.1,LEV,>8,4.0,R,HIGH,gyrA,G88C,88.0,gyrA_G88C
site.11.subj.XTB_18-072.lab.XTB_18-072.iso.1,LEV,8.0,3.0,R,MEDIUM,gyrA,D94N,94.0,gyrA_D94N


In [346]:
MXF_DATASET = pd.merge(MXF_PHENOTYPES, MUTATIONS, how = 'left', on = 'UNIQUEID')
MXF_DATASET = MXF_DATASET[['DRUG', 'MIC', 'LOG2MIC', 'BINARY_PHENOTYPE', 'PHENOTYPE_QUALITY', 'GENE', 'MUTATION', 'POSITION', 'GENE_MUTATION']]
MXF_DATASET

Unnamed: 0_level_0,DRUG,MIC,LOG2MIC,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,GENE,MUTATION,POSITION,GENE_MUTATION
UNIQUEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
site.04.subj.00097.lab.633433.iso.1,MXF,2.0,1.00,R,HIGH,gyrA,D94G,94.0,gyrA_D94G
site.10.subj.UH00935117.lab.UH00935117.iso.1,MXF,4.0,2.00,R,HIGH,gyrA,A90V,90.0,gyrA_A90V
site.11.subj.XTB-18-164.lab.XTB-18-164.iso.1,MXF,>4,3.00,R,MEDIUM,gyrA,D94N,94.0,gyrA_D94N
site.04.subj.00121.lab.633619.iso.1,MXF,2.0,1.00,R,HIGH,gyrA,D94G,94.0,gyrA_D94G
site.14.subj.2591.lab.2591.iso.1,MXF,>4,3.00,R,HIGH,gyrA,D94N,94.0,gyrA_D94N
...,...,...,...,...,...,...,...,...,...
site.04.subj.01310.lab.724468.iso.1,MXF,1.0,0.00,S,HIGH,gyrA,S91P,91.0,gyrA_S91P
site.11.subj.XTB-18-186.lab.XTB-18-186.iso.1,MXF,2.0,1.00,R,MEDIUM,gyrA,A90V,90.0,gyrA_A90V
site.10.subj.UH01302806.lab.UH01302806.iso.1,MXF,2.0,1.00,R,HIGH,gyrA,A90V,90.0,gyrA_A90V
site.05.subj.CA-0020.lab.CO-05421-18.iso.1,MXF,0.12,-3.06,S,HIGH,gyrB,G594R,594.0,gyrB_G594R


Create features using sbml core

In [347]:
def make_segid(row):
    return row['GENE'][3:]

MXF_DATASET['segid'] = MXF_DATASET.apply(make_segid, axis = 1)
MXF_DATASET['mutation'] = MXF_DATASET['MUTATION']
MXF_DATASET.drop('MUTATION', axis = 1, inplace = True)

LEV_DATASET['segid'] = LEV_DATASET.apply(make_segid, axis = 1)
LEV_DATASET['mutation'] = LEV_DATASET['MUTATION']
LEV_DATASET.drop('MUTATION', axis = 1, inplace = True)

In [348]:
MXF_DATASET

Unnamed: 0_level_0,DRUG,MIC,LOG2MIC,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,GENE,POSITION,GENE_MUTATION,segid,mutation
UNIQUEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
site.04.subj.00097.lab.633433.iso.1,MXF,2.0,1.00,R,HIGH,gyrA,94.0,gyrA_D94G,A,D94G
site.10.subj.UH00935117.lab.UH00935117.iso.1,MXF,4.0,2.00,R,HIGH,gyrA,90.0,gyrA_A90V,A,A90V
site.11.subj.XTB-18-164.lab.XTB-18-164.iso.1,MXF,>4,3.00,R,MEDIUM,gyrA,94.0,gyrA_D94N,A,D94N
site.04.subj.00121.lab.633619.iso.1,MXF,2.0,1.00,R,HIGH,gyrA,94.0,gyrA_D94G,A,D94G
site.14.subj.2591.lab.2591.iso.1,MXF,>4,3.00,R,HIGH,gyrA,94.0,gyrA_D94N,A,D94N
...,...,...,...,...,...,...,...,...,...,...
site.04.subj.01310.lab.724468.iso.1,MXF,1.0,0.00,S,HIGH,gyrA,91.0,gyrA_S91P,A,S91P
site.11.subj.XTB-18-186.lab.XTB-18-186.iso.1,MXF,2.0,1.00,R,MEDIUM,gyrA,90.0,gyrA_A90V,A,A90V
site.10.subj.UH01302806.lab.UH01302806.iso.1,MXF,2.0,1.00,R,HIGH,gyrA,90.0,gyrA_A90V,A,A90V
site.05.subj.CA-0020.lab.CO-05421-18.iso.1,MXF,0.12,-3.06,S,HIGH,gyrB,594.0,gyrB_G594R,B,G594R


In [261]:
# def update_binary_phenotype(row):

#     target_string = 'gyrA_D94G'  # The string to filter rows
    
#     if row['GENE_MUTATION'] == target_string:
        
#         filtered_df = MXF_DATASET[MXF_DATASET['GENE_MUTATION'] == target_string]
#         most_common_value = filtered_df['BINARY_PHENOTYPE'].mode().iloc[0]

#         return most_common_value
#     else:
#         return row['BINARY_PHENOTYPE']
    
# MXF_DATASET['BINARY_PHENOTYPE'] = MXF_DATASET.apply(update_binary_phenotype, axis=1)

In [425]:
def update_binary_phenotype(row, target_strings):

    if row['GENE_MUTATION'] in target_strings:
        
        target_string = row['GENE_MUTATION']
        filtered_df = MXF_DATASET[MXF_DATASET['GENE_MUTATION'] == target_string]
        
        if len(filtered_df) < 2:
            
            return row['BINARY_PHENOTYPE']
        
        else:
            
            most_common_value = filtered_df['BINARY_PHENOTYPE'].mode().iloc[0]

            return most_common_value
    
    else:
        return row['BINARY_PHENOTYPE']

In [426]:
target_strings_MXF = MXF_DATASET.GENE_MUTATION.unique()
target_strings_LEV = LEV_DATASET.GENE_MUTATION.unique()
target_strings = np.union1d(target_strings_MXF, target_strings_LEV)

In [427]:
MXF_DATASET['BINARY_PHENOTYPE'] = MXF_DATASET.apply(lambda row: update_binary_phenotype(row, target_strings), axis=1)

In [428]:
LEV_DATASET['BINARY_PHENOTYPE'] = LEV_DATASET.apply(lambda row: update_binary_phenotype(row, target_strings), axis=1)

In [429]:
MXF_DATASET[MXF_DATASET['GENE_MUTATION'] == 'gyrA_D94G'].BINARY_PHENOTYPE.value_counts()

BINARY_PHENOTYPE
R    619
Name: count, dtype: int64

In [430]:
MXF_DATASET[MXF_DATASET['GENE_MUTATION'] == 'gyrA_D94G'].BINARY_PHENOTYPE.value_counts()

BINARY_PHENOTYPE
R    619
Name: count, dtype: int64

In [431]:
LEV_DATASET[LEV_DATASET['GENE_MUTATION'] == 'gyrA_D94Y'].BINARY_PHENOTYPE.value_counts()

BINARY_PHENOTYPE
R    58
Name: count, dtype: int64

In [432]:
MXF_DATASET[MXF_DATASET['GENE_MUTATION'] == 'gyrA_Q613E'].BINARY_PHENOTYPE

UNIQUEID
site.10.subj.YA00009954.lab.YA00009954.iso.1    S
site.20.subj.SA00329756.lab.YA00135259.iso.1    S
site.10.subj.YA00023537.lab.YA00023537.iso.1    S
site.10.subj.YA00106034.lab.YA00106034.iso.1    S
site.10.subj.YA00127803.lab.YA00127803.iso.1    S
                                               ..
site.10.subj.TRL0024265.lab.TRL0024265.iso.1    S
site.10.subj.YA00022119.lab.YA00022119.iso.1    S
site.10.subj.YA00089269.lab.YA00089269.iso.1    S
site.20.subj.SCH7908405.lab.YA00131558.iso.1    S
site.10.subj.YA00028138.lab.YA00028138.iso.1    S
Name: BINARY_PHENOTYPE, Length: 75, dtype: object

In [433]:
MXF_DATASET.GENE_MUTATION.value_counts()[:20]

GENE_MUTATION
gyrA_D94G     619
gyrA_A90V     336
gyrA_D94N     136
gyrB_P94L     131
gyrA_D94A      86
gyrA_Q613E     75
gyrA_S91P      65
gyrA_D94Y      60
gyrA_D94H      33
gyrA_G88C      22
gyrA_N193S     20
gyrB_E501D     15
gyrA_P472S     14
gyrA_T267I     12
gyrA_P8A       12
gyrB_D461N     11
gyrB_R446C     10
gyrA_P154R      9
gyrB_A130S      9
gyrB_I84V       8
Name: count, dtype: int64

For now, drop rows with mutations outside of crystallized regions, might impute values later on (segid A: 15-501, segid B: 425-675)

In [434]:
MXF_DATASET = MXF_DATASET[(MXF_DATASET.segid == 'A') & MXF_DATASET.POSITION.astype(int).isin(list(range(15,502))) 
            | (MXF_DATASET.segid == 'B') & MXF_DATASET.POSITION.astype(int).isin(list(range(425,676)))]

LEV_DATASET = LEV_DATASET[(LEV_DATASET.segid == 'A') & LEV_DATASET.POSITION.astype(int).isin(list(range(15,502))) 
            | (LEV_DATASET.segid == 'B') & LEV_DATASET.POSITION.astype(int).isin(list(range(425,676)))]

In [435]:
import MDAnalysis as mda

pdb = mda.Universe('data/5bs8.pdb')

FileNotFoundError: [Errno 2] No such file or directory: 'data/5bs8.pdb'

In [436]:
protein = pdb.select_atoms("protein")
protein
#protein.write("data/5bs8_atoms.pdb")

<AtomGroup with 23125 atoms>

In [437]:
snap2 = pd.read_csv('data/5bs8_SNAP2_complete.csv')
snap2

Unnamed: 0,Variant,Predicted Effect,Score,Expected Accuracy,segid
0,T1A,effect,25,63%,A
1,T1R,effect,57,75%,A
2,T1N,neutral,-80,87%,A
3,T1D,effect,21,63%,A
4,T1C,effect,13,59%,A
...,...,...,...,...,...
15115,V253S,effect,52,75%,B
15116,V253T,effect,30,66%,B
15117,V253W,effect,66,80%,B
15118,V253Y,effect,56,75%,B


In [438]:
def correct_position(row):
    
    if row.segid == 'A':
        increase_by = 1
    else:
        increase_by = 422

    letter1 = row.Variant[0]
    letter2 = row.Variant[-1]  # Extract the letters (all characters except the last one)
    number = int(row.Variant[1:-1])  # Extract the number and convert it to an integer
    updated_number = number + increase_by  # Increase the number by the specified amount
    updated_string = f'{letter1}{updated_number}{letter2}'  # Create the updated string

    return updated_string

In [439]:
snap2['Variant'] = snap2.apply(correct_position, axis = 1)
snap2.to_csv('data/5bs8_SNAP2_complete_clean.csv', index=False)

In [440]:
data_dict = {}
names = ['MXF_df', 'LEV_df']
i=0

for dataset in [MXF_DATASET, LEV_DATASET]:

    sbml_dataset = dataset[['segid', 'mutation']]
    sbml_dataset.drop_duplicates(subset='mutation', keep='first', inplace=True)

    features = sbmlcore.FeatureDataset(sbml_dataset, species='M. tuberculosis', protein = 'DNA Gyrase')

    a = sbmlcore.AminoAcidVolumeChange()
    b1 = sbmlcore.AminoAcidHydropathyChangeKyteDoolittle()
    b2 = sbmlcore.AminoAcidHydropathyChangeWimleyWhite()
    c = sbmlcore.AminoAcidMWChange()
    d = sbmlcore.AminoAcidPiChange()
    e = sbmlcore.Stride('data/pdb/5bs8_atoms.pdb')
    f = sbmlcore.FreeSASA('data/pdb/5bs8_atoms.pdb')
    g = sbmlcore.SNAP2("data/5bs8_SNAP2_complete_clean.csv", offsets = {'A': 0, 'B': 0})
    h1 = sbmlcore.StructuralDistances('data/pdb/5bs8_no_PTR_su_AB_MGB.pdb', distance_selection="resname MG", distance_name='dist_MGB')
    h2 = sbmlcore.StructuralDistances('data/pdb/5bs8_no_PTR_su_AB_MGE.pdb', distance_selection="resname MG", distance_name='dist_MGE')
    # h3 = sbmlcore.StructuralDistances('data/pdb/5bs8_no_PTR_su_AB_MGE.pdb', distance_selection="resname MG", distance_name='dist_MGE')
    # h4 = sbmlcore.StructuralDistances('data/pdb/5bs8_no_PTR_su_AB_MGF.pdb', distance_selection="resname MG", distance_name='dist_MGF')
    k = sbmlcore.StructuralDistances('data/pdb/5bs8_no_PTR_su_AB_one_drug.pdb', distance_selection="resname MFX", distance_name='dist_MFX')

    features.add_feature([a, b1, b2, c, d, e, f, g, h1, h2, k])
    features = features.df

    dataset.reset_index(inplace = True)

    variable_name = names[i]
    i = i+1
    
    data_dict[variable_name] = pd.merge(dataset, features, how='left', on=['segid','mutation'])

In [441]:
df = data_dict['MXF_df']
df.columns

Index(['UNIQUEID', 'DRUG', 'MIC', 'LOG2MIC', 'BINARY_PHENOTYPE',
       'PHENOTYPE_QUALITY', 'GENE', 'POSITION', 'GENE_MUTATION', 'segid',
       'mutation', 'd_volume', 'd_hydropathy_KD', 'd_hydropathy_WW', 'd_MW',
       'd_Pi', 'secondary_structure', 'secondary_structure_long', 'phi', 'psi',
       'residue_sasa', 'n_hbond_acceptors', 'n_hbond_donors', 'B', 'C', 'E',
       'G', 'H', 'T', 'SASA', 'snap2_score', 'snap2_accuracy', 'dist_MGB',
       'dist_MGE', 'dist_MFX'],
      dtype='object')

In [442]:
df.GENE_MUTATION.unique().shape

(152,)

In [443]:
g = sbmlcore.TempFactors('data/5bs8.pdb')
g

AssertionError: File does not exist!

Further features to be added soon (SNAP2, DeepDG, StructuralDistances)

In [444]:
for dataset in ['LEV_df', 'MXF_df']:

    df = data_dict[dataset]
    print(df.info())
    
    name = f'data/ML_DATA_{dataset[:3]}.csv'
    df.to_csv(name, index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1735 entries, 0 to 1734
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   UNIQUEID                  1735 non-null   object  
 1   DRUG                      1735 non-null   category
 2   MIC                       1722 non-null   category
 3   LOG2MIC                   1722 non-null   float64 
 4   BINARY_PHENOTYPE          1734 non-null   object  
 5   PHENOTYPE_QUALITY         1735 non-null   category
 6   GENE                      1735 non-null   object  
 7   POSITION                  1735 non-null   float64 
 8   GENE_MUTATION             1735 non-null   object  
 9   segid                     1735 non-null   object  
 10  mutation                  1735 non-null   object  
 11  d_volume                  1735 non-null   float64 
 12  d_hydropathy_KD           1735 non-null   float64 
 13  d_hydropathy_WW           1735 non-null   float6

In [445]:
df

Unnamed: 0,UNIQUEID,DRUG,MIC,LOG2MIC,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,GENE,POSITION,GENE_MUTATION,segid,...,E,G,H,T,SASA,snap2_score,snap2_accuracy,dist_MGB,dist_MGE,dist_MFX
0,site.04.subj.00097.lab.633433.iso.1,MXF,2.0,1.00,R,HIGH,gyrA,94.0,gyrA_D94G,A,...,False,False,True,False,88.155089,63,80,18.494689,7.838828,14.503318
1,site.10.subj.UH00935117.lab.UH00935117.iso.1,MXF,4.0,2.00,R,HIGH,gyrA,90.0,gyrA_A90V,A,...,False,False,True,False,55.555816,-64,82,14.434102,5.483332,10.609526
2,site.11.subj.XTB-18-164.lab.XTB-18-164.iso.1,MXF,>4,3.00,R,MEDIUM,gyrA,94.0,gyrA_D94N,A,...,False,False,True,False,88.155089,56,75,18.494689,7.838828,14.503318
3,site.04.subj.00121.lab.633619.iso.1,MXF,2.0,1.00,R,HIGH,gyrA,94.0,gyrA_D94G,A,...,False,False,True,False,88.155089,63,80,18.494689,7.838828,14.503318
4,site.14.subj.2591.lab.2591.iso.1,MXF,>4,3.00,R,HIGH,gyrA,94.0,gyrA_D94N,A,...,False,False,True,False,88.155089,56,75,18.494689,7.838828,14.503318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1664,site.04.subj.01310.lab.724468.iso.1,MXF,1.0,0.00,R,HIGH,gyrA,91.0,gyrA_S91P,A,...,False,False,True,False,65.228155,63,80,13.712391,5.381126,11.010229
1665,site.11.subj.XTB-18-186.lab.XTB-18-186.iso.1,MXF,2.0,1.00,R,MEDIUM,gyrA,90.0,gyrA_A90V,A,...,False,False,True,False,55.555816,-64,82,14.434102,5.483332,10.609526
1666,site.10.subj.UH01302806.lab.UH01302806.iso.1,MXF,2.0,1.00,R,HIGH,gyrA,90.0,gyrA_A90V,A,...,False,False,True,False,55.555816,-64,82,14.434102,5.483332,10.609526
1667,site.05.subj.CA-0020.lab.CO-05421-18.iso.1,MXF,0.12,-3.06,S,HIGH,gyrB,594.0,gyrB_G594R,B,...,False,False,True,False,2.551348,33,66,29.300727,43.081517,41.932227
