In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors 

In [10]:
# Load the three source tables; the first CSV column of each file becomes the row index.
df_data = pd.read_csv('data.csv', index_col=0)
# data.csv carries an extra 'Unnamed: 0' column in addition to the index
# (presumably an index saved by an earlier export - TODO confirm); drop it.
df_data = df_data.drop(columns=['Unnamed: 0'])

df_bac = pd.read_csv('bacterial_descriptors.csv', index_col=0)
df_drug = pd.read_csv('drug_descriptors.csv', index_col=0)

In [38]:
# Preview the first experiment row to check the column layout.
df_data.head(n=1)

Unnamed: 0,Bacteria,NP_Synthesis,Drug,Drug_class_drug_bank,Drug_dose,NP_concentration,NP size_min,NP size_max,NP size_avg,shape,method,ZOI_drug,ZOI_NP,ZOI_drug_NP,fold_increase_in_antibacterial_activity (%),MDR_check
0,Escherichia coli,chem_synthesis_reduction_by_D_maltose,Amoxicillin,Lactams,,,4.0,10.0,8.0,spherical,MIC,32+,6.3,32+,,1


In [33]:
# Preview the first drug record (name, ChEMBL id, preferred name, SMILES).
df_drug.head(n=1)

Unnamed: 0,drug,chemID,prefered_name,smiles
0,Amoxicillin,CHEMBL1082,AMOXICILLIN,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...


In [13]:
# Preview the first bacterial descriptor record.
df_bac.head(n=1)

Unnamed: 0_level_0,Bacteria,kingdom,subkingdom,clade,phylum,class,order,family,genus,species,gram,"min_Incub_period, h","avg_Incub_period, h","max_Incub_period, h","growth_temp, C",biosafety_level,isolated_from
Tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
['470'],Acinetobacter baumannii,Bacteria,,,Pseudomonadota,Gammaproteobacteria,Moraxellales,Moraxellaceae,Acinetobacter,Acinetobacter calcoaceticus/baumannii complex,n,24.0,36.0,48.0,37,2,urine


In [8]:
# Drug names used in the experiments that have no row in the drug descriptor table.
drugs_in_drugs = set(df_drug['drug'].unique())
drugs_in_data = set(df_data['Drug'].unique())
drugs_in_data - drugs_in_drugs

{'Neomycin', nan}

In [12]:
# Neomycin is used in the experiments but has no row in the drug descriptor table,
# so its record is added by hand instead of being fetched through the PubChem API.
Neomycin = {
    'drug': 'Neomycin',
    'smiles': 'C1[C@H]([C@@H]([C@H]([C@@H]([C@H]1N)O[C@@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CN)O)O)N)O[C@H]3[C@@H]([C@@H]([C@H](O3)CO)O[C@@H]4[C@@H]([C@H]([C@@H]([C@@H](O4)CN)O)O)N)O)O)N',
    'chemID': np.nan,
    'prefered_name': np.nan
}
row_to_append = pd.DataFrame(Neomycin, index=[0])
# Only prepend the row when it is not there yet: without this guard every re-run of
# the cell adds another Neomycin row, and the later merge on the drug name would
# then duplicate every Neomycin experiment.
if not df_drug['drug'].eq(Neomycin['drug']).any():
    df_drug = pd.concat([row_to_append, df_drug]).reset_index(drop=True)

# Merge all the tables into one

In [15]:
# Attach the bacterial descriptors to every experiment via the shared 'Bacteria' column.
# NOTE(review): how='outer' also keeps bacteria that have no experiment rows (they
# appear with NaN in all experiment columns) - confirm this is intended.
data_with_bac = df_data.merge(df_bac, on='Bacteria', how='outer')

In [16]:
# Attach the drug table: experiments name the drug in 'Drug', the drug table in 'drug'.
df = data_with_bac.merge(df_drug, left_on='Drug', right_on='drug', how='outer')
df.head(n=1)

Unnamed: 0,Bacteria,NP_Synthesis,Drug,Drug_class_drug_bank,Drug_dose,NP_concentration,NP size_min,NP size_max,NP size_avg,shape,...,"min_Incub_period, h","avg_Incub_period, h","max_Incub_period, h","growth_temp, C",biosafety_level,isolated_from,drug,smiles,chemID,prefered_name
0,Escherichia coli,chem_synthesis_reduction_by_D_maltose,Amoxicillin,Lactams,,,4.0,10.0,8.0,spherical,...,48.0,84.0,120.0,37.0,2.0,urine,Amoxicillin,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,CHEMBL1082,AMOXICILLIN


# Adding descriptors

In [44]:
from rdkit import RDLogger
# Turn off RDKit log output for every logger matching 'rdApp.*' so that
# descriptor calculation does not flood the cell output with messages.
RDLogger.DisableLog('rdApp.*')

import pandas as pd
from pandas import DataFrame

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors 


def add_desc_to_df(df: DataFrame, desc: list):
    '''
    Append RDKit descriptor columns computed from the ``smiles`` column.

    Parameters
    ----------
    df: DataFrame
        Frame with a ``smiles`` column. It is not modified. Its index labels
        are assumed to be unique, because the result is joined by index label.

    desc: list
        RDKit descriptor names (e.g. ``['LabuteASA', 'NumHDonors']``); one new
        column is created per name, in the given order.

    Returns
    -------
    DataFrame
        A new frame: ``df`` plus one column per descriptor. Rows whose SMILES
        is missing or cannot be parsed get NaN in every descriptor column.
    '''
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(desc)
    missing = (float('nan'),) * len(desc)

    def _describe(smiles):
        # Missing values (NaN / None) are not strings and cannot be parsed.
        if not isinstance(smiles, str):
            return missing
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles returns None when the SMILES string is invalid.
        if mol is None:
            return missing
        return calc.CalcDescriptors(mol)

    # Build the descriptor frame on df's own index: DataFrame.join aligns on
    # index labels, so a fresh 0..n-1 index would misalign the descriptors
    # whenever df has been filtered, sorted or re-indexed. Passing `columns`
    # here also keeps an empty df from failing on a column-count mismatch.
    desc_df = pd.DataFrame(
        [_describe(smiles) for smiles in df['smiles']],
        columns=desc,
        index=df.index,
    )
    return df.join(desc_df)


def fill_nan_desc(df: DataFrame, non_desc_cols: list = None, smiles_col: str = 'SMILES'):
    '''
    Fill NaN cells of descriptor columns with values calculated by RDKit.

    Every column other than ``smiles_col`` and ``non_desc_cols`` is treated as
    an RDKit descriptor whose name equals the column name. For each such
    column, the NaN cells are replaced by the descriptor computed from the
    SMILES string in the same row.

    Parameters
    ----------
    df: DataFrame
        Frame holding the SMILES column and the descriptor columns.
        It is modified in place.

    non_desc_cols: list, optional
        Columns of ``df`` that cannot be calculated via RDKit and must be left
        untouched. ``None`` (the default) means there are no such columns.

    smiles_col: str
        Name of the column holding the SMILES strings.

    Returns
    -------
    DataFrame
        The same ``df`` object that was passed in. Cells whose SMILES is
        missing or cannot be parsed stay NaN.
    '''
    cols_to_drop = [smiles_col]
    # non_desc_cols defaults to None, and list.extend(None) raises TypeError.
    if non_desc_cols:
        cols_to_drop.extend(non_desc_cols)

    for col in df.drop(columns=cols_to_drop).columns:
        mask = df[col].isna()
        # Nothing to fill: skip building a calculator for this column.
        if not mask.any():
            continue
        col_type = df[col].dtype
        calc = MoleculeDescriptors.MolecularDescriptorCalculator([col])

        def _value(smiles, calc=calc):
            # Missing values (NaN / None) are not strings and cannot be parsed.
            if not isinstance(smiles, str):
                return float('nan')
            mol = Chem.MolFromSmiles(smiles)
            # MolFromSmiles returns None when the SMILES string is invalid.
            if mol is None:
                return float('nan')
            return calc.CalcDescriptors(mol)[0]

        # Cast back so the filled values keep the column's original dtype.
        df.loc[mask, col] = df.loc[mask, smiles_col].map(_value).astype(col_type)
    return df

In [45]:
# RDKit descriptors to compute for every drug from its SMILES string.
drug_descriptor_names = ['LabuteASA', 'NumHDonors']
df_drug = add_desc_to_df(df_drug, drug_descriptor_names)

In [25]:
# Rebuild the combined table now that df_drug carries the descriptor columns.
df = data_with_bac.merge(df_drug, left_on='Drug', right_on='drug', how='outer')

In [29]:
# Rows with no drug: experiments recorded without a drug, plus the bacteria-only
# rows that the outer merge keeps for bacteria with no experiment.
df.loc[df['Drug'].isna()]

Unnamed: 0,Bacteria,NP_Synthesis,Drug,Drug_class_drug_bank,Drug_dose,NP_concentration,NP size_min,NP size_max,NP size_avg,shape,...,"min_Incub_period, h","avg_Incub_period, h","max_Incub_period, h","growth_temp, C",biosafety_level,isolated_from,drug,smiles,chemID,prefered_name
633,Escherichia coli,green_synthesis using extract of acinetobacter...,,,,,1.00,9.00,4.7,spherical,...,48.0,84.0,120.0,37.0,2.0,urine,,,,
634,Escherichia coli,green_synthesis using extract of acinetobacter...,,,,,1.00,9.00,4.7,spherical,...,48.0,84.0,120.0,37.0,2.0,urine,,,,
635,Escherichia coli,chemical_synthesis using sodium borohydride an...,,,,100,2.26,10.34,3.0,spherical,...,48.0,84.0,120.0,37.0,2.0,urine,,,,
636,Escherichia coli,chemical_synthesis using sodium borohydride an...,,,,100,2.26,10.34,3.0,spherical,...,48.0,84.0,120.0,37.0,2.0,urine,,,,
637,Escherichia coli,chemical_synthesis using sodium borohydride an...,,,,100,2.26,10.34,3.0,spherical,...,48.0,84.0,120.0,37.0,2.0,urine,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
898,Staphylococcus sp.,,,,,,,,,,...,1.0,3.5,6.0,37.0,2.0,skin,,,,
899,Streptococcus pyogenes,,,,,,,,,,...,24.0,36.0,48.0,37.0,2.0,mouth,,,,
900,Streptococcus viridans,,,,,,,,,,...,168.0,252.0,336.0,37.0,1.0,mouth,,,,
901,Trichoderma harzianum,,,,,,,,,,...,48.0,84.0,120.0,30.0,1.0,soil,,,,
