In [1]:
%%capture
!pip install catboost

In [50]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.linear_model import LinearRegression
import shap
import matplotlib.pyplot as plt
import random

In [3]:
# Load the bacteria list. Paths are absolute and assume the files sit under /content.
df_bacts = pd.read_csv('/content/bacteriai_list_400.csv')

# Feature tables: KEGG KO matrix, KEGG pathway matrix and gene presence/absence matrix.
# NOTE(review): "small"/"medium"/"big" presumably refer to the number of feature columns — confirm.
df_small = pd.read_csv('/content/kegg_ko_matrix_july.csv')
df_medium = pd.read_csv('/content/kegg_pathway_matrix_july.csv')
df_big = pd.read_csv('/content/gene_presence_absence_july.csv')

In [4]:
# Remove the 'Unnamed: 0' column (presumably an index saved into the CSV — confirm).
# Reassigning with errors='ignore' instead of inplace=True keeps this cell safe to re-run:
# a second execution no longer raises KeyError because the column is already gone.
df_bacts = df_bacts.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Load the nanoparticle MIC dataset from the Excel workbook (absolute path under /content).
df_all = pd.read_excel('/content/all_seltox_np_MIC_dataset.xlsx')

In [6]:
# Count missing values per column of the raw dataset.
df_all.isna().sum()

Unnamed: 0,0
sn,0
np,0
coating,0
bacteria,0
mdr,0
strain,739
np_synthesis,0
method,0
MIC_NP(μg/ml),0
np_size_min (nm),204


In [7]:
# Columns excluded from the training table: nanoparticle size/shape and synthesis details
# plus bibliographic fields. They are removed from df_all when df_train is built.
columns_to_drop = ['np_size_min (nm)', 'np_size_max (nm)', 'np_size_avg (nm)',
                   'shape', 'time_set', 'zeta_potential', 'reference', 'doi',
                   'Solvent for extract', 'Temperature for extract, C', 'Duration preparing extract, min',
                   'Precursor of NP', 'Concentration of precursor (mM)', 'hydrodynamic diameter',
                   'pH during synthesis']

In [8]:
# Build the training table without the excluded columns; the explicit deep copy
# keeps df_train independent of df_all.
df_train = df_all.drop(columns = columns_to_drop).copy(deep=True)

In [9]:
# Build the "<bacteria> <strain>" lookup key. A missing strain becomes the literal text
# "nan", which is the form the keys take in df_small's 'Unnamed: 0' column
# (e.g. 'Citrobacter freundii nan').
df_train['Name_Strain'] = df_train['bacteria'] + " " + df_train['strain'].fillna("nan")

In [10]:
# Inspect the constructed key column.
df_train['Name_Strain']

Unnamed: 0,Name_Strain
0,Enterococcus faecalis nan
1,Escherichia coli nan
2,Proteus vulgaris nan
3,Salmonella typhi nan
4,Staphylococcus aureus nan
...,...
1709,Staphylococcus aureus ATCC 29213
1710,Escherichia coli MTCC 443
1711,Escherichia coli MTCC 739
1712,Escherichia coli MTCC 1302


In [11]:
# List the columns that remain in the training table.
df_train.columns

Index(['sn', 'np', 'coating', 'bacteria', 'mdr', 'strain', 'np_synthesis',
       'method', 'MIC_NP(μg/ml)', 'Name_Strain'],
      dtype='object')

In [12]:
# Look at the strain value of the first row in the bacteria list.
df_bacts['strain'].iloc[0]

nan

In [13]:
# Display the training table, including the new Name_Strain key.
df_train

Unnamed: 0,sn,np,coating,bacteria,mdr,strain,np_synthesis,method,MIC_NP(μg/ml),Name_Strain
0,1,Ag,0,Enterococcus faecalis,0,,green_synthesis,MIC,32,Enterococcus faecalis nan
1,2,Ag,0,Escherichia coli,0,,green_synthesis,MIC,64,Escherichia coli nan
2,3,Ag,0,Proteus vulgaris,0,,green_synthesis,MIC,16,Proteus vulgaris nan
3,4,Ag,0,Salmonella typhi,0,,green_synthesis,MIC,32,Salmonella typhi nan
4,5,Ag,0,Staphylococcus aureus,0,,green_synthesis,MIC,4,Staphylococcus aureus nan
...,...,...,...,...,...,...,...,...,...,...
1709,2111,Ag,0,Staphylococcus aureus,0,ATCC 29213,chemical_synthesis purchased from US Research ...,MIC,256,Staphylococcus aureus ATCC 29213
1710,2112,Ag,0,Escherichia coli,0,MTCC 443,chemical_synthesis using sodium borohydride an...,MIC,40,Escherichia coli MTCC 443
1711,2113,Ag,0,Escherichia coli,0,MTCC 739,chemical_synthesis using sodium borohydride an...,MIC,180,Escherichia coli MTCC 739
1712,2114,Ag,0,Escherichia coli,0,MTCC 1302,chemical_synthesis using sodium borohydride an...,MIC,120,Escherichia coli MTCC 1302


In [14]:
# Display the KEGG KO feature matrix; its 'Unnamed: 0' column holds the name/strain keys.
df_small

Unnamed: 0.1,Unnamed: 0,K11309,K09786,K17497,K11038,K09802,K09157,K23064,K09497,K03559,...,K25495,K05769,K00413,K01200,K26735,K21025,K10122,K02850,K20956,K07155
0,Citrobacter freundii nan,0,1,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,Klebsiella pneumoniae ATCC 70063,0,1,0,0,1,0,0,0,0,...,1,0,0,1,1,0,0,0,0,0
2,Mycobacterium smegmatis nan,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Pseudomonas aeruginosa RME 27,0,1,0,0,0,0,1,0,1,...,0,0,1,0,1,1,0,0,0,0
4,Enterococcus faecium DSM 13590,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347,Campylobacter jejuni 2674,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
348,Staphylococcus aureus QAU Islambad 9861,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
349,Bacillus cereus nan,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
350,Vibrio vulnificus MTCC 1146,0,1,0,0,1,0,0,0,1,...,0,0,1,1,0,0,0,0,1,0


In [15]:
# Collect the training keys that have no row in the KEGG KO matrix.
# The known keys are gathered once into a set, so each membership test is a hash
# lookup instead of recomputing .unique() and scanning it on every iteration.
known_names = set(df_small['Unnamed: 0'])
not_eq = [name for name in df_train['Name_Strain'].unique() if name not in known_names]

In [16]:
# Number of distinct training keys that have no match in df_small.
len(not_eq)

112

In [17]:
# Example of an unmatched key.
not_eq[0]

'Salmonella typhi nan'

In [18]:
def _parse_mic_value(value):
  """Convert one raw MIC entry to a number.

  Non-string values are returned unchanged. A string may carry a leading
  comparison sign (for example '>128' or '<0.5'); the sign is discarded and the
  remaining text is parsed as an int when possible, otherwise as a float.

  Raises:
    ValueError: if the text left after removing the sign is not numeric.
  """
  if not isinstance(value, str):
    return value
  text = value.strip().lstrip('<>=≤≥').strip()
  try:
    return int(text)
  except ValueError:
    # Decimal strings such as '0.5' are not accepted by int().
    return float(text)


def merge_two_df(df_t, df_spec):
  """Attach the feature columns of `df_spec` to the matching rows of `df_t`.

  Args:
    df_t: training table. Must contain 'Name_Strain', 'MIC_NP(μg/ml)' and the
      identifier columns that are dropped below ('strain', 'bacteria', 'sn',
      'np', 'method').
    df_spec: feature table whose 'Unnamed: 0' column holds the keys that are
      matched against `df_t['Name_Strain']`.

  Returns:
    A new DataFrame holding only the rows of `df_t` whose key occurs in
    `df_spec`, joined with the `df_spec` columns, without the identifier/key
    columns, and with 'MIC_NP(μg/ml)' converted to numbers. Bounded entries
    such as '>128' are replaced by the bound itself (128).

  Raises:
    KeyError: if one of the expected columns is missing.
    ValueError: if a MIC string is not numeric after removing a leading sign.
  """
  known_keys = df_spec['Unnamed: 0'].unique()
  # Filtering first makes the left join below behave like an inner join that
  # keeps the row order of df_t.
  entry = df_t[df_t['Name_Strain'].isin(known_keys)]
  # NOTE(review): a key that appears more than once in df_spec duplicates the
  # matching training rows — confirm the keys are unique.
  df_fin = entry.merge(df_spec, how='left', left_on='Name_Strain', right_on='Unnamed: 0')
  df_fin = df_fin.drop(columns=['strain', 'bacteria', 'Unnamed: 0', 'sn', 'np', 'method', 'Name_Strain'])
  # The previous parser used x.split('>')[1], which raised IndexError for every
  # string without '>' (e.g. '64' or '<0.5') and rejected decimal strings.
  df_fin['MIC_NP(μg/ml)'] = df_fin['MIC_NP(μg/ml)'].apply(_parse_mic_value)
  return df_fin

In [19]:
# Join the training rows with the KEGG KO features.
res_df_small = merge_two_df(df_t=df_train, df_spec=df_small)

In [20]:
# Display the merged modelling table.
res_df_small

Unnamed: 0,coating,mdr,np_synthesis,MIC_NP(μg/ml),K11309,K09786,K17497,K11038,K09802,K09157,...,K25495,K05769,K00413,K01200,K26735,K21025,K10122,K02850,K20956,K07155
0,0,0,green_synthesis,32.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,green_synthesis,64.0,0,1,0,0,1,0,...,1,0,0,0,1,0,0,1,0,0
2,0,0,green_synthesis,16.0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,green_synthesis,4.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,green_synthesis,16.0,0,1,0,0,1,0,...,0,0,1,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1479,0,0,chemical_synthesis purchased from US Research ...,256.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1480,0,0,chemical_synthesis using sodium borohydride an...,40.0,0,1,0,0,1,0,...,1,0,0,0,1,0,0,1,0,0
1481,0,0,chemical_synthesis using sodium borohydride an...,180.0,0,1,0,0,1,0,...,1,0,0,0,1,0,0,1,0,0
1482,0,0,chemical_synthesis using sodium borohydride an...,120.0,0,1,0,0,1,0,...,1,0,0,0,1,0,0,1,0,0


In [30]:
def create_datas(df):
  """Split a modelling table into features and target.

  Args:
    df: DataFrame that contains the column 'MIC_NP(μg/ml)'.

  Returns:
    A tuple (X, y): X is a copy of `df` without the target column and y is the
    target column itself.
  """
  target_column = 'MIC_NP(μg/ml)'
  labels = df[target_column]
  features = df.drop(columns=target_column)
  return features, labels

In [31]:
# Split the merged table into the feature frame X and the MIC target y.
X, y = create_datas(res_df_small)

In [47]:
def _save_and_close_figure(path):
  """Write the current matplotlib figure to `path` and close it."""
  # bbox_inches="tight" is applied to every plot so long feature labels are
  # not clipped; previously only the bar chart used it.
  plt.savefig(path, bbox_inches="tight")
  plt.close()


def pipeline_train_feat(X, y, output_prefix=""):
  """Fit a CatBoost and a linear regressor on (X, y) and save importance plots.

  Args:
    X: feature DataFrame. It must contain the column 'np_synthesis', which is
      given to CatBoost as a categorical feature and removed before the linear
      model is fitted.
      NOTE(review): the remaining columns are assumed to be numeric — confirm.
    y: target values for the rows of X.
    output_prefix: text prepended to every output file name (it may include a
      directory). The default "" keeps the original file names. Pass a distinct
      prefix per run so that repeated calls do not overwrite earlier plots.

  Side effects:
    Prints progress messages and writes three PNG files:
    '<prefix>beeswarm_plot_cat.png', '<prefix>feature_importance.png' and
    '<prefix>beeswarm_plot_lin.png'.

  Returns:
    None.
  """
  model_cat = CatBoostRegressor(iterations=100, verbose=False)
  model_lin = LinearRegression()
  data_cat = Pool(data=X, label=y, cat_features=['np_synthesis'])
  X_lin = X.drop(columns='np_synthesis')
  print('Data prepared')

  # CatBoost: SHAP beeswarm plot computed on the training data itself.
  model_cat.fit(data_cat)
  shap_cat = shap.TreeExplainer(model_cat)
  shap_values_cat = shap_cat(X)
  print('Saving feature importance for catboost')
  shap.plots.beeswarm(shap_values_cat, show=False)
  _save_and_close_figure(f"{output_prefix}beeswarm_plot_cat.png")

  # CatBoost: built-in feature importance as a horizontal bar chart.
  importance = model_cat.get_feature_importance()
  features = model_cat.feature_names_

  plt.barh(features, importance)
  plt.title("Feature Importance (CatBoost)")
  _save_and_close_figure(f"{output_prefix}feature_importance.png")
  print('Saving base plot feature importance catboost')

  # Linear model: SHAP beeswarm plot, with X_lin also used as the background data.
  model_lin.fit(X_lin, y)
  explainer_lin = shap.LinearExplainer(model_lin, X_lin)
  shap_values_lin = explainer_lin(X_lin)
  print('Saving feature importance for linear model')
  shap.plots.beeswarm(shap_values_lin, show=False)
  _save_and_close_figure(f"{output_prefix}beeswarm_plot_lin.png")

In [48]:
# Run the pipeline on the original feature set; the plots are written to PNG files.
pipeline_train_feat(X, y)

Data prepared
Saving feature importance for catboost
Saving base plot feature importance catboost
Saving feature importance for linear model


**Random feature added as a noise baseline**

In [54]:
# Add a random binary column to X. A dedicated, seeded generator makes the column
# (and therefore the resulting plots) reproducible across kernel restarts.
rng = random.Random(42)
X['random_feature'] = [rng.choice([0, 1]) for _ in range(X.shape[0])]

In [None]:
# Re-run the pipeline with the random column included.
# Note: this call writes to the same PNG file names as the earlier run.
pipeline_train_feat(X, y)

Data prepared
Saving feature importance for catboost
