In [201]:
%%capture
!pip install catboost

In [278]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.linear_model import LinearRegression
import shap
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [279]:
# Load the first input table into df_small. The path is absolute, so the CSV
# must already exist under /content (presumably a Colab session — adjust elsewhere).
df_small = pd.read_csv('/content/ko_matrix_merged_3056rows.csv')

  df_small = pd.read_csv('/content/ko_matrix_merged_3056rows.csv')


In [None]:
# Load the second input table into df_medium (same absolute /content location).
df_medium = pd.read_csv('/content/path_matrix_merged_3056rows.csv')

In [None]:
# Load the third input table into df_big. Its filename advertises 105249 columns,
# so this read is presumably slow and memory-heavy — TODO confirm.
df_big = pd.read_csv('/content/gene_matrix_merged_3056rows_105249cols.csv')

In [None]:
def _drop_sparse_columns(frame, columns, threshold):
  """Return `frame` without those `columns` whose share of NaN exceeds `threshold`."""
  # isna().mean() is the fraction of missing values. For an empty frame it is
  # NaN, and `NaN > threshold` is False, so nothing is dropped and no
  # ZeroDivisionError is raised.
  too_sparse = [col for col in columns if frame[col].isna().mean() > threshold]
  return frame.drop(columns=too_sparse)


def drop_columns(df, missing_threshold=0.5):
  """Split `df` by measurement method and prune columns from each part.

  Rows whose 'method' is one of the MIC-family labels go to the first frame,
  rows whose 'method' is 'ZOI' go to the second one. In each part,
  'Concentration of precursor (mM)', 'hydrodynamic diameter' and
  'pH during synthesis' are removed independently when more than
  `missing_threshold` of their values are missing. 'np' is always removed,
  and the MIC part additionally loses 'concentration for ZOI (µg/ml)'.

  Args:
    df: input table; must contain 'method' and every column named above.
    missing_threshold: NaN fraction above which an optional column is dropped.

  Returns:
    (mic, zoi): two new DataFrames; `df` itself is left unmodified.

  Raises:
    KeyError: if one of the referenced columns is absent from `df`.
  """
  mic_methods = ['MIC', 'MBC', 'MBEC', 'MBIC', 'MIc', 'MFC', 'MMC']
  optional_cols = [
    'Concentration of precursor (mM)',
    'hydrodynamic diameter',
    'pH during synthesis',
  ]

  mic = df[df['method'].isin(mic_methods)]
  zoi = df[df['method'] == 'ZOI']

  mic = _drop_sparse_columns(mic, optional_cols, missing_threshold)
  zoi = _drop_sparse_columns(zoi, optional_cols, missing_threshold)

  # The original called mic.drop(...) for the ZOI concentration column without
  # inplace/assignment, so the column silently survived; assign the result.
  mic = mic.drop(columns=['np', 'concentration for ZOI (µg/ml)'])
  zoi = zoi.drop(columns=['np'])

  return mic, zoi

In [None]:
%%capture
# Split df_small into its MIC-family rows and its ZOI rows (see drop_columns).
mic_clear, zoi_clear = drop_columns(df_small)

**Preprocess mic and zoi**

In [None]:
# Per-column NaN counts of mic_clear, restricted to columns that have any.
mic_clear.isna().sum().loc[lambda na_counts: na_counts > 0]

In [None]:
def _fill_with_mode(series):
  """Fill NaNs in `series` with its most frequent value; no-op if it has none."""
  modes = series.mode()
  if modes.empty:
    # All values are missing: there is no mode to impute with.
    return series
  return series.fillna(modes.iloc[0])


def clear_dataset(mic_clear):
  """Return a cleaned copy of a frame produced by `drop_columns`.

  Steps:
    1. Remove rows in which min, max and average particle size are all missing.
    2. Remove the 'strain', 'zeta_potential (mV)' and bookkeeping columns.
    3. Fill missing min/max size from the average, then a missing average
       from the midpoint of min and max.
    4. Impute 'shape', 'Solvent for extract', 'Temperature for extract, C'
       and 'Precursor of NP' with the column mode, and 'time_set (hours)' and
       'Duration preparing extract, min' with the column mean.
    5. Remove rows without 'bac_type'.

  'Concentration of precursor (mM)' is not imputed.

  Args:
    mic_clear: input DataFrame (MIC or ZOI subset). It is not modified.

  Returns:
    A new, cleaned DataFrame.

  Raises:
    KeyError: if a column needed for imputation or row filtering is absent.
  """
  size_min = 'np_size_min (nm)'
  size_max = 'np_size_max (nm)'
  size_avg = 'np_size_avg (nm)'
  unused_cols = [
    'strain', 'zeta_potential (mV)', 'Strain', 'Unnamed: 44', 'Clade',
    'accept/reject', 'comment', 'entry_status', 'has_mistake_in_matadata',
    'has_mistake_in_data', 'verification_date', 'verified_by',
  ]

  # Rows lacking every size value can be removed up front: the size imputation
  # below can never fill a row in which all three values are missing.
  # errors='ignore' keeps the function re-runnable on an already cleaned frame.
  cleaned = (
    mic_clear
    .dropna(subset=[size_min, size_max, size_avg], how='all')
    .drop(columns=unused_cols, errors='ignore')
  )

  # Assign fillna results back instead of Series.fillna(inplace=True): the
  # chained in-place form is deprecated and does nothing under Copy-on-Write.
  cleaned[size_min] = cleaned[size_min].fillna(cleaned[size_avg])
  cleaned[size_max] = cleaned[size_max].fillna(cleaned[size_avg])
  # The midpoint is NaN unless both bounds are known, so only such rows change.
  cleaned[size_avg] = cleaned[size_avg].fillna((cleaned[size_min] + cleaned[size_max]) / 2)

  for col in ('shape', 'Solvent for extract', 'Temperature for extract, C', 'Precursor of NP'):
    cleaned[col] = _fill_with_mode(cleaned[col])
  for col in ('time_set (hours)', 'Duration preparing extract, min'):
    cleaned[col] = cleaned[col].fillna(cleaned[col].mean())

  return cleaned.dropna(subset=['bac_type'])

In [None]:
%%capture
# Clean the MIC subset; the result replaces the frame returned by drop_columns.
mic_clear = clear_dataset(mic_clear)

In [None]:
%%capture
# Apply the same cleaning to the ZOI subset (clear_dataset is not MIC-specific
# despite its parameter name).
zoi_clear = clear_dataset(zoi_clear)

In [None]:
%%capture
# The *_med frames must come from df_medium; the original passed df_small,
# which only duplicated mic_clear / zoi_clear and left df_medium unused.
mic_med, zoi_med = drop_columns(df_medium)

In [None]:
%%capture
# Clean mic_med the same way as mic_clear.
# NOTE(review): no matching clear_dataset(zoi_med) cell is visible in this
# notebook, unlike zoi_clear and zoi_big — confirm whether that is intentional.
mic_med = clear_dataset(mic_med)

In [None]:
%%capture
# Split df_big into its MIC-family rows and its ZOI rows (see drop_columns).
mic_big, zoi_big = drop_columns(df_big)

In [None]:
%%capture
# Clean the MIC subset of df_big; the result replaces the uncleaned frame.
mic_big = clear_dataset(mic_big)

In [None]:
%%capture
# Clean the ZOI subset of df_big; the result replaces the uncleaned frame.
zoi_big = clear_dataset(zoi_big)

In [None]:
def create_subsets(df, target = 'MIC', random_state = 42):
  """Build feature/label train and validation splits for one target.

  Args:
    df: table holding the target column plus feature and bookkeeping columns.
    target: 'MIC' (label 'MIC_NP (µg/mL)') or 'ZOI' (label 'zoi_np (mm)').
    random_state: seed forwarded to train_test_split so the split is
      reproducible; pass None for a different split on every call.

  Returns:
    (x_train, x_val, y_train, y_val), i.e. exactly the order in which
    train_test_split returns its four parts.

  Raises:
    KeyError: if `target` is unknown or its label column is missing from `df`.

  NOTE(review): rows with a missing label are not removed here and the label
  column of the other target stays among the features — confirm that callers
  handle both.
  """
  target_cols = {'MIC':'MIC_NP (µg/mL)', 'ZOI': 'zoi_np (mm)'}
  non_feature_cols = ['reference', 'doi', 'article_list', 'journal_name',
                      'publisher', 'year', 'title', 'journal_is_oa', 'is_oa',
                      'oa_status', 'verification required','verified_by', 'verification_date',
                      'has_mistake_in_data','has_mistake_in_matadata', 'entry_status', 'comment',
                      'accept/reject', 'Unnamed: 44']
  if target == 'MIC':
    non_feature_cols.append('concentration for ZOI (µg/ml)')

  label_col = target_cols[target]
  y = df[label_col]
  # Several bookkeeping columns are already removed by clear_dataset, so a
  # strict drop raises KeyError on cleaned frames; skip whatever is absent.
  X = df.drop(columns = [label_col] + non_feature_cols, errors = 'ignore')

  # train_test_split returns X_train, X_test, y_train, y_test in that order.
  x_train, x_val, y_train, y_val = train_test_split(X, y, random_state = random_state)
  return x_train, x_val, y_train, y_val

In [None]:
def _regression_metrics(y_true, y_pred):
    """Return MAE, MSE, RMSE and R2 of `y_pred` against `y_true` as a dict."""
    mse = mean_squared_error(y_true, y_pred)
    return {
        'mae': mean_absolute_error(y_true, y_pred),
        'mse': mse,
        'rmse': np.sqrt(mse),
        'r2': r2_score(y_true, y_pred)
    }


def pipeline_train_feat(df, target='MIC'):
    """Train CatBoost and linear-regression models on `df` and save importance plots.

    The frame is split with `create_subsets(df, target)`. CatBoost is fitted on
    all features, treating the known categorical columns that are present as
    categorical; the linear model is fitted on the remaining columns only.

    Side effects: writes 'beeswarm_plot_cat.png', 'feature_importance.png' and
    'beeswarm_plot_lin.png' to the current working directory and prints
    progress messages.

    Args:
        df: cleaned data frame containing the label column for `target`.
        target: 'MIC' or 'ZOI'; forwarded to `create_subsets`. Defaults to
            'MIC', which is what the function always used before.

    Returns:
        {'catboost': {...}, 'linear_regression': {...}}, each inner dict holding
        'mae', 'mse', 'rmse' and 'r2' computed on the validation split.
    """
    model_cat = CatBoostRegressor(iterations=100, verbose=False, random_state=42)
    model_lin = LinearRegression()

    x_train, x_val, y_train, y_val = create_subsets(df, target)

    cat_features = [
        'np', 'bacteria', 'strain', 'np_synthesis', 'method', 'shape',
        'Solvent for extract', 'Precursor of NP', 'Bacteria', 'Strain',
        'Superkingdom', 'Kingdom', 'Clade', 'Phylum', 'Class', 'Order',
        'Family', 'Genus', 'Species', 'bac_type'
    ]

    # Earlier cleaning steps remove some of these columns, so keep only those present.
    cat_features_exist = [f for f in cat_features if f in x_train.columns]

    train_pool = Pool(data=x_train, label=y_train, cat_features=cat_features_exist)

    print('Data prepared. Training models...')

    model_cat.fit(train_pool)
    y_pred_cat = model_cat.predict(x_val)
    metrics_cat = _regression_metrics(y_val, y_pred_cat)

    # NOTE(review): the linear model receives every non-categorical column as is;
    # this assumes they are all numeric and free of NaN — confirm upstream.
    x_train_lin = x_train.drop(columns=cat_features_exist)
    x_val_lin = x_val.drop(columns=cat_features_exist)
    model_lin.fit(x_train_lin, y_train)
    y_pred_lin = model_lin.predict(x_val_lin)
    metrics_lin = _regression_metrics(y_val, y_pred_lin)

    print('Calculating and saving feature importances...')
    # SHAP values are computed over train and validation rows together.
    X = pd.concat([x_train, x_val])

    shap_cat = shap.TreeExplainer(model_cat)
    shap_values_cat = shap_cat(X)
    shap.plots.beeswarm(shap_values_cat, show=False)
    plt.savefig("beeswarm_plot_cat.png", bbox_inches="tight")
    plt.close()

    importance = model_cat.get_feature_importance()
    features = model_cat.feature_names_
    plt.barh(features, importance)
    plt.title("Feature Importance (CatBoost)")
    plt.savefig("feature_importance.png", bbox_inches="tight")
    plt.close()

    X_lin = X.drop(columns=cat_features_exist)
    explainer_lin = shap.LinearExplainer(model_lin, X_lin)
    shap_values_lin = explainer_lin(X_lin)
    shap.plots.beeswarm(shap_values_lin, show=False)
    plt.savefig("beeswarm_plot_lin.png", bbox_inches="tight")
    plt.close()

    print('Done.')
    return {'catboost': metrics_cat, 'linear_regression': metrics_lin}

In [None]:
# NOTE(review): zoi_clear holds the rows whose method is 'ZOI', yet this call
# ends up training on the default 'MIC' target ('MIC_NP (µg/mL)') — confirm
# that this is the intended label for this frame.
pipeline_train_feat(zoi_clear)