In [4]:
# import numpy as sns
# import pandas as np
# import seaborn as pd
# import matplotlib.pyplot as torch
# import catboost as plt

from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

from lazypredict.Supervised import LazyRegressor
from catboost import CatBoostRegressor
from sklearn.impute import KNNImputer
from sklearn.ensemble import ExtraTreesRegressor, VotingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split, LearningCurveDisplay, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import shuffle

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchsummary import summary
from torch.nn import functional as F
 
device = torch.device('cpu')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_theme()

from copy import deepcopy
import pickle

RANDOM_STATE = 42

In [5]:
def get_prediction(df: "pd.DataFrame | None" = None, choosen_model='ExtraTreesRegressor', show_features=False): # or CatBoostRegressor or VotingRegressor
    '''
    Predict with one of the pre-fitted models stored in ``fitted_models.pkl``.

    The feature columns are selected (in the stored order) from ``df``;
    text-like columns (object, pandas string or categorical dtype) are
    replaced by integer category codes before being passed to the model.

    Parameters
    -----------
    df: DataFrame or None, default=None
        Data to predict on. It must contain every column listed under
        ``fitted_models['features']`` (call with ``show_features=True`` to
        print them); extra columns are ignored. May be omitted only when
        ``show_features`` is True.
    choosen_model: str, default="ExtraTreesRegressor"
        Key of the model inside ``fitted_models.pkl``. Documented choices:
        ExtraTreesRegressor, CatBoostRegressor, VotingRegressor.
    show_features: bool, default=False
        If True, only print the required feature names and return None.

    Returns
    -------
    predicted_values
        Whatever ``model.predict`` returns for the encoded features, or
        None when ``show_features`` is True.

    Raises
    ------
    KeyError
        If ``choosen_model`` is not stored in ``fitted_models.pkl``.
    ValueError
        If ``df`` is None while a prediction is requested.
    '''

    # NOTE(review): unpickling can execute arbitrary code -- only load a
    # fitted_models.pkl that comes from a trusted source.
    # The context manager closes the file handle (the bare open() leaked it).
    with open('fitted_models.pkl', 'rb') as models_file:
        fitted_models = pickle.load(models_file)
    features = fitted_models['features']

    if show_features:
        print(features)
        return None

    if choosen_model not in fitted_models:
        available = [name for name in fitted_models if name != 'features']
        raise KeyError(
            f"Unknown model {choosen_model!r}; available models: {available}"
        )
    model = fitted_models[choosen_model]

    if df is None:
        raise ValueError(
            "df is required to make a prediction "
            "(pass show_features=True to list the expected columns)"
        )

    # Work on a copy so the caller's frame is never modified.
    df_codes = df[features].copy()

    # NOTE(review): the codes are derived from the values present in `df`
    # alone, so they match the codes seen during training only if `df`
    # contains the same set of categories -- confirm against the training code.
    for col in df_codes.columns:
        dtype = df_codes[col].dtype
        if dtype == object or isinstance(dtype, (pd.StringDtype, pd.CategoricalDtype)):
            df_codes[col] = pd.Categorical(df_codes[col]).codes

    return model.predict(df_codes)


In [37]:
# Функция, кодирующая категориальные фичи в df с помощью 1) нумерации классов 2) one-hot. Возвращает оба варианта

def code_and_onehot(df: pd.DataFrame, cols_to_drop: "list | None" = None):
    '''
    Encode the categorical features of df in two ways: integer category
    codes and one-hot columns.

    Parameters
    ----------
    df: pandas Dataframe
        Columns are feature values. The frame itself is not modified.
    cols_to_drop: list of strings, optional
        Names of columns to drop before encoding. Nothing is dropped when
        omitted.

    Returns
    ----------
    (pd.DataFrame, pd.DataFrame)
        First one is the df with enumerated categorical features (each
        text-like column replaced by its integer category codes); second one
        is one-hot-encoded with ``pd.get_dummies(..., drop_first=True)``.

    '''
    # None instead of a mutable [] default; an omitted argument drops nothing.
    columns_to_drop = [] if cols_to_drop is None else cols_to_drop

    # drop() returns a new frame, so a single call replaces the repeated
    # drop + deepcopy of the original implementation.
    trimmed = df.drop(columns=columns_to_drop)

    df_onehot = pd.get_dummies(trimmed, drop_first=True)

    df_codes = trimmed.copy()
    for col in trimmed.columns:
        dtype = trimmed[col].dtype
        # Encode the same column kinds that get_dummies expands by default
        # (object, string and category), so both outputs stay consistent.
        if dtype == object or isinstance(dtype, (pd.StringDtype, pd.CategoricalDtype)):
            df_codes[col] = pd.Categorical(trimmed[col]).codes

    return df_codes, df_onehot

In [39]:
# Класс с нейронкой

class DenseNN(nn.Module):
    """Three-layer fully connected network producing one value per input row.

    Layer widths are ``in_features -> in_features -> in_features // 3 -> 1``.
    The activation is applied after the first two layers only; the last
    layer is returned as is.

    Parameters
    ----------
    in_features: int
        Number of input features.
    activation: callable, default=F.relu
        Function applied to the output of the two hidden layers.
    do_dropout: bool, default=False
        If True, ``nn.Dropout(0.1)`` is applied after each hidden activation.
    """

    def __init__(self, in_features: int, activation=F.relu, do_dropout=False):
        super().__init__()
        narrow_width = in_features // 3

        self.activation = activation
        # Attribute names fc1/fc2/fc3 are the keys of the state dict.
        self.fc1 = nn.Linear(in_features, in_features)
        self.fc2 = nn.Linear(in_features, narrow_width)
        self.fc3 = nn.Linear(narrow_width, 1)
        self.do_dropout = do_dropout
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Run ``x`` through both hidden layers and the output layer."""
        for hidden_layer in (self.fc1, self.fc2):
            x = self.activation(hidden_layer(x))
            if self.do_dropout:
                x = self.dropout(x)

        return self.fc3(x)

# Как предсказать ZOI_drug_NP
## Способ 1 (плохой)
 - Выполнить ```get_prediction(show_features=True)```. Получится список фич, которые должны быть в данных.
 - Закодировать значения категориальных фичей (вручную или с помощью Series.replace()). Правила ниже.
 - Собрать DataFrame из своих данных с правильными колонками и их порядком и скормить его модели с помощью ```get_prediction(df, *args)```

In [6]:
# Print a heading, then let get_prediction print the feature columns stored
# in fitted_models.pkl (with show_features=True no prediction is made).
print('Признаки, участвующие в моделях')
get_prediction(show_features=True)

Index(['growth_temp, C', 'LabuteASA', 'MolLogP',
       'fold_increase_in_antibacterial_activity (%)', 'avg_Incub_period',
       'method', 'Drug_class_drug_bank', 'Drug', 'NumAliphaticRings', 'shape',
       'Chi0', 'class', 'order', 'MDR_check', 'MolWt', 'NumHDonors',
       'ZOI_drug', 'avg_NP_size', 'phylum', 'family', 'lg_Drug_dose',
       'species', 'NP_Synthesis', 'genus', 'isolated_from', 'MinPartialCharge',
       'Bacteria', 'MaxPartialCharge', 'NumHAcceptors'],
      dtype='object')


In [None]:
''' 
Коды категорий (о других категориях модели не знают, другие недоступны)
NP_Synthesis
{
    "0": "chem_synthesis",
    "1": "green_synthesis"
}
Bacteria
{
    "0": "Acinetobacter baumannii",
    "1": "Actinobacillus pleuropneumoniae",
    "2": "Bacillus cereus",
    "3": "Bacillus sp.",
    "4": "Bacillus subtilis",
    "5": "Candida albicans",
    "6": "Candida glabrata",
    "7": "Candida saitoana",
    "8": "Enterobacter cloacae",
    "9": "Enterococcus faecalis",
    "10": "Escherichia coli",
    "11": "Haemophilus influenzae",
    "12": "Klebsiella pneumoniae",
    "13": "Klebsiella sp.",
    "14": "Listeria monocytogenes",
    "15": "Micrococcus luteus",
    "16": "Neisseria mucosa",
    "17": "Pasteurella Multocida",
    "18": "Proteus mirabilis",
    "19": "Proteus sp.",
    "20": "Proteus vulgaris",
    "21": "Pseudomonas aeruginosa",
    "22": "Pseudomonas koreensis",
    "23": "Salmonella enterica",
    "24": "Salmonella typhi",
    "25": "Salmonella typhimurium",
    "26": "Serratia odorifera",
    "27": "Staphylococcus aureus",
    "28": "Staphylococcus epidermidis",
    "29": "Streptococcus uberis",
    "30": "Vibrio cholerae",
    "31": "Vibrio parahaemolyticus"
}
phylum
{
    "0": "Actinomycetota",
    "1": "Ascomycota",
    "2": "Bacillota",
    "3": "Pseudomonadota"
}
genus
{
    "0": "Acinetobacter",
    "1": "Actinobacillus",
    "2": "Bacillus",
    "3": "Candida",
    "4": "Candida glaebosa",
    "5": "Enterobacter",
    "6": "Enterococcus",
    "7": "Escherichia",
    "8": "Haemophilus",
    "9": "Klebsiella",
    "10": "Listeria",
    "11": "Micrococcus",
    "12": "Nakaseomyces",
    "13": "Neisseria",
    "14": "Pasteurella",
    "15": "Proteus",
    "16": "Pseudomonas",
    "17": "Salmonella",
    "18": "Serratia",
    "19": "Staphylococcus",
    "20": "Streptococcus",
    "21": "Vibrio"
}
Drug_class_drug_bank
{
    "0": "Aminoglycosides",
    "1": "Azolidines",
    "2": "Benzene and substituted derivatives",
    "3": "Carboxylic acids and derivatives",
    "4": "Diazanaphthalenes",
    "5": "Fatty Acyls",
    "6": "Lactams",
    "7": "Macrolactams",
    "8": "NC",
    "9": "Organooxygen compounds",
    "10": "Phenol ethers",
    "11": "Quinolines and derivatives",
    "12": "Steroids and steroid derivatives",
    "13": "Tetracyclines"
}
order
{
    "0": "Bacillales",
    "1": "Enterobacterales",
    "2": "Lactobacillales",
    "3": "Micrococcales",
    "4": "Moraxellales",
    "5": "Neisseriales",
    "6": "Pasteurellales",
    "7": "Pseudomonadales",
    "8": "Saccharomycetales",
    "9": "Vibrionales"
}
species
{
    "0": "Acinetobacter calcoaceticus/baumannii complex",
    "1": "Actinobacillus pleuropneumoniae",
    "2": "Bacillus cereus group",
    "3": "Bacillus subtilis group",
    "4": "Candida albicans",
    "5": "Candida glabrata",
    "6": "Candida saitoana",
    "7": "Enterobacter cloacae complex",
    "8": "Enterococcus faecalis",
    "9": "Escherichia coli",
    "10": "Haemophilus influenzae",
    "11": "Klebsiella pneumoniae",
    "12": "Klebsiella sp.",
    "13": "Listeria monocytogenes",
    "14": "Micrococcus luteus",
    "15": "Neisseria mucosa",
    "16": "Pasteurella Multocida",
    "17": "Proteus mirabilis",
    "18": "Proteus sp.",
    "19": "Proteus vulgaris",
    "20": "Pseudomonas aeruginosa group",
    "21": "Pseudomonas fluorescens group",
    "22": "Salmonella enterica",
    "23": "Serratia odorifera",
    "24": "Staphylococcus aureus",
    "25": "Staphylococcus epidermidis",
    "26": "Streptococcus uberis",
    "27": "Vibrio cholerae",
    "28": "Vibrio harveyi group",
    "29": "unclassified Bacillus"
}
shape
{
    "0": "nanorods and triangles",
    "1": "spherical"
}
kingdom
{
    "0": "Bacteria",
    "1": "Fungi"
}
isolated_from
{
    "0": "blood",
    "1": "food",
    "2": "lungs",
    "3": "mammary gland",
    "4": "meat",
    "5": "mouth",
    "6": "nose",
    "7": "respiratory tract",
    "8": "skin",
    "9": "soil",
    "10": "sputum",
    "11": "urine",
    "12": "water"
}
family
{
    "0": "Bacillaceae",
    "1": "Debaryomycetaceae",
    "2": "Enterobacteriaceae",
    "3": "Enterococcaceae",
    "4": "Listeriaceae",
    "5": "Micrococcaceae",
    "6": "Moraxellaceae",
    "7": "Morganellaceae",
    "8": "Neisseriaceae",
    "9": "Pasteurellaceae",
    "10": "Pseudomonadaceae",
    "11": "Saccharomycetaceae",
    "12": "Staphylococcaceae",
    "13": "Streptococcaceae",
    "14": "Vibrionaceae",
    "15": "Yersiniaceae"
}
gram
{
    "0": "n",
    "1": "p"
}
class
{
    "0": "Actinomycetes",
    "1": "Bacilli",
    "2": "Betaproteobacteria",
    "3": "Gammaproteobacteria",
    "4": "Saccharomycetes"
}
method
{
    "0": "MIC",
    "1": "disc_diffusion",
    "2": "well_diffusion"
}
Drug
{
    "0": "amikacin",
    "1": "amoxicillin",
    "2": "amphotericin b",
    "3": "ampicillin",
    "4": "carbenicillin",
    "5": "cefazolin",
    "6": "cefepime",
    "7": "cefixime",
    "8": "cefotaxime",
    "9": "ceftazidime",
    "10": "ceftriaxone",
    "11": "cephalexin",
    "12": "chloramphenicol",
    "13": "ciprofloxacin",
    "14": "clindamycin",
    "15": "co-trimoxazole",
    "16": "colistin",
    "17": "doxycycline",
    "18": "erythromycin",
    "19": "faropenem",
    "20": "fusidic acid",
    "21": "gentamicin",
    "22": "imipenem",
    "23": "kanamycin",
    "24": "levofloxacin",
    "25": "mupirocin",
    "26": "nalidixic acid",
    "27": "neomycin",
    "28": "nitrofurantoin",
    "29": "penicillin",
    "30": "piperacillin",
    "31": "polymyxin",
    "32": "rifampicin",
    "33": "streptomycin",
    "34": "tetracycline",
    "35": "trimethoprim",
    "36": "vancomycin"
}
'''

## Способ 2 (хороший)
 - Добавить строку со своим экспериментом в чистый ```dataframe```
 - Применить кодирование:

In [40]:
# Load the cleaned dataset that the encoders and models are applied to.
clean_df = pd.read_csv('clean_data.csv')

# Add your own experiments to this df
# NOTE: this bare expression is not the last one in the cell, so it is not displayed.
clean_df

# Integer-coded and one-hot-encoded versions of the same frame.
clean_df_code, clean_df_onehot = code_and_onehot(clean_df)

# Predict with the default model (choosen_model='ExtraTreesRegressor').
pred = get_prediction(clean_df_code)

array([3.20000000e+01, 3.20000000e+01, 3.19900000e+01, 3.20200000e+01,
       1.70100000e+01, 1.69500000e+01, 1.50066667e+01, 3.12500000e+01,
       4.96566667e+01, 4.79533333e+01, 4.79300000e+01, 4.70166667e+01,
       4.40866667e+01, 4.11500000e+01, 3.99866667e+01, 4.00066667e+01,
       3.95933333e+01, 3.73200000e+01, 3.80833333e+01, 3.69600000e+01,
       3.66500000e+01, 3.60000000e+01, 3.60000000e+01, 3.63200000e+01,
       3.56100000e+01, 3.41566667e+01, 3.37233333e+01, 3.38700000e+01,
       3.40866667e+01, 3.40100000e+01, 3.17700000e+01, 3.20933333e+01,
       3.19633333e+01, 3.20966667e+01, 3.19866667e+01, 3.19666667e+01,
       3.17866667e+01, 3.12500000e+01, 3.12500000e+01, 3.04866667e+01,
       3.12266667e+01, 3.09400000e+01, 3.09900000e+01, 3.09900000e+01,
       2.95433333e+01, 2.99100000e+01, 3.00100000e+01, 3.00000000e+01,
       3.00000000e+01, 3.00000000e+01, 2.91233333e+01, 2.90600000e+01,
       2.89900000e+01, 2.90200000e+01, 2.74700000e+01, 2.80100000e+01,
      

## Способ 3 (нейронновый)
 - Сделать то же, что в способе 2
 - Призвать с помощью команд ниже нейронку и сделать предсказание

In [None]:
# Input width of the network: all one-hot columns but one.
# NOTE(review): the "-1" presumably excludes the target column -- confirm.
n_features = clean_df_onehot.shape[1]-1
model = DenseNN(n_features)
# map_location keeps loading working on a CPU-only machine even if the
# checkpoint was saved from a GPU; `device` is the CPU device set in the
# imports cell.
model.load_state_dict(torch.load('best_nn.pkl', map_location=device))
# Inference mode: disables dropout.
model.eval()
