In [None]:
import os
import pickle

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from tpot import TPOTClassifier
from tpot import TPOTRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from sklearn.impute import KNNImputer

In [None]:
# Load the dataset from the CSV file and preview its first rows.
data = pd.read_csv('small_data.csv')
data.head()

In [None]:
# List the column names of the loaded dataset.
data.columns

In [None]:
# Feature matrix: every column except the three target columns.
X = data.drop(columns=["death", "severity", "days_at_hosp"])

# Targets, each cast to integers.
death = data["death"].astype(int)
severity = data["severity"].astype(int)
days = data["days_at_hosp"].astype(int)

In [None]:
def draw_train_test(y_train, y_test, name):
    """Print ``name`` and show one histogram per split (train, then test).

    Args:
        y_train: values plotted under the 'train' label.
        y_test: values plotted under the 'test' label.
        name: heading printed before the two plots.
    """
    print(name)
    for split_label, values in (('train', y_train), ('test', y_test)):
        print(split_label)
        sns.histplot(values)
        plt.show()
    print('=======')
    
def draw_hist(y, name):
    """Print ``name`` and show a histogram of ``y``.

    Args:
        y: values to plot.
        name: heading printed before the plot.
    """
    print(name)
    sns.histplot(data=y)
    plt.show()


# 75/25 train/test split of the same feature matrix for each target,
# using the same seed and sizes every time.
Xd_train, Xd_test, yd_train, yd_test = train_test_split(
    X, death, test_size=0.25, train_size=0.75, random_state=47
)
Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    X, severity, test_size=0.25, train_size=0.75, random_state=47
)
Xday_train, Xday_test, yday_train, yday_test = train_test_split(
    X, days, test_size=0.25, train_size=0.75, random_state=47
)

# Compare the target distributions of the two splits, one target at a time.
for _train_part, _test_part, _title in (
    (yd_train, yd_test, 'death'),
    (ys_train, ys_test, 'severity'),
    (yday_train, yday_test, 'day'),
):
    draw_train_test(_train_part, _test_part, _title)

## Oversampling

### death

In [None]:
def over_sampling(X, y, random_state=None):
    """Balance classes by random oversampling with replacement.

    Every class that has fewer rows than the most frequent class is resampled
    (with replacement) up to that size; classes already at the maximum are
    kept unchanged.

    Args:
        X: pandas DataFrame of features.
        y: class labels for the rows of ``X``. A pandas Series is used as-is
            (rows are selected with a boolean Series mask, as before); any
            other array-like is paired with the rows of ``X`` by position.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            resampling can be reproduced. ``None`` (the default) keeps the
            previous unseeded behaviour.

    Returns:
        Tuple ``(new_X, new_y)``: ``new_X`` is the concatenation of the
        per-class blocks in ``value_counts()`` order (original index labels
        are kept, so they may repeat); ``new_y`` is a 1-D numpy array of
        labels aligned with ``new_X`` by position.

    Raises:
        ValueError: if ``y`` contains no labels.
    """
    if not isinstance(y, pd.Series):
        # Accept arrays/lists too (e.g. the ndarray returned by a previous
        # call), pairing labels with rows by position.
        y = pd.Series(np.asarray(y), index=X.index)

    # Class count
    counts = y.value_counts()
    if counts.empty:
        raise ValueError("over_sampling() needs at least one labelled sample")
    max_count = counts.max()

    new_datasets = []
    new_ys = []
    for label, count in counts.items():
        rows = X[y == label]
        if count < max_count:
            rows = rows.sample(max_count, replace=True, random_state=random_state)
        new_datasets.append(rows)
        # Size the label block from the rows actually selected so that
        # new_X and new_y always have the same length.
        new_ys.append(np.full(len(rows), label))

    new_X = pd.concat(new_datasets, axis=0)
    new_y = np.concatenate(new_ys)
    return new_X, new_y

In [None]:
# Balance the training sets of the two classification targets.
# The same call for the days target is left disabled:
# Xday_train, yday_train = over_sampling(Xday_train, yday_train)
Xd_train, yd_train = over_sampling(X=Xd_train, y=yd_train)
Xs_train, ys_train = over_sampling(X=Xs_train, y=ys_train)

# Show the resulting training-target distributions.
for _target, _title in (
    (yd_train, 'death'),
    (ys_train, 'severity'),
    (yday_train, 'days'),
):
    draw_hist(_target, _title)

## MODELS

In [None]:
def check_classification(y_test, y_pred):
    """Print overall and per-class classification metrics for a prediction.

    Overall accuracy and micro-averaged precision, recall and F1 are printed
    first, followed by a per-class table (one row per class label) rendered
    with ``display``.

    Args:
        y_test: true class labels.
        y_pred: predicted class labels for the same samples.

    Returns:
        None. The metrics are only printed/displayed.
    """
    CM = confusion_matrix(y_test, y_pred)
    # Sorted union of the labels present in either input: the order
    # scikit-learn uses for the confusion-matrix rows and for the
    # ``average=None`` score arrays when no explicit ``labels`` are given.
    class_labels = np.unique(np.concatenate([np.ravel(y_test), np.ravel(y_pred)]))

    print('Overall stat:')
    print(f'Accuracy: {np.round(accuracy_score(y_test, y_pred), 3)}')
    print(f'Precision: {np.round(precision_score(y_test, y_pred, average="micro"), 3)}')
    print(f'Recall: {np.round(recall_score(y_test, y_pred, average="micro"), 3)}')
    print(f'F1: {np.round(f1_score(y_test, y_pred, average="micro"), 3)}')

    print('\nBy classes stat:')
    scores = {
        # Diagonal / row sum of the confusion matrix: the share of each true
        # class's samples that were predicted correctly.
        'Accuracy': np.round(CM.diagonal()/CM.sum(axis=1), 3),
        'Precision': np.round(precision_score(y_test, y_pred, average=None), 3),
        'Recall': np.round(recall_score(y_test, y_pred, average=None), 3),
        'F1': np.round(f1_score(y_test, y_pred, average=None), 3)
    }

    # Index the rows by the actual class labels instead of 0..k-1 so the
    # table stays readable when the classes are not 0-based integers.
    display(pd.DataFrame(scores, index=pd.Index(class_labels, name='class')))

### DEATH

In [None]:
# TPOT search for a death classifier, fitted on the death training set.
tpot_death = TPOTClassifier(
    generations=100,
    mutation_rate=0.9,
    crossover_rate=0.1,
    population_size=10,
    verbosity=2,
    n_jobs=10,
    # Seed the search (same value as the train/test splits) so the pipeline
    # that gets pickled later can be reproduced on a re-run.
    random_state=47,
)
tpot_death.fit(Xd_train, yd_train)

In [None]:
# Predict death on the test split with the fitted TPOT model and report
# the classification metrics.
yd_pred = tpot_death.predict(Xd_test)
check_classification(yd_test, yd_pred)

### Severity

In [None]:
# TPOT search for a severity classifier, fitted on the severity training set.
tpot_sev = TPOTClassifier(
    generations=100,
    population_size=20,
    verbosity=2,
    n_jobs=10,
    # Seed the search (same value as the train/test splits) so the pipeline
    # that gets pickled later can be reproduced on a re-run.
    random_state=47,
)
tpot_sev.fit(Xs_train, ys_train)

In [None]:
# Predict severity on the test split with the fitted TPOT model and report
# the classification metrics.
ys_pred = tpot_sev.predict(Xs_test)
check_classification(ys_test, ys_pred)

## Days

In [None]:
def err_score(y_test, y_pred):
    """Return the summed absolute error divided by the sum of the true values.

    Args:
        y_test: true values.
        y_pred: predicted values, element-wise compatible with ``y_test``.

    Returns:
        ``sum(|y_pred - y_test|) / sum(y_test)``.
    """
    absolute_errors = np.abs(y_pred - y_test)
    total_error = np.sum(absolute_errors)
    total_actual = np.sum(y_test)
    return total_error / total_actual

def scores(y_test, y_pred):
    """Print and return the relative error and R2 of a regression prediction.

    Args:
        y_test: true values.
        y_pred: predicted values.

    Returns:
        Tuple ``(err, R2)``, both rounded to three decimals: ``err`` comes
        from ``err_score`` and ``R2`` from ``r2_score``.
    """
    relative_error = err_score(y_test, y_pred)
    determination = r2_score(y_test, y_pred)
    err, R2 = np.round(relative_error, 3), np.round(determination, 3)
    print(f'err = {err}')
    print(f'R2 = {R2}')
    return err, R2

In [None]:
# Rows with death == 0: features and integer days_at_hosp target.
data_alive = data.loc[data["death"] == 0]
X_alive = data_alive.drop(columns=["death", "severity", "days_at_hosp"])
days_alive = data_alive["days_at_hosp"].astype(int)

# Rows with death == 1: features and integer days_at_hosp target.
data_dead = data.loc[data["death"] == 1]
X_dead = data_dead.drop(columns=["death", "severity", "days_at_hosp"])
days_dead = data_dead["days_at_hosp"].astype(int)

# Separate 75/25 train/test splits for each of the two groups.
Xalive_train, Xalive_test, yalive_train, yalive_test = train_test_split(
    X_alive, days_alive, test_size=0.25, train_size=0.75, random_state=47
)

Xdead_train, Xdead_test, ydead_train, ydead_test = train_test_split(
    X_dead, days_dead, test_size=0.25, train_size=0.75, random_state=47
)

for _train_part, _test_part, _title in (
    (yalive_train, yalive_test, 'alive'),
    (ydead_train, ydead_test, 'dead'),
):
    draw_train_test(_train_part, _test_part, _title)

In [None]:
# TPOT search for a days_at_hosp regressor on the death == 1 training rows.
tpot_dead = TPOTRegressor(
    generations=100,
    population_size=20,
    verbosity=2,
    n_jobs=10,
    # Seed the search (same value as the train/test splits) so the pipeline
    # that gets pickled later can be reproduced on a re-run.
    random_state=47,
)
tpot_dead.fit(Xdead_train, ydead_train)

In [None]:
# Predict days_at_hosp for the death == 1 test rows and report err / R2.
ydead_pred = tpot_dead.predict(Xdead_test)
scores(ydead_test, ydead_pred)

In [None]:
# TPOT search for a days_at_hosp regressor on the death == 0 training rows.
tpot_alive = TPOTRegressor(
    generations=100,
    population_size=20,
    verbosity=2,
    n_jobs=10,
    # Seed the search (same value as the train/test splits) so the pipeline
    # that gets pickled later can be reproduced on a re-run.
    random_state=47,
)
tpot_alive.fit(Xalive_train, yalive_train)

In [None]:
# Predict days_at_hosp for the death == 0 test rows and report err / R2.
yalive_pred = tpot_alive.predict(Xalive_test)
scores(yalive_test, yalive_pred)

## Imputer data

In [None]:
# Fit a KNN imputer (5 neighbours) on the full feature matrix X.
imputer = KNNImputer(n_neighbors=5).fit(X)
# Disabled leftover snippet that wrapped imputed values back into a DataFrame:
#     return pd.DataFrame(
#         imputer.fit_transform(data), columns=data.columns, index=data.index
#     )

## Pickling

In [None]:
# Objects to persist: the fitted imputer and the best pipeline found by each
# TPOT search. `names` holds the matching file stems, in the same order.
models = [
    imputer,
    tpot_death.fitted_pipeline_,
    tpot_sev.fitted_pipeline_,
    tpot_dead.fitted_pipeline_,
    tpot_alive.fitted_pipeline_,
]
names = [
    "imputer",
    "tpot_death",
    "tpot_sev",
    "tpot_dead",
    "tpot_alive",
]

# open(..., 'wb') does not create missing directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs('models', exist_ok=True)

for m, n in zip(models, names):
    print(n)
    with open(f'models/{n}.pickle', 'wb') as f:
        pickle.dump(m, f)

## SOME MODEL TESTING

In [None]:
##### Death Prediction

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC

# Base learners of the stack; a logistic regression combines their outputs.
estimators = [
    ('lr', LogisticRegression()),
    ('svc', SVC(C=0.001, degree=1, coef0=1)),
    ('gbc', GradientBoostingClassifier(n_estimators=17, max_depth=2)),
]
clf_death = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
)

# The scaler is fitted on the training features only and then applied to
# both the training and the test features.
c = StandardScaler().fit(Xd_train)
clf_death.fit(c.transform(Xd_train), yd_train)

yd_pred = clf_death.predict(c.transform(Xd_test))
check_classification(yd_test, yd_pred)

In [None]:
### Days of illness prediction

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression,SGDRegressor
# GradientBoostingRegressor was used below without ever being imported
# (NameError); MinMaxScaler is imported here too so this cell does not depend
# on the import made in the previous cell.
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

# The scaler is fitted on the training features only and then applied to
# both the training and the test features.
cl=MinMaxScaler().fit(Xalive_train)
clf_alive = GradientBoostingRegressor()
clf_alive.fit(cl.transform(Xalive_train), yalive_train)

yalive_pred = clf_alive.predict(cl.transform(Xalive_test))
scores(yalive_test, yalive_pred)