# Модели для бейзлайна

### AutoML approaches

In [1]:
# h2o supplies the autoencoder estimator and the data-loading helpers used in
# the cells that follow.
import h2o
from h2o.estimators.deeplearning import H2OAutoEncoderEstimator
# Initialise h2o before any other h2o call is made.
# NOTE(review): the recorded output of this cell is a ModuleNotFoundError, so
# h2o was not installed in the environment where the notebook was last run.
h2o.init()

ModuleNotFoundError: No module named 'h2o'

In [None]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import os.path

In [None]:
# Location prefix of train.csv / test.csv for the loading cell below.
# Left empty here: set it to the data directory before loading.
PATH = ''

In [None]:
# Load the training and test sets as h2o frames.
# The cells that train and score the autoencoder read these frames as
# `train_ecg` / `test_ecg`, so they are bound under those names here.
# os.path.join avoids building "/train.csv" (filesystem root) when PATH is
# empty and avoids a doubled separator when PATH already ends with one.
train_ecg = h2o.import_file(os.path.join(PATH, "train.csv"))
test_ecg = h2o.import_file(os.path.join(PATH, "test.csv"))

# Keep the names this cell originally defined so nothing that relied on
# them breaks.
train, testg = train_ecg, test_ecg

In [None]:
# Configure the h2o autoencoder; every value below is handed to the estimator
# unchanged.
model = H2OAutoEncoderEstimator(
    hidden=[50],
    activation="Tanh",
    epochs=100,
    l1=1e-5,
    score_interval=0,
)

# Train on the training frame using all of its columns (train_ecg.names) as
# inputs.
model.train(training_frame=train_ecg, x=train_ecg.names)

In [None]:
# Bare expression as the last line of the cell: shows the trained estimator
# as the cell output.
model

In [None]:
# Score the test frame with the trained autoencoder. The result is converted
# to a pandas DataFrame in the next cell, where its 'Reconstruction.MSE'
# column is ranked and thresholded.
reconstruction_error = model.anomaly(test_ecg)

In [None]:
# Pull the h2o result into pandas so it can be ranked, sorted and filtered.
df = reconstruction_error.as_data_frame()

In [None]:
# Add a 'Rank' column in place: ascending=False gives rank 1 to the row with
# the largest reconstruction error.
df['Rank'] = df['Reconstruction.MSE'].rank(ascending=False)

In [None]:
# Order the rows by rank (rank 1, the largest reconstruction error, first)
# and show the result as the cell output.
df_sorted = df.sort_values(by='Rank', ascending=True)
df_sorted

In [None]:
# Keep only the rows whose reconstruction error exceeds the fixed cut-off of
# 1.0 and show them as the cell output.
anomalies = df_sorted.loc[df_sorted['Reconstruction.MSE'] > 1.0]
anomalies

### Модель для rare events на деревьях

In [None]:
import pandas as pd
import numpy as np
import random
import os
import pickle
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC, OneClassSVM
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_curve, auc, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.discrete.discrete_model import Logit

In [None]:
class ImbalancedClassifier(object):
    '''A set of tools to help with classification problems for imbalanced data.

    An instance holds one train/test split together with the name of the
    binary dependent variable (stored as 0/1, where 1 marks the rare event).
    The sampling helpers draw balanced training samples from that split and
    the voting helpers fit models on several such samples.

    INPUTS:
    X_train (dataframe) - Training features.
    X_test (dataframe) - Test features.
    y_train (Pandas data series) - Training response, indexed like X_train.
    y_test (Pandas data series) - Test response.
    dep_variable (str) - Column name to use for the response while sampling.
    '''

    _SAMPLE_METHODS = ('abundant', 'bootstrap')

    def __init__(self, X_train=None, X_test=None, y_train=None, y_test=None, dep_variable=None):
        # All arguments are optional so the class can still be created empty
        # and populated by assigning the attributes afterwards.
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.dep_variable = dep_variable

    def _draw_sample(self, X, y, sample_method, tolerance=0.20):
        '''Draw one balanced sample from (X, y) with the requested method.

        INPUTS:
        X (dataframe) - Features to sample from.
        y (Pandas data series) - 0/1 response sharing X's index.
        sample_method (str) - 'abundant' or 'bootstrap'.
        tolerance (float) - The sample size is the number of rare events
        multiplied by a random factor between 1 - tolerance and 1 + tolerance.
        RETURNS:
        new_x (dataframe) - Sampled features (response column removed).
        new_y (Pandas data series) - Response values for the sampled rows.
        RAISES:
        ValueError - Unknown sample_method, or dep_variable is not set.
        '''
        if sample_method not in self._SAMPLE_METHODS:
            raise ValueError(
                "sample_method must be 'abundant' or 'bootstrap', got {0!r}".format(sample_method))
        dep = self.dep_variable
        if dep is None:
            raise ValueError("dep_variable must be set before sampling")

        # Rebuild the response under the name `dep`: constructing a DataFrame
        # from a Series with a different name would yield an all-NaN column.
        target = pd.Series(np.asarray(y).ravel(), index=y.index, name=dep)
        df = pd.merge(X, target.to_frame(), how='inner', left_index=True, right_index=True)

        sample_pct = random.uniform(1 - tolerance, 1 + tolerance)
        sample_size = int(np.sum(df[dep]) * sample_pct)

        majority = df.loc[df[dep] == 0]
        minority = df.loc[df[dep] == 1]
        if sample_method == 'abundant':
            # Sampling without replacement cannot return more rows than exist.
            sample_size = min(sample_size, len(majority))
            parts = [majority.sample(n=sample_size), minority]
        else:
            parts = [majority.sample(n=sample_size, replace=True),
                     minority.sample(n=sample_size, replace=True)]

        new_x = pd.concat(parts, axis=0)
        new_y = new_x.pop(dep)
        return new_x, new_y

    def sample_abundant_data(self, tolerance=0.20):
        '''Create a sample from the abundant class of a binary dependent variable.

        Works on self.X_train / self.y_train.
        INPUTS:
        tolerance (float) - A tolerance factor for the number of samples to produce.  The resulting sample will be
        between 1 +/- tolerance of the rare events (capped at the number of abundant rows available).
        RETURNS:
        X_tr (dataframe) - A new dataframe containing all instances where dep_variable == 1 and the sampled rows
        where dep_variable == 0.
        y_tr (Pandas data series) - A new Pandas data series of the response variable based on the sample.
        '''
        return self._draw_sample(self.X_train, self.y_train, 'abundant', tolerance)

    def bootstrap_sample(self, tolerance=0.20):
        '''Create bootstrap samples from the majority and minority class of a data frame.  The resulting
        samples are used in balanced random forests (http://statistics.berkeley.edu/sites/default/files/tech-reports/666.pdf)
        and gradient boosting algorithms.

        Works on self.X_train / self.y_train.
        INPUTS:
        tolerance (float) - A tolerance factor for the number of samples to produce.  Each class is sampled
        with replacement to a size between 1 +/- tolerance of the rare events.
        RETURNS:
        X_tr (dataframe) - A new dataframe containing the rows sampled from both classes.
        y_tr (Pandas data series) - A new Pandas data series of the response variable based on the sample.
        '''
        return self._draw_sample(self.X_train, self.y_train, 'bootstrap', tolerance)

    def get_combined_proba(self, models, X_train=None, X_test=None, y_train=None, y_test=None,
                           sample_method='abundant', ksamples=15):
        '''Fit every model on k samples and collect the class probabilities for X_test.

        INPUTS:
        models (dict) - A dictionary of instantiated models, e.g. {'gb': gb, 'lr': lr}.
        X_train, X_test, y_train (optional) - Override the data stored on the instance; when left as
        None the instance attributes are used.
        y_test - Accepted for signature compatibility; not used.
        sample_method (str) - 'abundant' or 'bootstrap'
        ksamples (int) - The number of samples to take from the training dataset.
        RETURNS:
        predictions (list) - One predict_proba result per (sample, model) pair, in fit order.
        '''
        X_train = self.X_train if X_train is None else X_train
        y_train = self.y_train if y_train is None else y_train
        X_test = self.X_test if X_test is None else X_test

        predictions = []
        for _ in range(ksamples):
            Xt, Yt = self._draw_sample(X_train, y_train, sample_method)
            for model in models.values():
                model.fit(Xt, Yt)
                predictions.append(model.predict_proba(X_test))

        return predictions

    def get_majority_vote(self, models, modname='Default Model', sample_method='abundant', ksamples=15, print_results=True):
        ''' Fit k models using either abundant of bootstrap samples and create classification predictions.
        Use a majority vote of the k samples to produce a final classification prediction.
        INPUTS:
        models (dict) - A dictionary of model class instantiation references.
        E.g. if gb=GradientBoostingClassifier, lr=LogisticRegression, then models = {'gb': gb, 'lr': lr}
        modname (str) - A string for printing the algorithm name if quality metrics are printed.
        sample_method (str) - 'abundant' or 'bootstrap', the preferred sample method to use.
        ksamples (int) - The number of samples to take from the training dataset
        print_results (bool) - A boolean of whether or not quality metrics should be printed for final results.
        RETURNS:
        roc_auc (float) - The ROC AUC (area under the receiver/operator characteristic)
        votes (Numpy Array) - An array of the binary predictions for y_test
        probs (Numpy Array) - An array of the probabilities of positive prediction for y_test
        RAISES:
        ValueError - No models were given, ksamples < 1, or sample_method is unknown.
        '''
        if not models or ksamples < 1:
            raise ValueError("get_majority_vote needs at least one model and ksamples >= 1")

        predictions = []
        prob_total = np.zeros(len(self.y_test))
        for _ in range(ksamples):
            Xt, Yt = self._draw_sample(self.X_train, self.y_train, sample_method)
            for model in models.values():
                model.fit(Xt, Yt)
                predictions.append(model.predict(self.X_test))
                prob_total = prob_total + model.predict_proba(self.X_test)[:, 1]

        n_fits = len(predictions)
        # A row is classified 1 when strictly more than half of the fits voted 1.
        votes = (np.sum(predictions, axis=0) > n_fits / 2.0).astype(int)
        probs = prob_total / n_fits

        fpr, tpr, thresholds = roc_curve(self.y_test, probs)
        roc_auc = auc(fpr, tpr)

        if print_results:
            # Report the metrics of the combined vote itself; there is no single
            # fitted model that could be handed to print_quality_metrics.
            self._print_metrics(
                modname,
                accuracy_score(self.y_test, votes),
                precision_score(self.y_test, votes),
                recall_score(self.y_test, votes),
                roc_auc,
                confusion_matrix(self.y_test, votes),
            )

        return roc_auc, votes, probs

    @staticmethod
    def _print_metrics(model_name, accuracy, precision, recall, roc_auc, cm):
        '''Print one metrics line followed by the confusion matrix.'''
        print("Model Name: Accuracy\tPrecision\tRecall\tAUC")
        print('{0}: {1:.4f}\t{2:.4f}\t{3:.4f}\t{4:.4f}'.format(model_name, accuracy, precision, recall, roc_auc))
        print("Confusion Matrix")
        print(cm)

    def print_quality_metrics(self, model, model_name):
        '''Print basic quality metrics for a given model, including:
        confusion matrix, AUC, accuracy, precision and recall.
        Metrics are computed on self.X_test / self.y_test.
        INPUTS:
        model - The instantiated, already fitted model (e.g. gb, lr, rf)
        model_name (str) - The text description of the model for use in printing.
        RETURNS:
        None
        '''
        # Predict once and reuse the result for every metric.
        predicted = model.predict(self.X_test)
        # The AUC is computed from the hard class predictions, as before.
        fpr, tpr, thresholds = roc_curve(self.y_test, predicted)
        self._print_metrics(
            model_name,
            accuracy_score(self.y_test, predicted),
            precision_score(self.y_test, predicted),
            recall_score(self.y_test, predicted),
            auc(fpr, tpr),
            confusion_matrix(self.y_test, predicted),
        )



In [None]:
# Name of the 0/1 dependent-variable column used while sampling.
dep = 'N189'

# TODO: assign the train/test split here before running this cell.
# X_* are feature DataFrames, y_* the matching 0/1 response Series.
X_train = None
X_test = None
y_train = None
y_test = None

# Number of resampled training sets fitted per model.
k_samp = 15

gb = GradientBoostingClassifier(learning_rate=0.005, n_estimators=500,
                                max_features='sqrt', max_depth=5)

# Each entry maps a display name to the dict of estimators voted together;
# model1 has to exist before it is referenced in `models`.
model1 = {'gb': gb}
models = {'Gradient Boosting': model1}

if any(part is None for part in (X_train, X_test, y_train, y_test)):
    raise ValueError("Assign X_train, X_test, y_train and y_test before running this cell.")

# The classifier's methods read the split and the dependent-variable name
# from the instance, so they are supplied once here.
clf = ImbalancedClassifier(X_train, X_test, y_train, y_test, dep_variable=dep)

for modname, model in models.items():
    roc_auc, votes, probs = clf.get_majority_vote(
        model, modname=modname, sample_method='abundant', ksamples=k_samp)

    fpr, tpr, thresholds = roc_curve(y_test, probs)

### Модель для rare events на автоэнкодере

In [7]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pylab import rcParams
import tensorflow as tf
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import recall_score, classification_report, auc, roc_curve
from sklearn.metrics import precision_recall_fscore_support, f1_score
from numpy.random import seed
# Seed meant to make the random selection of data points repeatable.
SEED = 123
# Split fraction; presumably the hold-out share of the data (not used in the
# visible cells, so confirm against the data-preparation code).
DATA_SPLIT_PCT = 0.2
# Default figure size for every plot in the notebook.
rcParams['figure.figsize'] = (8, 6)
# Tick labels for the confusion-matrix heatmap.
LABELS = ["Normal", "Danger", "Very_Danger"]

In [None]:
# Fit the scaler on df_train_0_x only and apply the same transform to every
# other split, so validation/test statistics never influence the scaling.
# NOTE(review): the *_0_x frames presumably hold only class-0 rows — they are
# built outside the visible cells; confirm.
scaler = StandardScaler().fit(df_train_0_x)
df_train_0_x_rescaled = scaler.transform(df_train_0_x)
df_valid_0_x_rescaled = scaler.transform(df_valid_0_x)
# The full validation/test frames still carry the label column 'y'; drop it
# before scaling.
df_valid_x_rescaled = scaler.transform(df_valid.drop(['y'], axis=1))
df_test_0_x_rescaled = scaler.transform(df_test_0_x)
df_test_x_rescaled = scaler.transform(df_test.drop(['y'], axis=1))

In [None]:
# Training settings; nb_epoch and batch_size are consumed by the fit call in
# the next cell.
nb_epoch = 200
batch_size = 128
input_dim = df_train_0_x_rescaled.shape[1] # number of predictor variables
# Widths of the two encoder layers: encoding_dim, then half of it.
encoding_dim = 32
hidden_dim = int(encoding_dim / 2)
# NOTE(review): despite its name, this value is only used below as the L1
# activity-regularisation factor of the first layer; it is not passed to an
# optimizer in this cell — confirm the intent.
learning_rate = 1e-3

# Symmetric dense autoencoder:
# input_dim -> encoding_dim -> hidden_dim -> hidden_dim -> encoding_dim -> input_dim
input_layer = Input(shape=(input_dim, ))
encoder = Dense(encoding_dim, activation="relu", activity_regularizer=regularizers.l1(learning_rate))(input_layer)
encoder = Dense(hidden_dim, activation="relu")(encoder)
decoder = Dense(hidden_dim, activation="relu")(encoder)
decoder = Dense(encoding_dim, activation="relu")(decoder)
# Linear output so the reconstruction can take any real value.
decoder = Dense(input_dim, activation="linear")(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.summary()

In [None]:
# Train the network to reproduce its own input (mean squared error loss).
# NOTE(review): 'accuracy' is kept because it was requested originally, but it
# is presumably not meaningful for a reconstruction target — confirm.
autoencoder.compile(metrics=['accuracy'],
                    loss='mean_squared_error',
                    optimizer='adam')

# Save the model to disk, keeping only the best epoch.
cp = ModelCheckpoint(filepath="autoencoder_classifier.h5",
                     save_best_only=True,
                     verbose=0)

# Write TensorBoard logs under ./logs.
tb = TensorBoard(log_dir='./logs',
                 histogram_freq=0,
                 write_graph=True,
                 write_images=True)

# Input and target are the same array; only the per-epoch history dict is
# kept.
history = autoencoder.fit(df_train_0_x_rescaled, df_train_0_x_rescaled,
                          epochs=nb_epoch,
                          batch_size=batch_size,
                          shuffle=True,
                          validation_data=(df_valid_0_x_rescaled, df_valid_0_x_rescaled),
                          verbose=1,
                          callbacks=[cp, tb]).history

In [None]:
# Reconstruct the full validation set and score every row by its mean squared
# reconstruction error.
valid_x_predictions = autoencoder.predict(df_valid_x_rescaled)
mse = np.mean(np.power(df_valid_x_rescaled - valid_x_predictions, 2), axis=1)
error_df = pd.DataFrame({'Reconstruction_error': mse,
                         'True_class': df_valid['y']})

# Use the reconstruction error as the score for the positive class.
precision_rt, recall_rt, threshold_rt = precision_recall_curve(
    error_df.True_class, error_df.Reconstruction_error)

# precision/recall hold one element more than the thresholds: element i
# belongs to threshold i and the final element has no threshold, so the last
# value is the one to drop.
plt.plot(threshold_rt, precision_rt[:-1], label="Precision", linewidth=5)
plt.plot(threshold_rt, recall_rt[:-1], label="Recall", linewidth=5)
plt.title('Precision and recall for different threshold values')
plt.xlabel('Threshold')
plt.ylabel('Precision/Recall')
plt.legend()
plt.show()

In [None]:
# Reconstruct the full test set and score every row by its mean squared
# reconstruction error.
test_x_predictions = autoencoder.predict(df_test_x_rescaled)
mse = np.mean(np.power(df_test_x_rescaled - test_x_predictions, 2), axis=1)
error_df_test = pd.DataFrame({'Reconstruction_error': mse,
                              'True_class': df_test['y']})
# Replace the original index with 0..n-1 so it can serve as the x axis.
error_df_test = error_df_test.reset_index()

# Reconstruction-error cut-off above which a row is flagged.
threshold_fixed = 0.4
groups = error_df_test.groupby('True_class')

fig, ax = plt.subplots()
# One scatter series per true class.
for name, group in groups:
    ax.plot(group.index, group.Reconstruction_error, marker='o', ms=3.5, linestyle='',
            label="Break" if name == 1 else "Normal")
ax.hlines(threshold_fixed, ax.get_xlim()[0], ax.get_xlim()[1], colors="r", zorder=100, label='Threshold')
ax.legend()
plt.title("Reconstruction error for different classes")
plt.ylabel("Reconstruction error")
plt.xlabel("Data point index")
plt.show()

In [None]:
# Flag a row as positive when its reconstruction error exceeds the threshold.
# NOTE(review): this reads error_df (built from the validation set), not
# error_df_test — confirm which split is meant to be evaluated here.
pred_y = [1 if e > threshold_fixed else 0 for e in error_df.Reconstruction_error.values]
conf_matrix = confusion_matrix(error_df.True_class, pred_y)

plt.figure(figsize=(12, 12))
# NOTE(review): LABELS lists three classes while pred_y only takes the values
# 0/1 — check the labels against the shape of conf_matrix.
sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d")
plt.title("Confusion matrix")
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()

### Модель для анализа выживаемости