In [None]:
import sys
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import validation_curve, learning_curve,GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score,roc_curve,accuracy_score,make_scorer 
from sklearn.metrics import auc

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the "Training" sheet of the workbook and preview its first rows.
df = pd.read_excel("Задания_1_2.xlsx", sheet_name="Training")
df.head()

In [None]:
# Inspect the dataset: column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the columns (pandas defaults).
df.describe()

In [None]:
# Fill missing values with the column MEAN (the previous comment said "minimum",
# but the code has always used the mean).
# The assignment form is used instead of `df[col].fillna(..., inplace=True)`:
# that chained in-place call operates on a temporary object under pandas
# Copy-on-Write and leaves `df` unchanged, and the warning about it is hidden by
# warnings.filterwarnings('ignore') in the import cell.
# NOTE(review): the means are computed on the full training sheet, i.e. before
# the train/validation/test split below - confirm this is acceptable.
cols_with_gaps = ['P2', 'P3', 'P8', 'P16', 'P25', 'P29']
df[cols_with_gaps] = df[cols_with_gaps].fillna(df[cols_with_gaps].mean())

In [None]:
# Re-check non-null counts now that the imputation cell has run.
df.info()

In [None]:
# Class counts of the target (original author's note: the dataset is balanced).
df["Target"].value_counts()

In [None]:
# Remove the ID column (presumably a row identifier - verify) so it is not used
# as a feature. errors='ignore' keeps the cell re-runnable: `del df['ID']`
# raised KeyError on a second execution.
df = df.drop(columns=['ID'], errors='ignore')

In [None]:
# Heat map of the pairwise Spearman correlations between the columns of df.
sns.set()
fig, ax = plt.subplots(figsize=(40, 20))
sns.heatmap(
    df.corr(method='spearman'),
    annot=True,
    cmap='PuOr',
    ax=ax,
);

In [None]:
# Ten most strongly correlated column pairs of df (absolute Pearson correlation).
corr = df.corr()
# Keep only the strictly upper triangle of the matrix. This removes the
# self-correlations on the diagonal and the mirrored (a, b)/(b, a) entries by
# POSITION. The previous value-based clean-up (`c[c == 1] = 0` followed by
# `drop_duplicates()`) also discarded distinct pairs that happened to share the
# same coefficient and hid perfectly correlated features.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
c = corr.abs().where(upper_mask).unstack().dropna().sort_values(ascending=False)
tmp = c.head(10)
tmp

In [None]:
# Replace strongly correlated feature pairs with their product and drop the originals.
def prepare_df(data, products=None):
    """Return a copy of ``data`` where correlated column pairs are replaced by their product.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    products : dict[str, tuple[str, str]] | None
        Mapping ``new_column -> (left_column, right_column)``. Each new column is
        ``data[left_column] * data[right_column]``. When ``None`` (default) the
        five pairs originally hard-coded in this notebook are used.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns in their original order, followed by the
        product columns in mapping order.

    Raises
    ------
    KeyError
        If a referenced source column is missing from ``data``.
    """
    if products is None:
        products = {
            "P23_22": ("P23", "P22"),
            "P25_17": ("P25", "P17"),
            "P1_5": ("P1", "P5"),
            "P12_15": ("P12", "P15"),
            "P31_29": ("P31", "P29"),
        }

    df = data.copy()
    for new_col, (left, right) in products.items():
        df[new_col] = df[left] * df[right]

    # A source column may take part in several products; list it only once.
    used_columns = list(dict.fromkeys(col for pair in products.values() for col in pair))
    return df.drop(columns=used_columns)

In [None]:
# Build the modelling frame: product features replace the correlated source columns.
data=prepare_df(df)

In [None]:
# Preview the engineered frame.
data.head()

In [None]:
# Ten most strongly correlated column pairs after feature engineering
# (absolute Pearson correlation).
corr_data = data.corr()
# Keep only the strictly upper triangle of the matrix. This removes the
# self-correlations on the diagonal and the mirrored (a, b)/(b, a) entries by
# POSITION. The previous value-based clean-up (`c[c == 1] = 0` followed by
# `drop_duplicates()`) also discarded distinct pairs that happened to share the
# same coefficient and hid perfectly correlated features.
upper_mask = np.triu(np.ones(corr_data.shape, dtype=bool), k=1)
c = corr_data.abs().where(upper_mask).unstack().dropna().sort_values(ascending=False)
tmp = c.head(10)
tmp

In [None]:
# Separate the feature matrix from the label column.
X = data.drop('Target', axis=1)
y = data['Target']

In [None]:
# Split into train / validation / test parts.
# 20% is held out for testing, then 25% of the remaining 80% becomes the
# validation part, i.e. a 60/20/20 split overall.
# The second call overwrites X_train/y_train, so re-running this cell alone
# would split the already reduced training part again.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=232
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=232
)


In [None]:
# Tune a model with GridSearchCV and print its metrics on the training data.

def fit_classifier(model, X, y, parameters=None, scorer_metrics=None):
    """Grid-search ``model`` with 5-fold CV and report metrics on the data it was fit on.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn classifier.
    X, y : array-like
        Features and binary labels used for both the search and the report.
    parameters : dict | list[dict] | None
        Grid passed to ``GridSearchCV``. ``None`` is treated as an empty grid,
        i.e. only the estimator's own settings are evaluated.
    scorer_metrics : callable | None
        Metric ``f(y_true, y_pred)`` wrapped with ``make_scorer``. ``None``
        falls back to the estimator's default scorer.

    Returns
    -------
    estimator
        ``best_estimator_`` of the search, already fit on all of ``X, y``.
    """
    # Passing None straight through used to fail inside GridSearchCV / make_scorer.
    param_grid = parameters if parameters is not None else {}
    scoring = make_scorer(scorer_metrics) if scorer_metrics is not None else None

    # Perform grid search on the classifier using the chosen scoring method
    grid_obj = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=5)

    # Fit the grid search object to the training data and find the optimal parameters
    grid_fit = grid_obj.fit(X, y)

    # GridSearchCV refits the winning configuration on the whole of X, y
    # (refit=True is the default), so no additional fit() call is needed.
    model_estimator = grid_fit.best_estimator_

    # Report the metric scores on the training data
    y_pred = model_estimator.predict(X)
    # ROC AUC is a ranking metric: use class-1 probabilities when the model
    # provides them (as roc_curve_plot does) rather than hard 0/1 labels.
    if hasattr(model_estimator, "predict_proba"):
        y_score = model_estimator.predict_proba(X)[:, 1]
    else:
        y_score = y_pred

    print("\n")
    print("\nModel performance on training set\n------------------------")
    print("Final accuracy score on the training data: {:.4f}".format(accuracy_score(y, y_pred)))
    print("Final precision score on training data: {:.4f}".format(precision_score(y, y_pred)))
    print("Final Recall score on training data: {:.4f}".format(recall_score(y, y_pred)))
    print("Final ROC AUC score on training data: {:.4f}".format(roc_auc_score(y, y_score)))
    print("\n")
    # Print the selected hyper-parameters, which is what the message announces.
    print("The best parameters are: {}".format(grid_fit.best_params_))

    return model_estimator

In [None]:
# Predict on a held-out split and print the metrics for it.
def classifier_test(model_fit, X, y):
    """Print accuracy, precision, recall and ROC AUC of a fitted binary classifier.

    Parameters
    ----------
    model_fit : estimator
        Already fitted classifier.
    X, y : array-like
        Features and true binary labels of the split being evaluated.

    Returns
    -------
    array-like
        Hard class predictions ``model_fit.predict(X)``.
    """
    y_pred = model_fit.predict(X)
    # ROC AUC is a ranking metric: use class-1 probabilities when the model
    # provides them (as roc_curve_plot does) rather than hard 0/1 labels.
    if hasattr(model_fit, "predict_proba"):
        y_score = model_fit.predict_proba(X)[:, 1]
    else:
        y_score = y_pred
    print("\n")
    print("\nModel performance on test set\n------------------------")
    print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y, y_pred)))
    print("Final precision score on testing data: {:.4f}".format(precision_score(y, y_pred)))
    print("Final Recall score on testing data: {:.4f}".format(recall_score(y, y_pred)))
    print("Final ROC AUC score on testing data: {:.4f}".format(roc_auc_score(y, y_score)))
    return y_pred

In [None]:
# Plot a ROC curve.
def roc_curve_plot(model, X, y,label=None):
    """Draw the ROC curve of ``model`` on the current axes and print its AUC.

    Parameters
    ----------
    model : estimator
        Fitted classifier exposing ``predict_proba``.
    X, y : array-like
        Features and true binary labels.
    label : str | None
        Legend label for the curve; also used in the printed message.

    Notes
    -----
    Uses ``sklearn.metrics.auc``, which is imported in the notebook's import
    cell so this function does not depend on a later cell having been run.
    """
    # Probability of the second class (column 1 of predict_proba).
    y_score = model.predict_proba(X)[:, 1]

    # roc_curve returns (fpr, tpr, thresholds); thresholds are not needed here.
    fpr, tpr, _ = roc_curve(y, y_score)

    plt.plot(fpr, tpr, label=label)
    plt.plot([0,1],[0,1], 'k--')  # diagonal reference line
    plt.axis([0,1,0,1])
    plt.xlabel('FPR')
    plt.ylabel('TPR')

    roc_score = auc(fpr, tpr)
    print('AUC score of %s is %.4f.' % (label, roc_score))

In [None]:
from sklearn.metrics import auc

def gain_plot(lift_input, label=None):
    """Plot a cumulative gain curve and print the area under it.

    Parameters
    ----------
    lift_input : sequence
        ``lift_input[1]`` is plotted on the x axis ('Population%') and
        ``lift_input[0]`` on the y axis ('Subscribe%').
        NOTE(review): ``auc`` requires monotonic x values - confirm callers
        pass them sorted.
    label : str | None
        Legend label for the curve; also used in the printed message.
    """
    plt.plot(lift_input[1], lift_input[0], label=label)
    plt.plot([0,1],[0,1], 'k--') # reference line for random model
    plt.axis([0,1,0,1])
    plt.xlabel('Population%')
    plt.ylabel('Subscribe%')
    # Area under the gain curve. The `reorder` keyword was removed from
    # sklearn.metrics.auc in scikit-learn 0.22, so passing it raises TypeError.
    gain_auc = auc(lift_input[1], lift_input[0])
    print('AUC score of %s is %.4f.' % (label, gain_auc))

In [None]:
# Grid-search the best logistic regression (selected by recall).
parameters_LR = {'C': [0.0001,0.0003, 0.0005], 'penalty': ['l1', 'l2']}

# solver='liblinear' supports both the 'l1' and 'l2' penalties in the grid.
# The default 'lbfgs' solver rejects 'l1', so half of the candidates failed to
# fit, and the resulting warnings were hidden by warnings.filterwarnings('ignore').
model_LR = fit_classifier(LogisticRegression(random_state=18, solver='liblinear'), X_train, y_train, 
                          parameters=parameters_LR, scorer_metrics=recall_score)
# Accuracy of the selected model on the test split.
model_LR.score(X_test, y_test)

In [None]:
# Logistic regression model evaluated on the test split.
y_test_LR = classifier_test(model_LR, X_test, y_test)

In [None]:
# Grid-search the best random forest (selected by recall).
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid: 4 * 5 * 4 = 80 candidate combinations.
parameters_RF = {
    'max_depth': [2, 5, 7, 10],
    'min_samples_leaf': [2, 3, 5, 7, 10],
    'min_samples_split': [2, 3, 5, 10],
}

model_RF = fit_classifier(
    RandomForestClassifier(random_state=18),
    X_train,
    y_train,
    parameters=parameters_RF,
    scorer_metrics=recall_score,
)
# Accuracy of the selected model on the test split.
model_RF.score(X_test, y_test)

In [None]:
# Random forest model evaluated on the test split.
y_test_RF = classifier_test(model_RF, X_test, y_test)

In [None]:
# Grid-search the best decision tree (selected by recall).
# Hyper-parameter grid: 3 * 3 * 3 = 27 candidate combinations.
parameters_DT = dict(
    max_depth=[7, 8, 9],
    min_samples_leaf=[2, 3, 4],
    min_samples_split=[2, 3, 4],
)

model_DT = fit_classifier(
    DecisionTreeClassifier(random_state=44),
    X_train,
    y_train,
    parameters=parameters_DT,
    scorer_metrics=recall_score,
)
# Accuracy of the selected model on the test split.
model_DT.score(X_test, y_test)

In [None]:
# Decision tree model evaluated on the test split.
y_test_DT = classifier_test(model_DT, X_test, y_test)

In [None]:
# ROC curves of all three models on the training split.
for clf, name in (
    (model_LR, 'Logistic Regression'),
    (model_DT, 'Decision Tree'),
    (model_RF, 'Random Forest'),
):
    roc_curve_plot(clf, X_train, y_train, label=name)
plt.title('ROC Curves on Train Set')
plt.legend(loc='lower right')

In [None]:
# ROC curves of all three models on the test split.
for clf, name in (
    (model_LR, 'Logistic Regression'),
    (model_DT, 'Decision Tree'),
    (model_RF, 'Random Forest'),
):
    roc_curve_plot(clf, X_test, y_test, label=name)
plt.title('ROC Curves on Test Set')
plt.legend(loc='lower right')

In [None]:
# ROC curves of all three models on the validation split.
for clf, name in (
    (model_LR, 'Logistic Regression'),
    (model_DT, 'Decision Tree'),
    (model_RF, 'Random Forest'),
):
    roc_curve_plot(clf, X_val, y_val, label=name)
plt.title('ROC Curves on Valid Set')
plt.legend(loc='lower right')

In [None]:
# Refit the selected random forest on ALL labelled rows (train + validation + test).
# NOTE(review): from here on model_RF has seen X_test/X_val, so re-running the
# evaluation cells above after this one would report leaked scores.
model_RF.fit(X, y)

# The AUC below is computed on the same rows the model was just fit on, so it is
# an in-sample figure, not an estimate of generalisation.
pred = model_RF.predict_proba(X)[:, 1]
print("Total ROC AUC: %.2f" % roc_auc_score(y, pred))

In [None]:
# Confusion-matrix helper.
def show_confusion_matrix(X, y, clf, threshold=0.7):
    """Print accuracy and display the confusion matrix at a probability threshold.

    Parameters
    ----------
    X, y : array-like
        Features and true labels; labels are expected to be encoded as 0/1,
        matching the hard-coded row/column captions.
    clf : estimator
        Fitted classifier exposing ``predict_proba``.
    threshold : float
        A row is predicted as class 1 when its class-1 probability is
        ``>= threshold``.
    """
    positive_proba = clf.predict_proba(X)[:, 1]
    # Vectorised thresholding instead of a Python-level loop.
    pred = (positive_proba >= threshold).astype(int)
    # labels=[0, 1] pins the matrix to 2x2 even when one class is absent from
    # both y and pred; otherwise the fixed captions below would not fit.
    cm = pd.DataFrame(confusion_matrix(y, pred, labels=[0, 1]),
        index=["Real 0", "Real 1"], columns=["Predicted 0", "Predicted 1"])
    print("Threshold = %.2f" % threshold)
    print("Accuracy score: %.1f%%" % (100 * accuracy_score(y, pred)))
    print("Confusion matrix:")
    display(cm)

In [None]:
# Confusion matrix of the random forest on the full labelled set X, y at a 0.7 cut-off.
show_confusion_matrix(X, y, model_RF, threshold=.7)

In [None]:
# Build a {column name: importance} dictionary.
def importances_dict(columns, model):
    """Map each column name to the model's feature importance, rounded to 2 decimals.

    ``columns`` and ``model.feature_importances_`` are paired positionally.
    """
    return {
        name: weight.round(2)
        for name, weight in zip(columns, model.feature_importances_)
    }
# Plot the feature importances.
def plot_importances(importances):
    """Show a horizontal bar chart of a {feature: importance} mapping."""
    _, axis = plt.subplots(figsize=(20, 10))
    importance_series = pd.Series(importances)
    importance_series.plot(kind='barh', ax=axis)
    plt.title("Важность переменных")
    plt.grid(axis="x")
    plt.show()
    
# Use a larger font, then chart the random forest's feature importances.
plt.rcParams['font.size'] = 12
plot_importances(importances_dict(columns=X.columns, model=model_RF))

In [None]:
# Load the "Validate" sheet of the same workbook and preview its first rows.
valid_df = pd.read_excel("Задания_1_2.xlsx", sheet_name="Validate")
valid_df.head()

In [None]:
# Remove the ID column, as was done for the training sheet.
# errors='ignore' keeps the cell re-runnable: `del valid_df['ID']` raised
# KeyError on a second execution.
valid_df = valid_df.drop(columns=['ID'], errors='ignore')

In [None]:
# Give the Validate sheet the same preprocessing as the training sheet:
# fill gaps with the TRAINING column means (no statistics are taken from the
# validate rows), then build the product features.
# fillna with a Series only touches the columns named in it, so a Target
# column in the Validate sheet, if present, is left as is.
# NOTE: prepare_df drops its source columns, so this cell cannot be re-run
# without first reloading valid_df.
train_feature_means = df.drop(columns=['Target']).mean(numeric_only=True)
valid_df = prepare_df(valid_df.fillna(train_feature_means))

In [None]:
# Score the prepared Validate sheet. The previous code called predict_proba(X),
# i.e. it scored the TRAINING matrix, so the exported "validate" scores were
# in fact training predictions.
# Selecting X.columns feeds the model exactly the features it was fit on, in
# the same order, whether or not the sheet carries extra columns.
valid_features = valid_df[X.columns]
valid_scores = model_RF.predict_proba(valid_features)[:, 1]

pred = pd.DataFrame({"prediction": valid_scores})
pred.head()


In [None]:
# Export the scores. to_csv does not create missing folders, so make sure the
# target directory exists first.
scores_path = Path("data/validate_scores.csv")
scores_path.parent.mkdir(parents=True, exist_ok=True)
pred.to_csv(scores_path, index=False)