In [6]:
# Числовые признаки
num_cols = [
    'ClientPeriod',
    'MonthlySpending',
    'TotalSpent'
]

# Категориальные признаки
cat_cols = [
    'Sex',
    'IsSeniorCitizen',
    'HasPartner',
    'HasChild',
    'HasPhoneService',
    'HasMultiplePhoneNumbers',
    'HasInternetService',
    'HasOnlineSecurityService',
    'HasOnlineBackup',
    'HasDeviceProtection',
    'HasTechSupportAccess',
    'HasOnlineTV',
    'HasMovieSubscription',
    'HasContractPhone',
    'IsBillingPaperless',
    'PaymentMethod'
]

feature_cols = num_cols + cat_cols
target_col = 'Churn'

import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

PATH = r'C:\Users\user\PycharmProjects\DeepLearningSchool\8.Algorithm_composition\{}'
data = pd.read_csv(PATH.format('train.csv'))
features_labels = []
print(f"Количество строк {data.shape[0]}, количество признаков {data.shape[1]}")

# Чтение данных для обучения модели
y = data['Churn']
data.drop(['Churn'], inplace=True, axis=True)
data['TotalSpent'] = data['TotalSpent'].replace(' ', '0')
data['TotalSpent'] = data['TotalSpent'].astype(float)
data['TotalSpent'] = data['TotalSpent'].replace(0, data['TotalSpent'].mean())

# Чтение данных для предсказания
data_predict = pd.read_csv(PATH.format('test.csv'))
data_predict['TotalSpent'] = data_predict['TotalSpent'].replace(' ', '0')
data_predict['TotalSpent'] = data_predict['TotalSpent'].astype(float)
data_predict['TotalSpent'] = data_predict['TotalSpent'].replace(0, data['TotalSpent'].mean())
X_predict = data_predict

#LabelEncoder для категориальных признаков
le = LabelEncoder()
for s in cat_cols:
    le.fit(data[s])
    data[s+'_le'] = le.transform(data[s])
    X_predict[s+'_le'] = le.transform(X_predict[s])
    features_labels.append(s+'_le')

# OneHotEncoder для категориальных признаков
ohe = OneHotEncoder(handle_unknown='ignore')
for s in cat_cols:
    ohe.fit(data[s].values.reshape(-1, 1))
    
    new_ohe_features = ohe.transform(X_predict[s].values.reshape(-1, 1)).toarray()
    tmp = pd.DataFrame(new_ohe_features, columns=[s+'='+str(i) for i in range(new_ohe_features.shape[1])])
    X_predict = pd.concat([X_predict, tmp], axis=1)
    X_predict.drop([s], axis=1, inplace=True)
    
    new_ohe_features = ohe.transform(data[s].values.reshape(-1, 1)).toarray()
    tmp = pd.DataFrame(new_ohe_features, columns=[s+'='+str(i) for i in range(new_ohe_features.shape[1])])
    data = pd.concat([data, tmp], axis=1)
    data.drop([s], axis=1, inplace=True)
    
    [features_labels.append(s+'='+str(i)) for i in range(new_ohe_features.shape[1])]

# Нормализация числовых признаков
scaler = StandardScaler()
for i in num_cols:
    scaler.fit(data[i].values.reshape(-1, 1))
    
    X_predict[i+'_ss'] = scaler.transform(X_predict[i].values.reshape(-1, 1))
    X_predict.drop([i], axis=1, inplace=True)
    
    data[i+'_ss'] = scaler.transform(data[i].values.reshape(-1, 1))
    data.drop([i], axis=1, inplace=True)
    
    features_labels.append(i+'_ss') 

# Разбивание выборки для обучения на тестовую и валидационную
X_train, X_valid, y_train, y_valid = train_test_split(
    data, y, stratify=y, random_state=42, test_size=0.2
)


from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectFromModel



transformer = MaxAbsScaler().fit(X_train)
X_train = pd.DataFrame(transformer.transform(X_train), columns=features_labels)
X_valid = pd.DataFrame(transformer.transform(X_valid), columns=features_labels)


# qt = QuantileTransformer(n_quantiles=1000, random_state=0, output_distribution='normal')
# qt.fit(X_train, y_train)
# X_train = pd.DataFrame(qt.transform(X_train), columns=features_labels)
# X_valid = pd.DataFrame(qt.transform(X_valid), columns=features_labels)

new_names = ['ClientPeriod_ss', 'MonthlySpending_ss', 'TotalSpent_ss']
pf = PolynomialFeatures(interaction_only=True, degree=2)
pf.fit(X_train[(new_names)])

X_train_pf = pf.transform(X_train[(new_names)])
X_valid_pf = pf.transform(X_valid[(new_names)])
new_names_pf = [f'new_names_{i}' for i in range(X_train_pf.shape[1])]

X_train_pf = pd.DataFrame(X_train_pf, columns=new_names_pf)
X_valid_pf = pd.DataFrame(X_valid_pf, columns=new_names_pf)
X_train = pd.concat([X_train, X_train_pf], axis=1)
X_valid = pd.concat([X_valid, X_valid_pf], axis=1)
X_train = X_train.drop(['new_names_0'], axis=1)
X_valid = X_valid.drop(['new_names_0'], axis=1)

# Лучшие результаты получены для моделей LogisticRegression и GradientBoostingClassifier при k=62
selecter = SelectKBest(f_classif, k=48).fit(X_train, y_train)
X_train = selecter.transform(X_train)
X_valid = selecter.transform(X_valid)

print(f"Количество строк {X_train.shape[0]}, количество признаков {X_train.shape[1]}")
print(f"Количество строк {y_train.shape}")
print(X_train.shape)
print(X_valid.shape)

Количество строк 5282, количество признаков 20
Количество строк 4225, количество признаков 48
Количество строк (4225,)
(4225, 48)
(1057, 48)


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline


##############################################################################################
# MLPClassifier - 0.8520

X_train_mlpc = X_train.copy()
y_train_mlpc = y_train.copy()
X_valid_mlpc = X_valid.copy()
y_valid_mlpc = y_valid.copy()

parameters_mlpc = {
    'activation': ['logistic'],
    'solver': ['adam'],
    'max_iter': [50, 100, 150],
    'alpha': [0.0001, 0.0005],
    'learning_rate': ['constant'],
    'hidden_layer_sizes': [100, 150, 200],
    'random_state': [57]
}
clf_mlpc = MLPClassifier()
mlpc_grid = GridSearchCV(clf_mlpc, parameters_mlpc, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)
mlpc_grid.fit(X_train_mlpc, y_train_mlpc)
##############################################################################################


##############################################################################################
# GaussianNB - 0.8294

clf_gnb = GaussianNB()
clf_gnb.fit(X_train, y_train)
##############################################################################################


##############################################################################################
# GradientBoostingClassifier - 0.8519

pipe_gbc = Pipeline(steps=[
        ('select', SelectFromModel(estimator=GradientBoostingClassifier(loss='exponential', 
                                                                        criterion='friedman_mse', 
                                                                        random_state=57))),
        ('clf', GradientBoostingClassifier(loss='exponential', 
                                           criterion='friedman_mse', 
                                          random_state=57))
])

parameters_gbc = {
        'clf__learning_rate': [0.05],
        'clf__n_estimators': [200, 300, 400],
        'clf__subsample': [1, 2, 3],
        'clf__min_samples_leaf': [2, 3],
        'clf__max_depth': [1, 2],
        'clf__max_features': [None],
        'clf__random_state': [57]
        }

grid_gbc = GridSearchCV(pipe_gbc, parameters_gbc, cv=5, scoring='roc_auc', n_jobs=-1, refit=True, verbose=1)
grid_gbc.fit(X_train, y_train)
##############################################################################################


##############################################################################################
# LogisticRegression - 0.8491

pipe_lr = Pipeline(steps=[
        ('select', SelectFromModel(estimator=LogisticRegression(solver='saga', random_state=57))),
        ('clf', LogisticRegression(solver='saga', random_state=57))
])

parameters_lr= {
    'clf__penalty': ['l1'],
    'clf__C': [20],
    'clf__class_weight': [None],
    'clf__max_iter': [50000],
    'clf__random_state': [57],
    'clf__solver': ['saga']
}

grid_lr = GridSearchCV(pipe_lr, parameters_lr, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)
grid_lr.fit(X_train, y_train)
##############################################################################################


##############################################################################################
# RandomForestClassifier - 0.84906

pipe_rfc = Pipeline(steps=[
        ('select', SelectFromModel(estimator=RandomForestClassifier(criterion='entropy'))),
        ('clf', RandomForestClassifier(criterion='entropy'))
])

parameters_rfc = {
    'clf__n_estimators': [200, 500, 1000],
    'clf__criterion': ['entropy'],
    'clf__min_samples_leaf': [1],
    'clf__max_features': ['sqrt'],
    'clf__max_samples': [80]
}

grid_rfc = GridSearchCV(pipe_rfc, parameters_rfc, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)
grid_rfc.fit(X_train, y_train)
##############################################################################################

estimators = [
              ('grid_rfc', grid_rfc.best_estimator_),
              ('grid_gbc', grid_gbc.best_estimator_),
              ('clf_gnb', clf_gnb),
              ('grid_lr', grid_lr.best_estimator_),
              ('mlpc_grid', mlpc_grid.best_estimator_)
]

sc_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5)
parameters_sc_lr = {
    'final_estimator__penalty': ['l2'],
    'final_estimator__solver': ['liblinear'],
    'final_estimator__class_weight': ['balanced'],
    'final_estimator__C': [8, 10, 12],
    'final_estimator__max_iter': [50000],
    'stack_method': ['predict_proba']
}
grid_sc_sklearn_clf = GridSearchCV(sc_clf, parameters_sc_lr, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)
grid_sc_sklearn_clf.fit(X_train, y_train)

print('Sklearn - best estimator: ', grid_sc_sklearn_clf.best_estimator_)
print("Предсказание класса через лучшие параметры параметры", 
      roc_auc_score(y_valid, grid_sc_sklearn_clf.best_estimator_.predict_proba(X_valid)[:, 1]))
##############################################################################################

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   18.4s finished


Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   35.2s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:    1.6s remaining:    4.5s
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:    2.6s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    4.1s finished


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:   36.7s remaining:  1.7min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:   39.3s remaining:   19.6s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   55.2s finished


Sklearn - best estimator:  StackingClassifier(cv=5,
                   estimators=[('grid_rfc',
                                Pipeline(memory=None,
                                         steps=[('select',
                                                 SelectFromModel(estimator=RandomForestClassifier(bootstrap=True,
                                                                                                  ccp_alpha=0.0,
                                                                                                  class_weight=None,
                                                                                                  criterion='entropy',
                                                                                                  max_depth=None,
                                                                                                  max_features='auto',
                                                                                               

In [None]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.ensemble import StackingClassifier

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score


#############################################################################################
params_xgb = {
        'min_child_weight': [5],
        'gamma': [5],
        'subsample': [0.1],
        'colsample_bytree': [0.5, 0.55, 0.6, ],
        'max_depth': [2]
        }

clf_xgb = XGBClassifier()
xgb_grid = GridSearchCV(clf_xgb, params_xgb, cv=5, verbose=4, scoring='roc_auc', refit=True, n_jobs=-1)
xgb_grid.fit(X_train, y_train)
#############################################################################################


#############################################################################################
parametrs_lgb = {
    'num_leaves': [6],
   'max_depth': [4],
   'class_weight': ['balanced'], 
   'random_state': [100],
   'learning_rate': [0.05],
   'n_estimators': [110]
}

clf_lgb = lgb.LGBMClassifier()
lgb_grid = GridSearchCV(clf_lgb, parametrs_lgb, cv=5, verbose=4, scoring='roc_auc', refit=True, n_jobs=-1)
lgb_grid.fit(X_train, y_train)
#############################################################################################


#############################################################################################
parameters_lr= {
    'penalty': ['elasticnet', 'l1', 'l2', 'none'],
    'C': [0.01, 0.1, 1, 10, 20, 50],
    'class_weight': [None],
    'max_iter': [1000],
    'solver': ['saga']
}
clf_lr = LogisticRegression()
lr_grid = GridSearchCV(clf_lr, parameters_lr, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)
lr_grid.fit(X_train, y_train)
#############################################################################################


#############################################################################################
cbc_clf = CatBoostClassifier(iterations=400,
                       depth=4,
                       learning_rate=0.05,
                       loss_function='Logloss',
                       verbose=False,
                       random_seed = 4,
                       l2_leaf_reg = 40,
                       eval_metric='AUC'
                       )

parametrs_cbc = {
    'iterations': [160],
    "learning_rate": [0.05],
    'min_data_in_leaf': [6],
    'depth': [5],
    'l2_leaf_reg': [0]    
}

cbc_grid = GridSearchCV(cbc_clf, parametrs_cbc, cv=5, verbose=4, scoring='roc_auc', refit=True, n_jobs=-1)
cbc_grid.fit(X_train, y_train)
#############################################################################################

estimators = [
     ('xgb', xgb_grid.best_estimator_),
     ('lgb', lgb_grid.best_estimator_),
     ('cbc', cbc_grid.best_estimator_),
    ('lr', lr_grid.best_estimator_)
]

sc_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5)
parameters_sc_lr = {
    'final_estimator__C': [10, 1, 0.1],
    'final_estimator__penalty': ['elasticnet'],
    'final_estimator__solver': ['saga'],
    'final_estimator__l1_ratio': [0.5, 0.9],
    'stack_method': ['predict_proba']
}
gs_sc_estim_clf = GridSearchCV(sc_clf, parameters_sc_lr, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)
gs_sc_estim_clf.fit(X_train, y_train)
print('StackingClassifier - best estimator: ', gs_sc_estim_clf.best_estimator_)
print("Предсказание вероятности ", roc_auc_score(y_valid, gs_sc_estim_clf.best_estimator_.predict_proba(X_valid)[:, 1]))

In [None]:
from sklearn.ensemble import VotingClassifier
vk_clf = VotingClassifier(
    estimators= [
        ('grid_sc_sklearn_clf', grid_sc_sklearn_clf.best_estimator_),
        ('xgb', xgb_grid.best_estimator_),
        ('lgb', lgb_grid.best_estimator_),
        ('gs_sc_estim_clf', gs_sc_estim_clf.best_estimator_),
        ('cbc', cbc_grid.best_estimator_)
    ],
    voting='soft')

parameters_vk = {
    'n_jobs': [-1]
}
gs_vk_clf = GridSearchCV(estimator=vk_clf, param_grid=parameters_vk, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)
gs_vk_clf.fit(X_train, y_train)


# submission = pd.DataFrame(gs_vk_clf.best_estimator_.predict_proba(X_predict)[:, 1], columns=["Churn"])
# submission.to_csv(PATH.format('my_submission_voiting.csv'), columns=["Churn"])
print('VotingClassifier - best estimator: ', gs_vk_clf.best_estimator_)
print('VotingClassifier - best params: ', gs_vk_clf.best_params_)
print("Предсказание ", roc_auc_score(y_valid, gs_vk_clf.best_estimator_.predict(X_valid)))
print("Предсказание вероятности ", roc_auc_score(y_valid, gs_vk_clf.best_estimator_.predict_proba(X_valid)[:, 1]))

In [None]:
submission = pd.DataFrame(gs_vk_clf.best_estimator_.predict_proba(X_predict)[:, 1], columns=["Churn"])
submission.to_csv(PATH.format('my_submission_voiting.csv'), columns=["Churn"])

In [None]:
import time
import pandas as pd
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier

t = time.time()
PATH = r'C:\Users\user\PycharmProjects\DeepLearningSchool\8.Algorithm_composition\{}'

##############################################################################################
# from google.colab import drive
# drive.mount('/content/drive')
# Читаем данные
data = pd.read_csv(PATH.format('train.csv'))
print(f"Количество строк {data.shape[0]}, количество признаков {data.shape[1]}")
y = data['Churn']
data.drop(['Churn'], inplace=True, axis=True)
data['TotalSpent'] = data['TotalSpent'].replace(' ', '0')
data['TotalSpent'] = data['TotalSpent'].astype(float)
data['TotalSpent'] = data['TotalSpent'].replace(0, data['TotalSpent'].mean())
X_train, X_valid, y_train, y_valid = train_test_split(
    data, y, stratify=y, random_state=42, test_size=0.2
)
##############################################################################################


##############################################################################################
data_predict = pd.read_csv(PATH.format('test.csv'))
data_predict['TotalSpent'] = data_predict['TotalSpent'].replace(' ', '0')
data_predict['TotalSpent'] = data_predict['TotalSpent'].astype(float)
data_predict['TotalSpent'] = data_predict['TotalSpent'].replace(0, data['TotalSpent'].mean())
X_predict = data_predict.copy()
##############################################################################################


##############################################################################################
cat_cols = [
    'Sex',
    'IsSeniorCitizen',
    'HasPartner',
    'HasChild',
    'HasPhoneService',
    'HasMultiplePhoneNumbers',
    'HasInternetService',
    'HasOnlineSecurityService',
    'HasOnlineBackup',
    'HasDeviceProtection',
    'HasTechSupportAccess',
    'HasOnlineTV',
    'HasMovieSubscription',
    'HasContractPhone',
    'IsBillingPaperless',
    'PaymentMethod'
]

num_cols = [
    'ClientPeriod',
    'MonthlySpending',
    'TotalSpent'
]

features_labels = []
##############################################################################################


##############################################################################################
### LabelEncoder для категориальных признаков

le = LabelEncoder()
for s in cat_cols:
    le.fit(data[s])
    data[s+'_le'] = le.transform(data[s])
    X_predict[s+'_le'] = le.transform(X_predict[s])
    features_labels.append(s+'_le')
##############################################################################################


##############################################################################################
### OneHotEncoder для категориальных признаков
ohe = OneHotEncoder(handle_unknown='ignore')
for s in cat_cols:
    ohe.fit(data[s].values.reshape(-1, 1))
    
    new_ohe_features = ohe.transform(X_predict[s].values.reshape(-1, 1)).toarray()
    tmp = pd.DataFrame(new_ohe_features, columns=[s+'='+str(i) for i in range(new_ohe_features.shape[1])])
    X_predict = pd.concat([X_predict, tmp], axis=1)
    X_predict.drop([s], axis=1, inplace=True)
    
    new_ohe_features = ohe.transform(data[s].values.reshape(-1, 1)).toarray()
    tmp = pd.DataFrame(new_ohe_features, columns=[s+'='+str(i) for i in range(new_ohe_features.shape[1])])
    data = pd.concat([data, tmp], axis=1)
    data.drop([s], axis=1, inplace=True)
    
    [features_labels.append(s+'='+str(i)) for i in range(new_ohe_features.shape[1])]
##############################################################################################


##############################################################################################
### Нормализация числовых признаков
scaler = StandardScaler()
for i in num_cols:
    scaler.fit(data[i].values.reshape(-1, 1))
    
    X_predict[i+'_ss'] = scaler.transform(X_predict[i].values.reshape(-1, 1))
    X_predict.drop([i], axis=1, inplace=True)
    
    data[i+'_ss'] = scaler.transform(data[i].values.reshape(-1, 1))
    data.drop([i], axis=1, inplace=True)
    
    features_labels.append(i+'_ss') 
##############################################################################################

##############################################################################################
cbc = CatBoostClassifier(iterations=400,
                       depth=4,
                       learning_rate=0.05,
                       loss_function='Logloss',
                       verbose=False,
                       cat_features=cat_cols,
                       random_seed = 4,
                       l2_leaf_reg = 40,
                       eval_metric='AUC'
                       )

parametrs_cbc = {
    'iterations': [160, ],
    "learning_rate": [0.05, ],
    'min_data_in_leaf': [6,],
    'depth': [5,],
    'l2_leaf_reg': [0, ]    
}

cb_grid = GridSearchCV(cbc, parametrs_cbc, cv=5, verbose=4, scoring='roc_auc', refit=True, n_jobs=-1)
cb_grid.fit(X_train, y_train)
# print("CatBoostClassifier")
# print("Предсказание класса ", roc_auc_score(y_valid, cb_grid.best_estimator_.predict(X_valid)))
# print("Предсказание класса через лучшие параметры параметры", roc_auc_score(y_valid, cb_grid.best_estimator_.predict(X_valid)))
# print("Предсказание вероятности ", roc_auc_score(y_valid, cb_grid.predict_proba(X_valid)[:, 1]))
# print("Лучшие параметры ", cb_grid.best_params_)

# submission = pd.DataFrame(cb_grid.best_estimator_.predict_proba(data_predict)[:, 1], columns=["Churn"])
# submission.to_csv(PATH.format('my_submission_catboost.csv'), columns=["Churn"])
##############################################################################################


##############################################################################################
parametrs_lgb = {
    'num_leaves': [5],
   'max_depth': [4],
   'class_weight': ['balanced'], 
   'learning_rate': [0.05],
    'subsample_for_bin': [200],
   'n_estimators': [115]
}


X_train_lgb, X_valid_lgb, y_train_lgb, y_valid_lgb = train_test_split(
    data, y, stratify=y, random_state=42, test_size=0.2
)
clf_lgb = lgb.LGBMClassifier()
lgb_grid = GridSearchCV(clf_lgb, parametrs_lgb, cv=5, verbose=4, scoring='roc_auc', refit=True, n_jobs=-1)
lgb_grid.fit(X_train_lgb, y_train_lgb)
# print("LGBMClassifier")
# print("Предсказание класса через лучшие параметры параметры", 
#       roc_auc_score(y_valid_lgb, lgb_grid.best_estimator_.predict(X_valid_lgb)))

# submission = pd.DataFrame(lgb_grid.best_estimator_.predict_proba(X_predict)[:, 1], columns=["Churn"])
# submission.to_csv(PATH.format('my_submission_lgbm.csv'), columns=["Churn"])
##############################################################################################


##############################################################################################
parameters_mlpc = {
    'activation': ['identity'],
    'solver': ['lbfgs'],
    'max_iter': [110],
    'hidden_layer_sizes': [5]
}

mlpc_clf = MLPClassifier()
gs_mlpc_clf = GridSearchCV(mlpc_clf, parameters_mlpc, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)
gs_mlpc_clf.fit(X_train_lgb, y_train_lgb)
# print('StackingClassifier - best estimator: ', gs_mlpc_clf.best_estimator_)
# print("MLPClassifier - best predict: ", roc_auc_score(y_valid_xgb, gs_mlpc_clf.predict_proba(X_valid_xgb)[:, 1]))
##############################################################################################


##############################################################################################
params_xgb = {
        'min_child_weight': [5],
        'gamma': [5],
        'subsample': [0.1],
        'colsample_bytree': [0.5, 0.55, 0.6],
        'max_depth': [2]
        }
X_train_xgb, X_valid_xgb, y_train_xgb, y_valid_xgb = train_test_split(
    data, y, stratify=y, random_state=42, test_size=0.2
)
clf_xgb = XGBClassifier()
xgb_grid = GridSearchCV(clf_xgb, params_xgb, cv=5, verbose=4, scoring='roc_auc', refit=True, n_jobs=-1)
xgb_grid.fit(X_train_xgb, y_train_xgb)
# print("XGBClassifier")
# print("Предсказание класса через лучшие параметры параметры", roc_auc_score(y_valid_xgb, xgb_grid.best_estimator_.predict(X_valid_xgb)))
# print("Предсказание вероятности ", roc_auc_score(y_valid_xgb, xgb_grid.best_estimator_.predict_proba(X_valid_xgb)[:, 1]))
# print("Лучшие параметры ", xgb_grid.best_params_)

# submission = pd.DataFrame(xgb_grid.best_estimator_.predict_proba(X_predict)[:, 1], columns=["Churn"])
# submission.to_csv(PATH.format('my_submission_xgb.csv'), columns=["Churn"])
##############################################################################################


##############################################################################################
estimators = [
     ('xgb', xgb_grid.best_estimator_),
     ('lgb', lgb_grid.best_estimator_),
     ('mplpc', gs_mlpc_clf.best_estimator_)
#      ('cb', cb_grid.best_estimator_)
]


sc_clf = StackingClassifier(estimators=estimators, final_estimator=MLPClassifier(), cv=5)
parameters_sc_lr = {
    'final_estimator__activation': ['logistic'],
    'final_estimator__solver': ['lbfgs'],
    'final_estimator__hidden_layer_sizes': [100],
    'stack_method': ['predict_proba']
}
gs_sc_clf = GridSearchCV(sc_clf, parameters_sc_lr, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)
gs_sc_clf.fit(X_train_lgb, y_train_lgb)

# End estimate. Writing some info 
submission = pd.DataFrame(gs_sc_clf.best_estimator_.predict_proba(X_predict)[:, 1], columns=["Churn"])
submission.to_csv(PATH.format('my_submission_stack.csv'), columns=["Churn"])
print('StackingClassifier - best estimator: ', gs_sc_clf.best_estimator_)
print('StackingClassifier - best params: ', gs_sc_clf.best_params_)
print("Предсказание вероятности ", roc_auc_score(y_valid_xgb, gs_sc_clf.best_estimator_.predict_proba(X_valid_xgb)[:, 1]))
##############################################################################################


##############################################################################################

vk_clf = VotingClassifier(
    estimators= [
        ('xgb', xgb_grid.best_estimator_),
#         ('lgb', lgb_grid.best_estimator_),
#         ('mplpc', gs_mlpc_clf.best_estimator_)
#         ('cb', cb_grid.best_estimator_)
    ],
    voting='soft')
parameters_vk = {
#     'estimator__voting': ['hard','soft'],
    'n_jobs': [-1]
}
gs_vk_clf = GridSearchCV(estimator=vk_clf, param_grid=parameters_vk, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)
gs_vk_clf.fit(X_train_lgb, y_train_lgb)


submission = pd.DataFrame(gs_vk_clf.best_estimator_.predict_proba(X_predict)[:, 1], columns=["Churn"])
submission.to_csv(PATH.format('my_submission_voiting.csv'), columns=["Churn"])
print('VotingClassifier - best estimator: ', gs_vk_clf.best_estimator_)
print('VotingClassifier - best params: ', gs_vk_clf.best_params_)
print("Предсказание вероятности ", roc_auc_score(y_valid_xgb, gs_vk_clf.best_estimator_.predict_proba(X_valid_xgb)[:, 1]))




##############################################################################################
##############################################################################################
##############################################################################################




from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

##############################################################################################
# LogisticRegression
parameters_lr= {
    'penalty': ['l1'],
    'C': [10],
    'class_weight': [None],
    'max_iter': [1000],
    'solver': ['saga']
}
clf_lr = LogisticRegression()
gs_lr_clf = GridSearchCV(clf_lr, parameters_lr, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)
gs_lr_clf.fit(X_train_lgb, y_train_lgb)
print('LogisticRegression - best estimator: ', gs_sc_clf.best_estimator_)
print("Предсказание вероятности ", roc_auc_score(y_valid_lgb, gs_lr_clf.best_estimator_.predict_proba(X_valid_lgb)[:, 1]))
##############################################################################################


##############################################################################################
# GaussianNB
clf_gnb = GaussianNB()
clf_gnb.fit(X_train_lgb, y_train_lgb)
print('GaussianNB')
print("Предсказание вероятности ", roc_auc_score(y_valid_lgb, clf_gnb.predict_proba(X_valid_lgb)[:, 1]))
##############################################################################################


##############################################################################################
# RandomForestClassifier
parameters_rfc = {
    'n_estimators': [10, 50, 100, 150, 200],
    'criterion': ['gini','entropy'],
    'min_samples_leaf': [1, 5, 10, 15, 20],
    'max_features': ['sqrt', 'log2'],
    'bootstrap':[False],
    'max_samples': [5, 10, 15, 20]
}

clf_rfc = RandomForestClassifier()
gs_rfc_clf = GridSearchCV(clf_lr, parameters_lr, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)
gs_rfc_clf.fit(X_train_lgb, y_train_lgb)

print('RandomForestClassifier - best estimator: ', gs_sc_clf.best_estimator_)
print("Предсказание вероятности ", roc_auc_score(y_valid_lgb, gs_rfc_clf.best_estimator_.predict_proba(X_valid_lgb)[:, 1]))
##############################################################################################


estimators = [
     ('lr', gs_lr_clf.best_estimator_),
    ('xgb', xgb_grid.best_estimator_),
     ('lgb', lgb_grid.best_estimator_),
#      ('gnb', clf_gnb),
     ('rfc', gs_rfc_clf.best_estimator_)
]


sc_clf = StackingClassifier(estimators=estimators, final_estimator=MLPClassifier(), cv=5)
parameters_sc_lr = {
#     'final_estimator__activation': ['logistic'],
#     'final_estimator__solver': ['lbfgs'],
#     'final_estimator__hidden_layer_sizes': [100],
    'stack_method': ['predict_proba']
}
gs_sc_clf = GridSearchCV(sc_clf, parameters_sc_lr, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)
gs_sc_clf.fit(X_train_lgb, y_train_lgb)
submission = pd.DataFrame(gs_sc_clf.best_estimator_.predict_proba(X_predict)[:, 1], columns=["Churn"])
submission.to_csv(PATH.format('my_submission_stack.csv'), columns=["Churn"])
print('StackingClassifier - best estimator: ', gs_sc_clf.best_estimator_)
print('StackingClassifier - best params: ', gs_sc_clf.best_params_)
print("Предсказание вероятности ", roc_auc_score(y_valid_xgb, gs_sc_clf.best_estimator_.predict_proba(X_valid_xgb)[:, 1]))
print(f'How long: {"%.2f" % (time.time()-t)}, s')