In [None]:
import datetime
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.ensemble import RandomForestClassifier as RF
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image


In [None]:
# Load the labelled and unlabelled feature tables (first CSV column becomes the index)
# and the header-less submission template.
train, test = (pd.read_csv(csv_path, index_col=0) for csv_path in ('train.csv', 'test.csv'))
sample = pd.read_csv('submit_sample.csv', header=None)

In [None]:
# Display the training frame loaded from train.csv for a first visual check.
train

In [None]:
# Render an exploratory profile report of the training frame.
# NOTE(review): `pandas_profiling` is never referenced by name below; presumably the import
# is what makes `DataFrame.profile_report` available — confirm before moving or removing it.
import pandas_profiling
train.profile_report()

In [None]:
# Per-model prediction arrays are collected here, keyed by a short model name.
pred_list = dict()
# Wall-clock time of this run (printed later as a compact timestamp).
dt_now = datetime.datetime.now()

In [None]:
# Give the unlabelled rows a sentinel target (-999) so they can be separated again
# after train and test have been stacked into one frame.
test = test.assign(y=-999)

In [None]:
# Stack train and test row-wise so both pass through the same encoding step.
all_df = pd.concat((train, test), axis='index')
all_df

In [None]:
# Fit a one-hot encoder on the combined train+test frame (the frame still contains the
# `y` column, including the -999 sentinel rows).
# No `cols` argument is given, so the encoder decides which columns to encode —
# presumably the non-numeric ones; verify against the encoded output.
# NOTE(review): `handle_unknown='impute'` may only be accepted by older
# category_encoders releases — confirm against the installed version.
import category_encoders as ce
ce_ohe = ce.OneHotEncoder(handle_unknown='impute')
ce_ohe.fit(all_df)

In [None]:
# Replace the raw combined frame with its one-hot encoded version.
# NOTE(review): `all_df` is rebound to a transform of itself, so re-running this cell
# feeds already-encoded data back into the encoder — execute it once per kernel session.
all_df = ce_ohe.transform(all_df)

In [None]:
# Display the encoded combined frame to inspect the generated columns.
all_df

In [None]:
# Recover the labelled rows (everything that does not carry the -999 sentinel) and
# separate features from the target.
labelled_rows = all_df.loc[all_df['y'] != -999]
X = labelled_rows.drop(columns='y')
y = labelled_rows['y']
X.shape, y.shape

In [None]:
# Search grid passed to the LightGBM grid search (a single candidate).
parameters = dict(
    learning_rate=[0.05],
    max_depth=[8],
    random_state=[0],
)

# Search grid passed to the XGBoost grid search (a single candidate).
parameters2 = dict(
    eta=[0.1],
    max_depth=[5],
    random_state=[0],
)

# parameters3 = {
#     'max_depth': list(range(2, 11)),
#     'random_state': [0],
# }

# parameters = {
#     'max_depth': list(range(2, 11)),
#     'min_samples_leaf': [5,10,20,50,100,500],
#     'objective' : ['binary', 'cross_entropy'],
#     'learning_rate': [0.005, 0.05, 0.5, 0.1],
#     'random_state': [0],
#     'n_estimators': [1000],
#     'n_estimators': [40, 100, 1000],
#     'num_leaves': [6,8,12,16],
#     'early_stopping_round': [10],
# }

## LightGBM

In [None]:
# 5-fold grid search over `parameters` for LightGBM, scored by ROC AUC on all cores;
# train-fold scores are kept so they can be compared with validation scores.
lgb = LGBMClassifier()
gcv = GridSearchCV(
    estimator=lgb,
    param_grid=parameters,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
gcv.fit(X, y)


In [None]:
# Parameter combination selected by the LightGBM grid search.
gcv.best_params_

In [None]:
# Raw cross-validation results of the LightGBM grid search.
gcv.cv_results_

In [None]:
# Mean train-fold and validation-fold ROC AUC per LightGBM grid candidate.
lgb_cv = gcv.cv_results_
train_score, test_score = lgb_cv['mean_train_score'], lgb_cv['mean_test_score']
for mean_scores in (train_score, test_score):
    print(mean_scores)

In [None]:
# Compare mean train vs. validation ROC AUC for each grid candidate currently held in
# `train_score` / `test_score` (the LightGBM search when the notebook runs top to bottom).
# Tick positions are derived from the number of candidates instead of being hard-coded,
# so the number of ticks and labels always agree; markers keep a one-candidate grid visible.
candidate_idx = list(range(len(train_score)))
plt.plot(candidate_idx, train_score, marker='o', label='train')
plt.plot(candidate_idx, test_score, marker='o', label='validation')
plt.xticks(candidate_idx)
plt.xlabel('grid candidate (index into cv_results_)')
plt.ylabel('mean ROC AUC')
plt.legend()

In [None]:
# LightGBM probabilities for the training rows; column 1 of predict_proba is kept.
lgb_train_proba = gcv.predict_proba(X)
y_pred = lgb_train_proba[:, 1]
y_pred

In [None]:
# ROC AUC of the LightGBM probabilities against the labels.
# NOTE: X / y are the same rows that were passed to gcv.fit, so this is an in-sample score.
roc_auc_score(y, y_pred)
# 0.9113408811229391  (presumably the value observed in an earlier run)

In [None]:
# Show the run timestamp as YYYYmmddHHMM.
print(f"{dt_now:%Y%m%d%H%M}")

In [None]:
# Keep the LightGBM training-row probabilities under 'lgb'; `pred_list` is later turned
# into the feature table for the stacked random forest.
pred_list['lgb'] = y_pred

## XGBoost

In [None]:
# 5-fold grid search over `parameters2` for XGBoost, scored by ROC AUC on all cores;
# train-fold scores are kept so they can be compared with validation scores.
xgb = XGBClassifier()
gcv2 = GridSearchCV(
    estimator=xgb,
    param_grid=parameters2,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
gcv2.fit(X, y)


In [None]:
# Parameter combination selected by the XGBoost grid search.
gcv2.best_params_

In [None]:
# Mean train-fold and validation-fold ROC AUC per XGBoost grid candidate.
xgb_cv = gcv2.cv_results_
train_score, test_score = xgb_cv['mean_train_score'], xgb_cv['mean_test_score']
for mean_scores in (train_score, test_score):
    print(mean_scores)

In [None]:
# Compare mean train vs. validation ROC AUC for each grid candidate currently held in
# `train_score` / `test_score` (the XGBoost search when the notebook runs top to bottom).
# Tick positions are derived from the number of candidates instead of being hard-coded,
# so the number of ticks and labels always agree; markers keep a one-candidate grid visible.
candidate_idx = list(range(len(train_score)))
plt.plot(candidate_idx, train_score, marker='o', label='train')
plt.plot(candidate_idx, test_score, marker='o', label='validation')
plt.xticks(candidate_idx)
plt.xlabel('grid candidate (index into cv_results_)')
plt.ylabel('mean ROC AUC')
plt.legend()

In [None]:
# XGBoost probabilities for the training rows; column 1 of predict_proba is kept.
xgb_train_proba = gcv2.predict_proba(X)
y_pred2 = xgb_train_proba[:, 1]
y_pred2

In [None]:
# ROC AUC of the XGBoost probabilities against the labels.
# NOTE: X / y are the same rows that were passed to gcv2.fit, so this is an in-sample score.
roc_auc_score(y, y_pred2)
# 0.9113408811229391
# NOTE(review): the recorded value above is identical to the one noted for LightGBM —
# likely a copy-paste leftover; re-run and update.

In [None]:
# Keep the XGBoost training-row probabilities under 'xgb' for the stacked model.
pred_list['xgb'] = y_pred2

In [None]:
# First 1000 training rows: true labels vs. LightGBM probabilities, with a 0.5 reference line.
fig, ax = plt.subplots(figsize=(20, 4))
first_rows = y.index[:1000]
ax.scatter(first_rows, y[:1000], alpha=1, label='y')
ax.scatter(first_rows, y_pred[:1000], alpha=1, label='lgb')
ax.axhline(0.5)
ax.legend()

In [None]:
# First 1000 training rows: true labels vs. XGBoost probabilities, with a 0.5 reference line.
fig, ax = plt.subplots(figsize=(20, 4))
first_rows = y.index[:1000]
ax.scatter(first_rows, y[:1000], alpha=1, label='y')
ax.scatter(first_rows, y_pred2[:1000], alpha=1, label='xgb')
ax.axhline(0.5)
ax.legend()

## SVM (support vector machine)

In [None]:
# Search grid passed to the SVM grid search: RBF kernel, 2 gamma values x 4 C values.
parameters3 = dict(
    kernel=['rbf'],
    gamma=[1e-3, 1e-4],
    C=[1, 10, 100, 1000],
    random_state=[0],
)


In [None]:
# 5-fold grid search over `parameters3` for an SVM, scored by ROC AUC on all cores.
# probability=True is required because later cells call gcv3.predict_proba(...);
# an SVC built without it does not provide probability estimates.
svm = SVC(probability=True)

gcv3 = GridSearchCV(svm, parameters3, cv=5, scoring="roc_auc", n_jobs=-1, return_train_score=True)
gcv3.fit(X, y)


In [None]:
# Parameter combination selected by the SVM grid search.
gcv3.best_params_

In [None]:
# Mean train-fold and validation-fold ROC AUC per SVM grid candidate.
svm_cv = gcv3.cv_results_
train_score, test_score = svm_cv['mean_train_score'], svm_cv['mean_test_score']
for mean_scores in (train_score, test_score):
    print(mean_scores)

In [None]:
# Compare mean train vs. validation ROC AUC for each grid candidate currently held in
# `train_score` / `test_score` (the SVM search when the notebook runs top to bottom).
# Tick positions are derived from the number of candidates instead of being hard-coded,
# so the number of ticks and labels always agree.
candidate_idx = list(range(len(train_score)))
plt.plot(candidate_idx, train_score, marker='o', label='train')
plt.plot(candidate_idx, test_score, marker='o', label='validation')
plt.xticks(candidate_idx)
plt.xlabel('grid candidate (index into cv_results_)')
plt.ylabel('mean ROC AUC')
plt.legend()

In [None]:
# SVM probabilities for the training rows; column 1 of predict_proba is kept.
svm_train_proba = gcv3.predict_proba(X)
y_pred3 = svm_train_proba[:, 1]
y_pred3

In [None]:
# ROC AUC of the SVM probabilities against the labels.
# NOTE: X / y are the same rows that were passed to gcv3.fit, so this is an in-sample score.
roc_auc_score(y, y_pred3)
# (no score has been recorded for this model yet)

In [None]:
# Keep the SVM training-row probabilities under 'svm', then display everything collected
# so far for the stacked model.
pred_list['svm'] = y_pred3
pred_list

## Random Forest

In [None]:
# Search grid passed to the random-forest stacker's grid search.
# (An "n_estimators": [50, 100, 200, 300, 400, 500] axis was tried and is currently disabled.)
parameters9 = dict(
    max_depth=[2, 3, None],
    min_samples_split=[2, 3, 10],
    min_samples_leaf=[1, 3, 10],
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
)

In [None]:
# Turn the collected per-model predictions into a table with one column per model;
# this table is the feature matrix for the stacked random forest.
# NOTE(review): this rebinds `pred_list` from a dict to a DataFrame, so earlier cells that
# assign into it behave differently if they are re-run after this one.
pred_list = pd.DataFrame(pred_list)

In [None]:
# Stacking model: a random forest trained on the base models' predictions (`pred_list`),
# tuned with a 5-fold grid search over `parameters9` and scored by ROC AUC.
# The forest is seeded (the grid itself carries no random_state) so repeated runs give
# the same result, matching the seeded grids used for the other models.
# NOTE(review): `pred_list` holds predictions the base models made on the same rows they
# were fitted on, so the stacker's scores are likely optimistic.
rf = RF(random_state=0)

gcv9 = GridSearchCV(rf, parameters9, cv=5, scoring="roc_auc", n_jobs=-1, return_train_score=True)
gcv9.fit(pred_list, y)


In [None]:
#rf.fit(pred_list, y)

In [None]:
# Stacked random-forest probabilities for the training rows; column 1 of predict_proba is kept.
rf_train_proba = gcv9.predict_proba(pred_list)
y_pred9 = rf_train_proba[:, 1]
y_pred9

In [None]:
# First 1000 training rows: true labels vs. the stacked random-forest probabilities.
# The series labelled 'rf' must be y_pred9 (the forest's output), not the SVM's y_pred3.
plt.figure(figsize=(20, 4))
plt.scatter(y.index[:1000], y[:1000], alpha=1, label='y')
plt.scatter(y.index[:1000], y_pred9[:1000], alpha=1, label='rf')
plt.axhline(0.5)
plt.legend()

In [None]:
# ROC AUC of the stacked random-forest probabilities against the labels (in-sample:
# the forest was fitted on the same rows).
roc_auc_score(y, y_pred9)
# 0.9113408811229391
# NOTE(review): the recorded value above is identical to the one noted for LightGBM —
# likely a copy-paste leftover; re-run and update.

# Test Data Predict

In [None]:
# Unlabelled rows are the ones carrying the -999 sentinel; drop the placeholder target.
is_test_row = all_df['y'] == -999
X1 = all_df.loc[is_test_row].drop(columns='y')
# Per-model test-set predictions are collected here.
all_predict = dict()

## LightGBM

In [None]:
# LightGBM probabilities (column 1) for the unlabelled rows.
lgb_test_proba = gcv.predict_proba(X1)[:, 1]
all_predict['pred_x1'] = lgb_test_proba
lgb_test_proba

## XGBoost

In [None]:
# XGBoost probabilities (column 1) for the unlabelled rows.
xgb_test_proba = gcv2.predict_proba(X1)[:, 1]
all_predict['pred_x2'] = xgb_test_proba
xgb_test_proba

## Random Forest

In [None]:
# Preview the test-set predictions collected so far, one column per base model.
pd.DataFrame(all_predict)

In [None]:
# Final test-set prediction from the stacked random forest (gcv9).
# gcv9 was fitted on a table with the columns 'lgb', 'xgb', 'svm' (in that order), so the
# test-time feature table must provide the same three columns under the same names.
# gcv3 (the SVM) is a base model fitted on the raw features and therefore only supplies
# the 'svm' column here; it cannot consume the stacked predictions itself.
stack_test = pd.DataFrame({
    'lgb': all_predict['pred_x1'],
    'xgb': all_predict['pred_x2'],
    'svm': gcv3.predict_proba(X1)[:, 1],
})
pred_x3 = gcv9.predict_proba(stack_test)
pred_x3[:, 1]

In [None]:
# Write the second probability column (index 1) of the final prediction into column 1
# of the header-less submission template.
sample[1] = pred_x3[:,1]

In [None]:
# Save the submission file without a header row and without the index column.
# NOTE(review): the file name is hard-coded; `dt_now` is available if a timestamped name is wanted.
sample.to_csv('submit_20200822_1.csv', header=None, index=None)