In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import glob
import os
import math
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn import preprocessing
# Let pandas display up to 200 columns before truncating wide frames.
pd.options.display.max_columns = 200

In [None]:
# Load the training data from the notebook's working directory.
train_csv_path = 'train.csv'
train_df = pd.read_csv(train_csv_path)

# 基礎確認

In [None]:
# Preview the first five rows of the training data.
train_preview = train_df.head(n=5)
train_preview

In [None]:
# Summary statistics using pandas' default column selection.
train_summary = train_df.describe()
train_summary

In [None]:
# (rows, columns) of the training frame.
n_train_rows, n_train_cols = train_df.shape
(n_train_rows, n_train_cols)

In [None]:
# The default describe() was already shown in an earlier cell and only covers
# numeric columns, so summarise the remaining (non-numeric) columns here:
# count, number of unique values, most frequent value and its frequency.
train_df.describe(exclude=[np.number])

In [None]:
# Frequency of each workclass value in the training data.
workclass_counts = train_df["workclass"].value_counts()
workclass_counts

In [None]:
# Frequency of each fnlwgt value in the training data.
fnlwgt_counts = train_df["fnlwgt"].value_counts()
fnlwgt_counts

In [None]:
# Re-type fnlwgt as strings, then tabulate the values again.
# NOTE(review): with fnlwgt stored as strings, the later pd.get_dummies call
# will presumably create one indicator column per distinct fnlwgt value --
# confirm that this is the intended encoding.
train_df = train_df.astype({'fnlwgt': str})
train_df['fnlwgt'].value_counts()

In [None]:
# Frequency of each education value in the training data.
education_counts = train_df["education"].value_counts()
education_counts

In [None]:
# Frequency of each marital-status value in the training data.
marital_status_counts = train_df["marital-status"].value_counts()
marital_status_counts

In [None]:
# Frequency of each occupation value in the training data.
occupation_counts = train_df["occupation"].value_counts()
occupation_counts

In [None]:
# Frequency of each relationship value in the training data.
relationship_counts = train_df["relationship"].value_counts()
relationship_counts

In [None]:
# Frequency of each race value in the training data.
race_counts = train_df["race"].value_counts()
race_counts

In [None]:
# Frequency of each native-country value in the training data.
native_country_counts = train_df["native-country"].value_counts()
native_country_counts

# 前処理

In [None]:
# Training frame dimensions at the start of preprocessing.
train_dims = train_df.shape
train_dims

In [None]:
# Store fnlwgt as strings in the training frame.
train_df = train_df.astype({'fnlwgt': str})

In [None]:
# Column dtypes of the training frame after the fnlwgt cast.
train_dtypes = train_df.dtypes
train_dtypes

In [None]:
# Load the test data and store fnlwgt as strings, as is done for train_df.
test_df = pd.read_csv('test.csv').astype({'fnlwgt': str})

In [None]:
# Column dtypes of the test frame.
test_dtypes = test_df.dtypes
test_dtypes

In [None]:
# Preview the first five rows of the test data.
test_preview = test_df.head(n=5)
test_preview

In [None]:
# (rows, columns) of the test frame.
test_dims = test_df.shape
test_dims

In [None]:
# Stack the training features on top of the test rows so that both receive
# exactly the same encoding downstream.
# The label column is dropped by name rather than by a hard-coded positional
# slice, so the cell does not depend on 'Y' sitting at a particular position.
# The original row labels are kept (no ignore_index) because the label column
# is later re-attached to the training rows by index alignment.
train_features_df = train_df.drop(columns=['Y'])
concat_df = pd.concat([train_features_df, test_df])
concat_df.head()

In [None]:
# Column dtypes of the stacked train + test frame.
concat_dtypes = concat_df.dtypes
concat_dtypes

In [None]:
# Inspect the fnlwgt column of the stacked frame.
stacked_fnlwgt = concat_df['fnlwgt']
stacked_fnlwgt

In [None]:
# Print every distinct fnlwgt value wrapped in colons, which makes any leading
# or trailing whitespace in the values visible.
unique_fnlwgt = concat_df['fnlwgt'].unique()
for fnlwgt_value in unique_fnlwgt:
    print(f':{fnlwgt_value}:')

In [None]:
# Dummy-encode the stacked frame with pandas' default column selection;
# train and test rows are encoded together and so share the same columns.
concat_df = pd.get_dummies(data=concat_df)

In [None]:
# The training rows were stacked first, so they are the leading len(train_df)
# rows of concat_df. Deriving the boundary from train_df avoids a hard-coded
# row count that would silently mis-split if the input file changes.
# .copy() makes train_rev_df an independent frame, so adding columns to it
# later does not operate on a slice of concat_df.
n_train = len(train_df)
train_rev_df = concat_df.iloc[:n_train, :].copy()
train_rev_df.head()

In [None]:
# Dimensions of the encoded training frame.
encoded_train_dims = train_rev_df.shape
encoded_train_dims

In [None]:
# Re-attach the label to the encoded training rows. The values are aligned on
# the row index, which train_rev_df carries over from train_df.
# assign() returns a new frame instead of writing into train_rev_df in place;
# train_rev_df was sliced out of concat_df, and in-place column assignment on
# such a slice triggers pandas' SettingWithCopyWarning.
# When 'Y' is new it is appended as the last column, which the later
# positional selections (iloc[:, -1]) rely on.
train_rev_df = train_rev_df.assign(Y=train_df['Y'])
train_rev_df.head()

In [None]:
# Everything after the leading len(train_df) training rows of concat_df is the
# encoded test set. The boundary is derived from train_df rather than from a
# hard-coded row count.
test_rev_df = concat_df.iloc[len(train_df):, :]
test_rev_df.head()

In [None]:
# Dimensions of the encoded training frame, now including the label column.
labelled_train_dims = train_rev_df.shape
labelled_train_dims

In [None]:
# Class 0 is somewhat more frequent, so the class ratio will need adjusting.
label_counts = train_rev_df['Y'].value_counts()
label_counts

In [None]:
# Preview every column except the last one (the label).
feature_preview = train_rev_df.iloc[:, :-1].head()
feature_preview

# 学習

In [None]:
import lightgbm
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, precision_score, recall_score

In [None]:
# Passing the labels as the `stratify` argument of train_test_split makes the
# hold-out split stratified by class.
# Features: every column except the first and the last; target: the last column.
feature_frame = train_rev_df.iloc[:, 1:-1]
target_series = train_rev_df.iloc[:, -1]
x_train, x_val, y_train, y_val = train_test_split(
    feature_frame,
    target_series,
    test_size=0.1,
    random_state=42,
    stratify=target_series,
)

In [None]:
# Report the shapes of the four pieces produced by the split.
split_shapes_msg = f'x_train:shape: {x_train.shape}, x_val:shape: {x_val.shape}, y_train:{y_train.shape}, y_val:{y_val.shape}'
print(split_shapes_msg)

In [None]:
from sklearn.model_selection import StratifiedKFold
# 10-fold stratified cross-validation splitter: shuffled, with a fixed seed.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

In [None]:
# LightGBM classifier created with no explicit hyperparameters; it is the
# estimator handed to the grid search below.
lgb_clf = lightgbm.LGBMClassifier()


In [None]:
# Grid search over LightGBM hyperparameters.
from sklearn.model_selection import GridSearchCV

# Earlier, coarser grid (kept for reference only):
#   max_depth:     [3, 5, 10, 25, 50]
#   learning_rate: [0.05, 0.075, 0.1, 0.125, 0.15, 0.2, 0.3]
#   num_leaves:    [100, 300, 600, 900]
#   n_estimators:  [100, 200, 400]

# Current grid. learning_rate previously started at 0.7, which is out of
# sequence with the other candidates (0.085 ... 0.12) and looks like a typo
# for 0.07.
# Size: 5 * 5 * 6 * 5 = 750 parameter combinations, each evaluated on the
# 10 folds of `skf`, i.e. 7,500 fits.
param_grid = {"max_depth": [2, 3, 4, 5, 6],
              "learning_rate": [0.07, 0.085, 0.1, 0.11, 0.12],
              "num_leaves": [15, 20, 25, 30, 35, 50],
              "n_estimators": [350, 375, 400, 425, 450]
             }

# Candidates are ranked by balanced accuracy under the stratified K-fold
# splitter; n_jobs=-1 runs the fits on all available cores, and training-fold
# scores are recorded as well (return_train_score=True).
grid_result = GridSearchCV(estimator=lgb_clf,
                           param_grid=param_grid,
                           scoring='balanced_accuracy',
                           cv=skf,
                           verbose=3,
                           return_train_score=True,
                           n_jobs=-1)

In [None]:
# Run the grid search on the training split.
grid_result.fit(X=x_train, y=y_train)

In [None]:
import pprint
# Show the hyperparameter combination selected by the grid search.
best_params = grid_result.best_params_
pprint.pprint(best_params)

In [None]:
# Predict the hold-out split through the fitted grid search and report the
# plain accuracy of those predictions.
y_val_pred = grid_result.predict(x_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
val_accuracy

In [None]:
# Dimensions of the encoded test frame.
encoded_test_dims = test_rev_df.shape
encoded_test_dims

In [None]:
# Predict the test rows; the first column is skipped, as it is for training.
test_features = test_rev_df.iloc[:, 1:]
y_pred = grid_result.predict(test_features)

In [None]:
#y_pred = lgb_clf.predict(test_rev_df.iloc[:, 1:])
#y_pred

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the hold-out split.
val_report = classification_report(y_true=y_val, y_pred=y_val_pred)
print(val_report)

In [None]:
# Load the header-less sample submission; its columns get integer labels.
sample_submit_path = 'sample_submit.csv'
submit_df = pd.read_csv(sample_submit_path, header=None)
submit_df.shape

In [None]:
# Add the test predictions as a new column labelled '2'.
submit_df = submit_df.assign(**{'2': y_pred})
submit_df.head()

In [None]:
# Drop the original second column of the sample submission so that only the
# first column and the predictions remain.
# The file was read with header=None, so that column carries the integer
# label 1. Dropping by label with errors='ignore' keeps the cell idempotent:
# a positional drop would remove the prediction column ('2') if the cell were
# run a second time.
submit_df = submit_df.drop(columns=[1], errors='ignore')

In [None]:
# Write the grid-search submission as comma-separated values with neither a
# header row nor an index column.
submit_df.to_csv("submit_gridCV_k4.csv", header=False, index=False)

# サンプリング数の変更

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Size of the training split before resampling.
x_train_dims = x_train.shape
x_train_dims

In [None]:
# Oversample the training split with SMOTE.
# - fit_resample is the supported imbalanced-learn method; the older
#   fit_sample alias is no longer available in current releases.
# - random_state is fixed so the synthetic samples, and everything trained on
#   them, are reproducible between runs.
# Only the training split is resampled; the hold-out split stays untouched.
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

In [None]:
# Shapes of the resampled features and labels.
resampled_dims = (x_train_resampled.shape, y_train_resampled.shape)
resampled_dims

In [None]:
# Class counts after resampling.
resampled_label_counts = y_train_resampled.value_counts()
resampled_label_counts

In [None]:
# Train a LightGBM classifier (max_depth=20) on the resampled training data
# and report its accuracy on the hold-out split.
lgb_clf = lightgbm.LGBMClassifier(max_depth=20).fit(x_train_resampled, y_train_resampled)
y_pred = lgb_clf.predict(x_val)
accuracy_score(y_true=y_val, y_pred=y_pred)

In [None]:
# Predict the test rows with the classifier trained on the resampled data;
# the first column is skipped, as it is for training.
test_features = test_rev_df.iloc[:, 1:]
y_pred = lgb_clf.predict(test_features)
y_pred

In [None]:
# Build the submission: load the header-less sample file, add the predictions
# as column '2', then drop the column at position 1.
submit_df = (
    pd.read_csv('sample_submit.csv', header=None)
    .assign(**{'2': y_pred})
)
submit_df = submit_df.drop(columns=submit_df.columns[[1]])

In [None]:
# Write the SMOTE-based submission as comma-separated values with neither a
# header row nor an index column.
submit_df.to_csv("submit_smote.csv", header=False, index=False)