# LightGBM

In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [120]:
# Load the labelled training data and the unlabelled test data, discarding the
# 'index' column from both so it is not part of the modelling frames (the
# submission cell re-reads it from the test CSV).
# Optional further drops, currently disabled for both frames:
# 'fnlwgt', 'race', 'native-country'.
train = pd.read_csv('data/train2.csv').drop(columns='index')
test = pd.read_csv('data/test2.csv').drop(columns='index')

train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,native-country,Y
0,21,3,132652,13,10,0,1,3,2,0,2,0
1,29,3,132652,10,9,0,1,1,2,0,2,0
2,19,3,132652,9,13,2,8,1,2,0,2,0
3,17,3,132652,10,9,2,2,3,2,1,2,0
4,47,3,132652,13,10,1,2,0,2,1,2,0


In [125]:
# Full training set: every column except the last is a feature, and the last
# column is the target.
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]

In [126]:
from sklearn.model_selection import train_test_split

# Hold out part of the labelled data for local validation. The seed is fixed
# so the split is repeatable; the split proportion is scikit-learn's default.
train2, test2 = train_test_split(train, random_state=0)

# For each partition: features are all columns but the last, target is the last.
X_train2, y_train2 = train2.iloc[:, :-1], train2.iloc[:, -1]
X_test2, y_test2 = test2.iloc[:, :-1], test2.iloc[:, -1]

X_train2.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,native-country
3464,31,3,132652,9,13,2,4,3,2,0,2
3431,21,0,132652,13,10,2,0,3,2,0,2
6547,29,3,132652,5,4,0,11,1,2,0,2
9322,23,3,132652,13,10,1,3,0,1,1,2
4428,21,1,132652,13,10,2,7,1,1,1,2


In [127]:
# Columns that LightGBM should treat as categorical.
categorical_features = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

# Positional indices of those columns within `train`. The feature frames are
# built by dropping only the last column of `train`, so the same positions
# apply to them.
cat_columns = [
    position
    for position, column in enumerate(train.columns)
    if column in categorical_features
]
print(cat_columns)

[1, 3, 5, 6, 7, 8, 9, 10]


In [123]:
import lightgbm as lgb
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Baseline binary classifier with library defaults. `cat_column` passes the
# positional indices of the categorical features through to LightGBM.
params = {'objective': 'binary'}
gbm = lgb.LGBMClassifier(**params, cat_column=cat_columns)
gbm.fit(X_train2, y_train2)

# NOTE(review): no early stopping is configured in this cell, so
# best_iteration_ is presumably unset and prediction uses all fitted trees.
y_pred = gbm.predict(X_test2, num_iteration=gbm.best_iteration_)

# Build the confusion matrix on the hold-out split.
matrix6 = confusion_matrix(y_test2, y_pred)
print("混同行列:\n{}".format(matrix6))

# Summary metrics on the hold-out split. The last line reports the F1 score;
# it was previously mislabelled as "Accuracy".
print("正解率(Accuracy):{:.3f}".format(accuracy_score(y_test2, y_pred)))
print("適合率(Precision):{:.3f}".format(precision_score(y_test2, y_pred)))
print("再現率(Recall):{:.3f}".format(recall_score(y_test2, y_pred)))
print("F1値(F1 score):{:.3f}".format(f1_score(y_test2, y_pred)))

# Per-feature importances of the fitted model, most important first.
importance = pd.DataFrame(
    gbm.feature_importances_,
    index=X_test2.columns,
    columns=["importance"],
)
display(importance.sort_values("importance", ascending=False))


混同行列:
[[2012  200]
 [ 267  496]]
正解率(Accuracy):0.843
適合率(Precision):0.713
再現率(Recall):0.650
F1値(Accuracy):0.680


Unnamed: 0,importance
age,1376
education-num,635
occupation,282
sex,141
education,121
race,121
workclass,112
marital-status,77
relationship,73
fnlwgt,62


In [128]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

# Candidate numbers of boosting rounds: 1 through 99.
params = {'n_estimators': range(1, 100)}

# Same binary LightGBM classifier as the baseline, tuned with 5-fold
# cross-validation on the full labelled training set.
base_estimator = lgb.LGBMClassifier(objective='binary', cat_column=cat_columns)
grid = GridSearchCV(estimator=base_estimator, param_grid=params, cv=5)
grid.fit(X_train, y_train)

print(f"Best parameters : {grid.best_params_}")
print(f"Best cross-validation score : {grid.best_score_:.3f}")

# Predict labels for the unlabelled test set with the best estimator found.
y_pred = grid.best_estimator_.predict(test)

Best parameters : {'n_estimators': 76}
Best cross-validation score : 0.841


In [112]:
# Write the submission file: one row per test example, holding the original
# 'index' identifier followed by the predicted label, with no header row.
# The test CSV is re-read because the 'index' column was dropped from `test`.
_test = pd.read_csv('data/test2.csv')
df_result = pd.concat([_test['index'], pd.DataFrame(y_pred)], axis=1)

# index=False: the identifier is already an explicit column, so the frame's
# own row index must not be written as an extra leading column.
# (The original `index=,` was a syntax error.)
df_result.to_csv('data/submit.csv', index=False, header=False)