# LightGBM

In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [106]:
# Columns that are removed from both the training and the test file before modelling.
_dropped_columns = ['index', 'fnlwgt', 'race', 'native-country']

# Read each CSV and strip the unused columns in a single step.
train = pd.read_csv('data/train2.csv').drop(columns=_dropped_columns)
test = pd.read_csv('data/test2.csv').drop(columns=_dropped_columns)

train.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,sex,Y
0,21,3,13,10,0,1,3,0,0
1,29,3,10,9,0,1,1,0,0
2,19,3,9,13,2,8,1,0,0
3,17,3,10,9,2,2,3,1,0
4,47,3,13,10,1,2,0,1,0


In [107]:
# Split the full training frame positionally: every column except the last
# becomes the feature matrix, the last column becomes the target.
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]

In [108]:
from sklearn.model_selection import train_test_split

# Carve a hold-out set out of the labelled data (scikit-learn's default split
# size, fixed seed so the split is repeatable).
train2, test2 = train_test_split(train, random_state=0)

# In both partitions the last column is the target and the rest are features.
X_train2, y_train2 = train2.iloc[:, :-1], train2.iloc[:, -1]
X_test2, y_test2 = test2.iloc[:, :-1], test2.iloc[:, -1]

X_train2.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,sex
3464,31,3,9,13,2,4,3,0
3431,21,0,13,10,2,0,3,0
6547,29,3,5,4,0,11,1,0
9322,23,3,13,10,1,3,0,1
4428,21,1,13,10,2,7,1,1


In [109]:
# Names of the columns that hold categorical codes. Names that are absent from
# `train` simply contribute no position to the result.
categorical_features = ["workclass","education","marital-status","occupation","relationship","race","sex","native-country"]

# Positional indices (within train.columns) of the categorical columns.
cat_columns = [
    position
    for position, column in enumerate(train.columns)
    if column in categorical_features
]
print(cat_columns)

[1, 2, 4, 5, 6, 7]


In [119]:
import lightgbm as lgb
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score

# Train a binary LightGBM classifier on the training split, evaluate it on the
# hold-out split, and show the per-feature importances.
params = {'objective': 'binary'}

gbm = lgb.LGBMClassifier(**params)

# Declare the categorical columns through fit()'s own `categorical_feature`
# argument. Passing the `cat_column` alias to the constructor can be overridden
# by fit()'s default `categorical_feature='auto'` when X is a DataFrame, which
# would leave the columns treated as plain numbers.
gbm.fit(X_train2, y_train2, categorical_feature=cat_columns)

# No early stopping is configured, so there is no "best iteration" to select;
# predict with the full ensemble.
y_pred = gbm.predict(X_test2)

# Build the confusion matrix
matrix6 = confusion_matrix(y_test2, y_pred)
print("混同行列:\n{}".format(matrix6))

print("正解率(Accuracy):{:.3f}".format(accuracy_score(y_test2, y_pred)))
print("適合率(Precision):{:.3f}".format(precision_score(y_test2, y_pred)))
print("再現率(Recall):{:.3f}".format(recall_score(y_test2, y_pred)))
# The label previously read "Accuracy" although the value is the F1 score.
print("F1値(F1-score):{:.3f}".format(f1_score(y_test2, y_pred)))


# Feature importances as reported by the fitted model, most important first.
importance = pd.DataFrame(gbm.feature_importances_, index = X_test2.columns, columns=["importance"])
display(importance.sort_values("importance", ascending= False))


混同行列:
[[1997  215]
 [ 256  507]]
正解率(Accuracy):0.842
適合率(Precision):0.702
再現率(Recall):0.664
F1値(Accuracy):0.683


Unnamed: 0,importance
age,1487
education-num,631
occupation,294
sex,160
education,133
workclass,131
marital-status,83
relationship,81


In [115]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

# Tune the number of boosting rounds with 5-fold cross-validation.
# The grid is deliberately coarse (20 candidates -> 100 fits plus the refit):
# trying every value in range(1, 1000) means 4,995 fits, which did not finish
# and had to be interrupted.
params = {'n_estimators' : range(50, 1001, 50)}


grid = GridSearchCV(
    estimator = lgb.LGBMClassifier(objective='binary'),
    param_grid = params,
    cv = 5
)
# Extra keyword arguments to GridSearchCV.fit are forwarded to the estimator's
# fit(), so the categorical columns are declared on every fold and on the refit.
grid.fit(X_train, y_train, categorical_feature=cat_columns)

print("Best parameters : {}".format(grid.best_params_))
print("Best cross-validation score : {:.3f}".format(grid.best_score_))

# Predict the unlabeled test set with the model refit on all training data.
y_pred = grid.best_estimator_.predict(test)

KeyboardInterrupt: 

In [112]:
# Write the submission file: one row per test record holding the original
# 'index' value and the predicted label, with no header and no row index.
# The raw test CSV is re-read here to get the 'index' column.
_test = pd.read_csv('data/test2.csv')

# `y_pred` is assigned by more than one cell in this notebook (hold-out
# predictions and test-set predictions). Refuse to write a submission if it
# does not line up with the test set; pd.concat would otherwise silently pad
# the shorter side with NaN.
if len(y_pred) != len(_test):
    raise ValueError(
        "y_pred has {} rows but the test set has {}; re-run the test-set "
        "prediction cell before writing the submission.".format(len(y_pred), len(_test))
    )

df_result = pd.concat([_test['index'],pd.DataFrame(y_pred)],axis=1)
df_result.to_csv('data/submit.csv',index=False,header=False)