# ・ライブラリのインポート

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier

import itertools
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# ・データの読み込み

In [2]:
train = pd.read_csv('/Users/markun/SIGNATE/train.csv')
test = pd.read_csv('/Users/markun/SIGNATE/test.csv')

In [19]:
test = test.drop(['Insulin'], axis = 1)
test 

Unnamed: 0,index,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age
0,398,0,126,80,0,40.096264,0.822517,21
1,3833,3,88,60,20,39.810590,0.204331,22
2,4836,3,114,76,0,33.198760,0.521011,21
3,4572,1,146,74,0,26.890259,0.504950,38
4,636,1,123,90,26,40.270088,0.800513,28
...,...,...,...,...,...,...,...,...
1995,3138,4,150,60,0,39.385785,0.175051,26
1996,191,6,153,88,0,38.614204,0.509367,28
1997,3294,7,106,78,0,51.678147,0.728404,29
1998,3073,2,101,70,0,40.271989,0.316558,26


In [20]:
train

Unnamed: 0,index,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,200,9,125,74,0,0,28.536910,0.444902,45,1
1,3832,4,109,80,0,0,28.047673,0.238243,22,0
2,4927,4,88,78,39,0,52.371341,0.279471,26,0
3,4088,9,125,74,0,0,40.062688,0.203922,45,0
4,3644,5,107,78,44,284,52.935068,0.284959,45,1
...,...,...,...,...,...,...,...,...,...,...
2995,4931,4,88,74,17,0,33.848723,0.171073,23,0
2996,3264,0,144,88,0,0,26.846832,0.259957,21,1
2997,1653,6,117,96,36,0,28.101646,0.716126,22,1
2998,2607,2,113,74,0,0,33.079021,0.266179,38,1


# ・目的変数を除外

In [21]:
train_x = train.drop(['Outcome','Insulin'], axis = 1)
train_x 

Unnamed: 0,index,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age
0,200,9,125,74,0,28.536910,0.444902,45
1,3832,4,109,80,0,28.047673,0.238243,22
2,4927,4,88,78,39,52.371341,0.279471,26
3,4088,9,125,74,0,40.062688,0.203922,45
4,3644,5,107,78,44,52.935068,0.284959,45
...,...,...,...,...,...,...,...,...
2995,4931,4,88,74,17,33.848723,0.171073,23
2996,3264,0,144,88,0,26.846832,0.259957,21
2997,1653,6,117,96,36,28.101646,0.716126,22
2998,2607,2,113,74,0,33.079021,0.266179,38


# ・目的変数

In [22]:
train_y = train['Outcome']
train_y

0       1
1       0
2       0
3       0
4       1
       ..
2995    0
2996    1
2997    1
2998    1
2999    0
Name: Outcome, Length: 3000, dtype: int64

# ・学習及び検証

In [36]:
scores_accuracy = []
scores_logloss = []
xgb_params = {
    'objective':'binary:logistic',
    'max_depth': 5, 
    'eta': 0.1,
    'eval_metric':'logloss',
    'colsample_bytree':0.8,
    'subsample':0.8,
    'gamma':0,
    'lambda':1,
    'alpha':0,
    'min_child_weight':1.0,
}

kf = KFold(n_splits=9, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    #model = XGBClassifier(n_estimators=60, random_state=71, max_depth = 4)
    model = XGBClassifier(n_estimators=60, random_state=71, max_depth = 3)
    model.fit(tr_x, tr_y)

    va_pred = model.predict_proba(va_x)[:, 1]

    logloss = log_loss(va_y, va_pred)
    accuracy = accuracy_score(va_y, va_pred > 0.5)
    
    scores_logloss.append(logloss)
    scores_accuracy.append(accuracy)

    logloss = np.mean(scores_logloss)
    accuracy = np.mean(scores_accuracy)
    print(f'logloss: {logloss:.4f}, accuracy: {accuracy:.4f}')

logloss: 0.4198, accuracy: 0.8443
logloss: 0.4299, accuracy: 0.8263
logloss: 0.4287, accuracy: 0.8244
logloss: 0.4310, accuracy: 0.8195
logloss: 0.4280, accuracy: 0.8141
logloss: 0.4274, accuracy: 0.8111
logloss: 0.4218, accuracy: 0.8123
logloss: 0.4252, accuracy: 0.8084
logloss: 0.4329, accuracy: 0.8026


#  ・テストデータに対する予測

In [37]:
prediction = model.predict(test)
prediction

array([0, 0, 0, ..., 0, 0, 1])

# ・提出用ファイルの作成

In [38]:
submission_C = pd.DataFrame({
    'index':test['index'],
    'Outcome':prediction, 
})
submission_C.to_csv('/Users/markun/SIGNATE/submission_C.csv', header = False, index = False )

In [39]:
submission_C.head()

Unnamed: 0,index,Outcome
0,398,0
1,3833,0
2,4836,0
3,4572,0
4,636,0
