### ライブラリの読み込み

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

### データ読み取り

In [2]:
df = pd.read_csv("/Users/tueno/Liver disease determination based on medical checkup data/train.csv") #dataframe型
X_test = pd.read_csv("/Users/tueno/Liver disease determination based on medical checkup data/test.csv")

### データ概要

各パラメータについて確認する \
Age：年齢 \
Gender：性別 \
T_Bill：Total Bilirubin \
D_Bill：Direct Bilirubin \
ALP：Alkaline Phosphotase \
ALT_GPT：Alanine Transaminase \
AST_GOT：Aspartate Aminotransferase \
TP：Total Proteins \
Alb：Albumin \
AG_ratio：albumin/globulin ratio \
disease：目的変数， 肝疾患の有無（0:無，1:有）

In [3]:
df.head(5)

Unnamed: 0,Age,Gender,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,disease
0,59,Male,0.786886,0.150498,220.178691,13.467617,21.729246,6.815731,3.112276,1.006802,0
1,69,Male,1.003987,0.195625,221.218413,51.033462,64.752323,6.889608,3.051521,0.751346,1
2,65,Male,0.65714,0.081287,320.770533,12.625011,30.61318,5.947767,2.489167,0.774952,0
3,65,Male,0.906822,0.214173,369.27816,34.347597,54.510085,6.967183,3.613837,0.988155,1
4,22,Female,1.734959,0.197706,222.782025,20.572891,170.010177,5.837537,3.068697,1.026654,0


In [4]:
# データのサイズ
print(df.shape)

(850, 11)


### データ前処理
データの前処理として，Genderをカテゴリ変数に変換する

In [5]:
class_le = LabelEncoder()
df["Gender"] = class_le.fit_transform(df["Gender"])
X_test["Gender"] = class_le.fit_transform(X_test["Gender"])

### 訓練データの分割

訓練データを，説明変数と目的変数とで分割する

In [6]:
y = df["disease"]
X = df.drop("disease",axis = 1)

X.head()

Unnamed: 0,Age,Gender,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio
0,59,1,0.786886,0.150498,220.178691,13.467617,21.729246,6.815731,3.112276,1.006802
1,69,1,1.003987,0.195625,221.218413,51.033462,64.752323,6.889608,3.051521,0.751346
2,65,1,0.65714,0.081287,320.770533,12.625011,30.61318,5.947767,2.489167,0.774952
3,65,1,0.906822,0.214173,369.27816,34.347597,54.510085,6.967183,3.613837,0.988155
4,22,0,1.734959,0.197706,222.782025,20.572891,170.010177,5.837537,3.068697,1.026654


In [7]:
y.head()

0    0
1    1
2    0
3    1
4    0
Name: disease, dtype: int64

In [8]:
# X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)
X_train = X
Y_train = y
X_test = X_test

In [9]:
import xgboost as xgb
xgb_train = xgb.DMatrix(X_train, label=Y_train, feature_names=X.columns)
xgb_test = xgb.DMatrix(X_test)

In [10]:
param = {
    # 二値分類問題
    'objective': 'binary:logistic',  
} 

In [11]:
model = xgb.train(param, xgb_train)

In [12]:
y_pred_proba = model.predict(xgb_test)
y_pred = np.where(y_pred_proba > 0.5, 1, 0)
submit = pd.read_csv('/Users/tueno/Liver disease determination based on medical checkup data/sample_submit.csv',header = None)
y_pred = pd.DataFrame(y_pred)
submit[1] = y_pred
submit.to_csv('/Users/tueno/Liver disease determination based on medical checkup data/submit_pycaret.csv',index=False,header=False)
