# Library

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score

# Data

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

sample_submission = pd.read_csv('../data/sample_submission.csv')

In [6]:
X = train.drop('NObeyesdad', axis=1)
y = train['NObeyesdad']

cat_features_names = []

# データセット内の各列に対してループ
for column in X.columns:
    # 列のデータ型が 'object' または 'category' の場合、その列をカテゴリカルとみなす
    if X[column].dtype == 'object' or X[column].dtype.name == 'category':
        cat_features_names.append(column)

# カテゴリカル特徴量の列インデックスを取得
cat_features_indices = [train.columns.get_loc(name) for name in cat_features_names]

cat_features_names 

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

- model

In [7]:
# モデルの初期化
model = CatBoostClassifier(
    iterations=100, 
    learning_rate=0.1,
    depth=4,
    verbose=10  # トレーニングプロセスのログを10イテレーションごとに表示
)

# val予測

In [8]:
# モデルのトレーニング
model.fit(X_train, y_train, cat_features=cat_features_names, eval_set=(X_val, y_val), use_best_model=True)

0:	learn: 1.6664040	test: 1.6650451	best: 1.6650451 (0)	total: 178ms	remaining: 17.6s
10:	learn: 0.8515494	test: 0.8499687	best: 0.8499687 (10)	total: 639ms	remaining: 5.17s
20:	learn: 0.6263022	test: 0.6277784	best: 0.6277784 (20)	total: 1.13s	remaining: 4.25s
30:	learn: 0.5271372	test: 0.5310644	best: 0.5310644 (30)	total: 1.57s	remaining: 3.49s
40:	learn: 0.4672037	test: 0.4749030	best: 0.4749030 (40)	total: 1.98s	remaining: 2.85s
50:	learn: 0.4308046	test: 0.4406701	best: 0.4406701 (50)	total: 2.4s	remaining: 2.3s
60:	learn: 0.4055312	test: 0.4171320	best: 0.4171320 (60)	total: 2.81s	remaining: 1.8s
70:	learn: 0.3876100	test: 0.3997846	best: 0.3997846 (70)	total: 3.25s	remaining: 1.33s
80:	learn: 0.3724663	test: 0.3856609	best: 0.3856609 (80)	total: 3.67s	remaining: 861ms
90:	learn: 0.3602607	test: 0.3740782	best: 0.3740782 (90)	total: 4.09s	remaining: 404ms
99:	learn: 0.3471881	test: 0.3616791	best: 0.3616791 (99)	total: 4.46s	remaining: 0us

bestTest = 0.3616790814
bestIteration 

<catboost.core.CatBoostClassifier at 0x2aa7de35960>

In [9]:
y_val_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8889691714836223


# test予測

In [10]:
test_pred = model.predict(test)

- 結合

In [11]:
sample_submission



Unnamed: 0,id,NObeyesdad
0,20758,Normal_Weight
1,20759,Normal_Weight
2,20760,Normal_Weight
3,20761,Normal_Weight
4,20762,Normal_Weight
...,...,...
13835,34593,Normal_Weight
13836,34594,Normal_Weight
13837,34595,Normal_Weight
13838,34596,Normal_Weight


In [18]:
submission_dataframe = pd.DataFrame()
submission_dataframe['id'] = test['id']
submission_dataframe['MObeyesdad'] = test_pred.flatten()

In [19]:
submission_dataframe

Unnamed: 0,id,MObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
...,...,...
13835,34593,Overweight_Level_II
13836,34594,Normal_Weight
13837,34595,Insufficient_Weight
13838,34596,Normal_Weight


In [20]:
submission_dataframe.to_csv(f'../output/submit_val{accuracy}.csv', index=False)

- submit確認

In [21]:
submission_dataframe.head()

Unnamed: 0,id,MObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
