In [1]:
# 환경설정
import matplotlib.pyplot as plt 
# 한글출력
plt.rcParams['font.family'] = 'Malgun Gothic' #  Windows 'Malgun Gothic' 
plt.rcParams['axes.unicode_minus'] = False

In [87]:
# 라이브러리
import seaborn as sns 
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score

In [49]:
# 데이터 가져오기
import pandas as pd

# 훈련 데이터
train = pd.read_csv("train.csv")
train.head(1)

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0


In [50]:
# 테스트 데이터
test = pd.read_csv("test.csv")
test.head(1)

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,750000,male,45,177.0,81.0,7.0,87.0,39.8


In [51]:
# 결측치 확인
train.isnull().sum()

id            0
Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64

In [52]:
# 수치형 / 범주형 컬럼 나누기
numeric_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
category_features = ['Sex']

# 독립변수, Target 설정
X = train[numeric_features + category_features]
X

y = train['Calories']
y

0         150.0
1          34.0
2          29.0
3         140.0
4         146.0
          ...  
749995    230.0
749996     96.0
749997    221.0
749998    109.0
749999    103.0
Name: Calories, Length: 750000, dtype: float64

In [53]:
# 학습/검증 데이터 분할 함수
def data_seperate(X_df, y_series):
    num_bins = 20
    y_binned = pd.cut(y, bins=num_bins, labels=False)
    X_train, X_val, y_train, y_val = train_test_split(
        X_df, y_series, test_size=0.2, random_state=42, stratify=y_binned)

    return [X_train, X_val, y_train, y_val]

In [54]:
X_train, X_val, y_train, y_val = data_seperate(X, y)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((600000, 7), (150000, 7), (600000,), (150000,))

In [83]:
# 범주형 데이터 인코딩 함수
def category_encoding(category_data):
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    return encoder.fit_transform(category_data)

# 수치형 데이터 인코딩 함수
def numeric_encoding(numeric_data):
    scaler = StandardScaler()
    return scaler.fit_transform(numeric_data)

In [63]:
# 인코딩 데이터 합체 함수
def data_concat(category_data, numeric_data):
    train_combined = pd.concat([
        pd.DataFrame(category_data, columns=['sex1','sex2']),
        pd.DataFrame(numeric_data, columns=numeric_features)
    ], axis=1)
    return train_combined

In [57]:
# Target 데이터 로그변환 : RMSLE 평가지표를 따라가기 위함
y_train = np.log1p(y_train)
y_train
y_val = np.log1p(y_val)
y_val

532263    3.135494
580340    2.484907
101839    5.111988
438420    4.762174
449976    4.094345
            ...   
269955    4.143135
502294    4.997212
58461     3.367296
647977    5.087596
245703    2.890372
Name: Calories, Length: 150000, dtype: float64

In [73]:
# 평가지표 함수
def score_model(y_val_df, y_pred_df):
    print("검증 데이터 성능:")
    print(f"RMSLE: {np.sqrt(mean_squared_error(np.log1p(y_val_df), np.log1p(y_pred_df))):.10f}")
    print(f"RMSE: {mean_squared_error(y_val_df, y_pred_df):.2f}")
    print(f"R2 Score: {r2_score(y_val_df, y_pred_df):.2f}")

In [88]:
# 모델 라이브러리
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [84]:
# 모델 사용 코드
# 새 모델 추가하고 싶으면 여기만 수정
# 독립변수 데이터 인코딩
# 훈련 데이터
X_train_category, X_val_category = category_encoding(X_train[category_features]), category_encoding(X_val[category_features])
X_train_numeric, X_val_numeric = numeric_encoding(X_train[numeric_features]), numeric_encoding(X_val[numeric_features])

X_train_combined = data_concat(X_train_category, X_train_numeric)
X_val_combined = data_concat(X_val_category, X_val_numeric)

# print(X_train_combined.head())
# print(X_val_combined.head())

# 모델 적용
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
rf_model = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1)
lgb_model = LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=-1, random_state=42, n_jobs=-1)
cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=10,
    subsample=0.8,
    random_strength=5,
    loss_function='RMSE',
    early_stopping_rounds=50,
    verbose=100,
    random_seed=42
)

# 모델 학습
print("xgboost")
xgb_model.fit(X_train_combined, y_train)
xgb_pred = xgb_model.predict(X_val_combined)
score_model(y_val, xgb_pred)

print("\nrandom forest")
rf_model.fit(X_train_combined, y_train)
rf_pred = rf_model.predict(X_val_combined)
score_model(y_val, rf_pred)

print("\nlightgbm")
lgb_model.fit(X_train_combined, y_train)
lgb_pred = rf_model.predict(X_val_combined)
score_model(y_val, lgb_pred)

print("\ncatboost")
cat_model.fit(X_train_combined, y_train, eval_set=(X_val_combined, y_val))
cat_pred = cat_model.predict(X_val_combined)
score_model(y_val, cat_pred)

xgboost
검증 데이터 성능:
RMSLE: 0.0209195477
RMSE: 0.01
R2 Score: 0.99

random forest
검증 데이터 성능:
RMSLE: 0.0182104888
RMSE: 0.00
R2 Score: 1.00

lightgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005488 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 360
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 8
[LightGBM] [Info] Start training from score 4.141196
검증 데이터 성능:
RMSLE: 0.0182104888
RMSE: 0.00
R2 Score: 1.00

catboost
0:	learn: 0.9188701	test: 0.9199195	best: 0.9199195 (0)	total: 17.6ms	remaining: 17.5s
100:	learn: 0.0733049	test: 0.0735699	best: 0.0735699 (100)	total: 1.41s	remaining: 12.5s
200:	learn: 0.0638636	test: 0.0643988	best: 0.0643988 (200)	total: 2.72s	remaining: 10.8s
300:	learn: 0.0613750	test: 0.0620603	best: 0.0620603 (300)	total: 4.07s	remaining: 9.44s
400:	learn: 0.0602408	test: 0.0610857	best: 0.0610857 (400)	total: 5.46s	remaining: 8.15

In [89]:
# 테스트 데이터 인코딩
def test_predict(model):
    test_category = category_encoding(test[category_features])
    test_numeric = numeric_encoding(test[numeric_features])
    test_combined = data_concat(test_category, test_numeric)
    
    test_pred = model.predict(test_combined)
    test_pred = np.exp(test_pred)
    return test_pred

In [90]:
def submit_kaggle(test_pred_data):
    # 제출 파일 생성, 공모전 제출 양식 준수
    submission = pd.DataFrame({
        'id' : test['id'],
        'Calories' : test_pred_data
    })
    # 현재 날짜와 시간을 파일명에 포함
    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    submission.to_csv(f'submission_{current_time}.csv', index=False)

In [93]:
submit_kaggle(test_predict(cat_model))

In [94]:
train.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [95]:
test.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,750000,male,45,177.0,81.0,7.0,87.0,39.8
1,750001,male,26,200.0,97.0,20.0,101.0,40.5
2,750002,female,29,188.0,85.0,16.0,102.0,40.4
3,750003,female,39,172.0,73.0,20.0,107.0,40.6
4,750004,female,30,173.0,67.0,16.0,94.0,40.5


In [96]:
train.shape, test.shape

((750000, 9), (250000, 8))