In [76]:
!pip install optuna catboost



In [77]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import optuna
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc

In [78]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [79]:
import os
os.chdir('/content/drive/MyDrive/Euron')

In [114]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [115]:
#train.drop(columns=['ID', 'Book-ID', 'Year-Of-Publication'], inplace=True)
#test.drop(columns=['ID', 'Book-ID', 'Year-Of-Publication'], inplace=True)
# location 전처리
# Location 분리
train['Location'] = train['Location'].str.replace(r'[^0-9a-zA-Z:,]', '')
train['city'] = train['Location'].apply(lambda x: x.split(',')[0].strip())
train['state'] = train['Location'].apply(lambda x: x.split(',')[1].strip())
train['country'] = train['Location'].apply(lambda x: x.split(',')[2].strip())

train.drop('Location', axis=1, inplace=True)
train = train.replace('na', np.nan)
train = train.replace('', np.nan)

# Location 최빈값으로 대체
mode_city = train['city'].mode()[0]
mode_state = train['state'].mode()[0]
mode_country = train['country'].mode()[0]

# 결측값 대체
train['city'].fillna(mode_city, inplace=True)
train['state'].fillna(mode_state, inplace=True)
train['country'].fillna(mode_country, inplace=True)

  train['Location'] = train['Location'].str.replace(r'[^0-9a-zA-Z:,]', '')


In [116]:
## Age 이상치 처리
# 이상치: 5세 이하 90세 초과
train.loc[(train['Age'] <= 5)|(train['Age'] > 90), 'Age'] = np.nan
train['Age'].isnull().sum() # 5세 이하 또는 90세 초과인 데이터 4497개
# 평균값으로 대체
train['Age'].mean() # Age (이상치 제거 후) 평균: 36.541634752034845
train['Age'] = train['Age'].fillna(train['Age'].mean())

## Age 범주화
# Age_group: 이상치 처리 & 5세 단위로 범주화
train['Age_group'] = pd.cut(train['Age'], bins = list(range(5, 91, 5)))

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder() #labelencoder함수를 가져온다.
encoder.fit(train['Age_group']) #내가 가지고 있는 데이터에 학습시킨다
digit_label = encoder.transform(train['Age_group'])
train['Age_group'] = digit_label # encoding 잘됐나?
#train['Age_group']

In [117]:
# 소문자 통일, 특수문자 제거 (종류: 15505 -> 15021)
train['Publisher'] = train['Publisher'].str.lower().replace('[^0-9a-zA-Z\s]', '', regex=True)

# 소문자 통일, 특수문자 제거, 띄어쓰기 제거 (종류: 15505 -> 14840)
train['Publisher'] = train['Publisher'].str.lower().replace('[^0-9a-zA-Z]', '', regex=True)

In [118]:
train['Book-Author'] = train['Book-Author'].str.lower().replace('[^0-9a-zA-Z]', '', regex=True)

In [119]:
# 출판년도 그룹화
# 만약 출판연도가 null이 있다면 정보없음(-1)로 채움
# train['Year-Of-Publication'] = train['Year-Of-Publication'].fillna(-1)

labels = ['Unknown', '-1800', '1800-1850', '1850-1900', '1900-1910', '1910-1920', '1920-1930', '1930-1940', '1940-1950','1950-1960', '1960-1970', '1970-1980', '1980-1990', '1990-2000', '2000-2010', '2010-2020', '2020-']
bins = [-1, 0, 1800, 1850, 1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020, 3000]
train['Pub_gb'] = pd.cut(train['Year-Of-Publication'], bins, labels = labels,include_lowest = True)
train = train.drop(columns =['Year-Of-Publication'])
#train['Year-Of-Publication'] = pd.cut(train['Year-Of-Publication'], bins=bins, labels=labels)


encoder2 = LabelEncoder() #labelencoder함수를 가져온다.
encoder2.fit(train['Pub_gb']) #내가 가지고 있는 데이터에 학습시킨다
digit_label = encoder2.transform(train['Pub_gb'])
train['Pub_gb'] = digit_label # encoding 잘됐나?
#train['Pub_gb']

In [126]:
y = train['Book-Rating']

X = train.drop(['Book-Rating','Book-ID','Book-Title','ID'], axis=1)

In [127]:
kf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)

cat_feature = ["User-ID",  "Age_group","Pub_gb","Publisher","country","Book-Author","state","city"]

In [128]:
def objective_cat(trial):
    """
    Objective function to tune a `CatBoostRegressor` model.
    """

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

    params = {
        'objective' :  'RMSE',
        'n_estimators':trial.suggest_int("n_estimators", 30, 10000),
        'od_wait':trial.suggest_int('od_wait', 500, 2300),
        'learning_rate' : trial.suggest_uniform('learning_rate',0.01, 1),
        'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
        'random_strength': trial.suggest_uniform('random_strength',10,50),
        'depth': trial.suggest_int('depth',1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1,30),
        'random_state' : 42,
        'task_type' : "GPU",
        'verbose' : False
    }


    model = CatBoostRegressor(
        **params,
        cat_features=["User-ID",  "Age_group","Pub_gb","Publisher","country","Book-Author","state","city"]
    )
    model.fit(x_train, y_train, eval_set = [(x_test, y_test)], early_stopping_rounds = 500, verbose = 2000)

    pred = model.predict(x_test)

    mae = mean_squared_error(y_test, pred, squared=False)


    return mae

In [129]:
study = optuna.create_study(direction="minimize")
study.optimize(objective_cat, n_trials=10)

[I 2024-02-06 09:11:25,899] A new study created in memory with name: no-name-da264c10-9269-4600-a0e8-620eedfeaa12
  'learning_rate' : trial.suggest_uniform('learning_rate',0.01, 1),
  'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
  'random_strength': trial.suggest_uniform('random_strength',10,50),


0:	learn: 3.5828813	test: 3.5604905	best: 3.5604905 (0)	total: 12.9ms	remaining: 1m 26s
2000:	learn: 3.3934044	test: 3.3492643	best: 3.3492468 (1987)	total: 14.2s	remaining: 33.1s
4000:	learn: 3.3927396	test: 3.3488876	best: 3.3488070 (3736)	total: 28.1s	remaining: 18.7s
bestTest = 3.348806991
bestIteration = 3736
Shrink model to first 3737 iterations.


[I 2024-02-06 09:12:06,162] Trial 0 finished with value: 3.3488069990428153 and parameters: {'n_estimators': 6658, 'od_wait': 1283, 'learning_rate': 0.6898417531012464, 'reg_lambda': 48.4828935889989, 'random_strength': 30.614184939512338, 'depth': 1, 'min_data_in_leaf': 18}. Best is trial 0 with value: 3.3488069990428153.
  'learning_rate' : trial.suggest_uniform('learning_rate',0.01, 1),
  'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
  'random_strength': trial.suggest_uniform('random_strength',10,50),


0:	learn: 3.6039751	test: 3.5895512	best: 3.5895512 (0)	total: 438ms	remaining: 43m 23s
bestTest = 3.273952185
bestIteration = 39
Shrink model to first 40 iterations.


[I 2024-02-06 09:15:38,188] Trial 1 finished with value: 3.2739523598225833 and parameters: {'n_estimators': 5945, 'od_wait': 1047, 'learning_rate': 0.4206428021430828, 'reg_lambda': 20.259558840976442, 'random_strength': 43.54236874511634, 'depth': 14, 'min_data_in_leaf': 5}. Best is trial 1 with value: 3.2739523598225833.
  'learning_rate' : trial.suggest_uniform('learning_rate',0.01, 1),
  'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
  'random_strength': trial.suggest_uniform('random_strength',10,50),


0:	learn: 3.5362195	test: 3.5129558	best: 3.5129558 (0)	total: 227ms	remaining: 5m
bestTest = 3.278931885
bestIteration = 39
Shrink model to first 40 iterations.


[I 2024-02-06 09:17:26,277] Trial 2 finished with value: 3.2789317658192587 and parameters: {'n_estimators': 1326, 'od_wait': 740, 'learning_rate': 0.6220951726140971, 'reg_lambda': 53.04567509490288, 'random_strength': 24.005730082581145, 'depth': 12, 'min_data_in_leaf': 25}. Best is trial 1 with value: 3.2739523598225833.
  'learning_rate' : trial.suggest_uniform('learning_rate',0.01, 1),
  'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
  'random_strength': trial.suggest_uniform('random_strength',10,50),


0:	learn: 3.5037924	test: 3.4755653	best: 3.4755653 (0)	total: 154ms	remaining: 11m 12s
bestTest = 3.275153365
bestIteration = 78
Shrink model to first 79 iterations.


[I 2024-02-06 09:18:39,371] Trial 3 finished with value: 3.275153463024238 and parameters: {'n_estimators': 4376, 'od_wait': 861, 'learning_rate': 0.7638653295970875, 'reg_lambda': 95.28847498056896, 'random_strength': 30.95514028339229, 'depth': 9, 'min_data_in_leaf': 8}. Best is trial 1 with value: 3.2739523598225833.
  'learning_rate' : trial.suggest_uniform('learning_rate',0.01, 1),
  'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
  'random_strength': trial.suggest_uniform('random_strength',10,50),


0:	learn: 3.8091557	test: 3.8068192	best: 3.8068192 (0)	total: 68.1ms	remaining: 5m 58s
2000:	learn: 3.3084633	test: 3.2574296	best: 3.2574294 (1999)	total: 2m 22s	remaining: 3m 52s
4000:	learn: 3.2968070	test: 3.2549257	best: 3.2549257 (4000)	total: 4m 42s	remaining: 1m 29s
5273:	learn: 3.2908402	test: 3.2541238	best: 3.2541223 (5272)	total: 6m 12s	remaining: 0us
bestTest = 3.254122269
bestIteration = 5272
Shrink model to first 5273 iterations.


[I 2024-02-06 09:25:16,299] Trial 4 finished with value: 3.25412244828048 and parameters: {'n_estimators': 5274, 'od_wait': 918, 'learning_rate': 0.0593319227615133, 'reg_lambda': 38.55215448548511, 'random_strength': 35.74540595540632, 'depth': 5, 'min_data_in_leaf': 10}. Best is trial 4 with value: 3.25412244828048.
  'learning_rate' : trial.suggest_uniform('learning_rate',0.01, 1),
  'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
  'random_strength': trial.suggest_uniform('random_strength',10,50),


0:	learn: 3.5342601	test: 3.5052796	best: 3.5052796 (0)	total: 78.8ms	remaining: 6m 22s
bestTest = 3.269400505
bestIteration = 357
Shrink model to first 358 iterations.


[I 2024-02-06 09:26:28,809] Trial 5 finished with value: 3.2694005085987357 and parameters: {'n_estimators': 4853, 'od_wait': 753, 'learning_rate': 0.7121908771636585, 'reg_lambda': 0.868196465654616, 'random_strength': 47.93027117416366, 'depth': 5, 'min_data_in_leaf': 28}. Best is trial 4 with value: 3.25412244828048.
  'learning_rate' : trial.suggest_uniform('learning_rate',0.01, 1),
  'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
  'random_strength': trial.suggest_uniform('random_strength',10,50),


0:	learn: 3.4986716	test: 3.4742064	best: 3.4742064 (0)	total: 756ms	remaining: 2h 9s
bestTest = 3.298129917
bestIteration = 13
Shrink model to first 14 iterations.


[I 2024-02-06 09:29:44,741] Trial 6 finished with value: 3.2981299093200773 and parameters: {'n_estimators': 9536, 'od_wait': 1808, 'learning_rate': 0.7348729134836387, 'reg_lambda': 8.83049135298282, 'random_strength': 19.94720135495203, 'depth': 14, 'min_data_in_leaf': 16}. Best is trial 4 with value: 3.25412244828048.
  'learning_rate' : trial.suggest_uniform('learning_rate',0.01, 1),
  'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
  'random_strength': trial.suggest_uniform('random_strength',10,50),


0:	learn: 3.7534122	test: 3.7477284	best: 3.7477284 (0)	total: 106ms	remaining: 3m 21s
bestTest = 3.255101177
bestIteration = 1233
Shrink model to first 1234 iterations.


[I 2024-02-06 09:32:51,290] Trial 7 finished with value: 3.2551010721535225 and parameters: {'n_estimators': 1897, 'od_wait': 1308, 'learning_rate': 0.1422701721441821, 'reg_lambda': 17.859018716523558, 'random_strength': 44.246358018009595, 'depth': 8, 'min_data_in_leaf': 29}. Best is trial 4 with value: 3.25412244828048.
  'learning_rate' : trial.suggest_uniform('learning_rate',0.01, 1),
  'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
  'random_strength': trial.suggest_uniform('random_strength',10,50),


0:	learn: 3.5062013	test: 3.4676300	best: 3.4676300 (0)	total: 49.9ms	remaining: 6m 1s
bestTest = 3.27131797
bestIteration = 517
Shrink model to first 518 iterations.


[I 2024-02-06 09:34:02,883] Trial 8 finished with value: 3.2713182300170067 and parameters: {'n_estimators': 7249, 'od_wait': 1045, 'learning_rate': 0.9676589250635431, 'reg_lambda': 75.06487647919542, 'random_strength': 45.32610775158288, 'depth': 4, 'min_data_in_leaf': 25}. Best is trial 4 with value: 3.25412244828048.
  'learning_rate' : trial.suggest_uniform('learning_rate',0.01, 1),
  'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
  'random_strength': trial.suggest_uniform('random_strength',10,50),


0:	learn: 3.5894437	test: 3.5685629	best: 3.5685629 (0)	total: 50.7ms	remaining: 4m 7s
bestTest = 3.26080063
bestIteration = 1137
Shrink model to first 1138 iterations.


[I 2024-02-06 09:35:57,199] Trial 9 finished with value: 3.260800517554556 and parameters: {'n_estimators': 4884, 'od_wait': 784, 'learning_rate': 0.5135205673239862, 'reg_lambda': 0.014558203100588604, 'random_strength': 21.168475149667948, 'depth': 4, 'min_data_in_leaf': 7}. Best is trial 4 with value: 3.25412244828048.


In [125]:
len(train.loc[train['Book-Title'] == 'A Scientist at the Seashore'])

1

In [103]:
train.head()

Unnamed: 0,ID,User-ID,Book-ID,Book-Rating,Age,Location,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,TRAIN_000000,USER_00000,BOOK_044368,8,23.0,"sackville, new brunswick, canada",Road Taken,Rona Jaffe,2001.0,Mira
1,TRAIN_000001,USER_00000,BOOK_081205,8,23.0,"sackville, new brunswick, canada",Macbeth (New Penguin Shakespeare),William Shakespeare,1981.0,Penguin Books
2,TRAIN_000002,USER_00000,BOOK_086781,0,23.0,"sackville, new brunswick, canada",Waverley (Penguin English Library),Walter Scott,1981.0,Penguin Books
3,TRAIN_000003,USER_00000,BOOK_098622,0,23.0,"sackville, new brunswick, canada",Mother Earth Father Sky,Sue Harrison,1991.0,Avon
4,TRAIN_000004,USER_00000,BOOK_180810,8,23.0,"sackville, new brunswick, canada",She Who Remembers,Linda Lay Shuler,1989.0,Signet Book


In [130]:
params=study.best_params
params['objective'] = 'RMSE'
params['random_state'] = 42
params['verbose'] = False
params['task_type'] = "GPU"
params['cat_features'] = ["User-ID",  "Age_group","Pub_gb","Publisher","country","Book-Author","state","city"]

In [131]:
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

Best Score: 3.25412244828048
Best trial: {'n_estimators': 5274, 'od_wait': 918, 'learning_rate': 0.0593319227615133, 'reg_lambda': 38.55215448548511, 'random_strength': 35.74540595540632, 'depth': 5, 'min_data_in_leaf': 10}


In [132]:
# 시각화
optuna.visualization.plot_optimization_history(study)

In [133]:
# 하이퍼파라미터 중요도
optuna.visualization.plot_param_importances(study)