In [1]:
import pandas as pd
import numpy as np
import random
import os

In [2]:
def seed_everything(seed):
    """Make the run repeatable by seeding every RNG this notebook touches.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's global generator.

    Args:
        seed: Integer seed applied to all of the above.
    """
    os.environ.update(PYTHONHASHSEED=str(seed))
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

In [3]:
# Location of the training CSV. Set the DATASET_CSV environment variable to
# point at the file on another machine; the default is the original local path.
DATA_PATH = os.environ.get("DATASET_CSV", "C:/Users/jain5/Desktop/dataset.csv")
train = pd.read_csv(DATA_PATH)

train.head()

Unnamed: 0.1,Unnamed: 0,ID,User-ID,Book-ID,Book-Rating,Age,Book-Title,Book-Author,Publisher,Main_Title,Sub_Title,Pub_gb,Age_group,Age_group2,city,state,country
0,0,TRAIN_000000,USER_00000,BOOK_044368,8,23.0,roadtaken,ronajaffe,mira,roadtaken,No_SUB,2000-2010,"(20, 25]","(18, 25]",sackville,new brunswick,canada
1,1,TRAIN_000001,USER_00000,BOOK_081205,8,23.0,macbethnewpenguinshakespeare,williamshakespeare,penguin books,macbethnewpenguinshakespeare,No_SUB,1980-1990,"(20, 25]","(18, 25]",sackville,new brunswick,canada
2,2,TRAIN_000002,USER_00000,BOOK_086781,0,23.0,waverleypenguinenglishlibrary,walterscott,penguin books,waverleypenguinenglishlibrary,No_SUB,1980-1990,"(20, 25]","(18, 25]",sackville,new brunswick,canada
3,3,TRAIN_000003,USER_00000,BOOK_098622,0,23.0,motherearthfathersky,sueharrison,avon,motherearthfathersky,No_SUB,1990-2000,"(20, 25]","(18, 25]",sackville,new brunswick,canada
4,4,TRAIN_000004,USER_00000,BOOK_180810,8,23.0,shewhoremembers,lindalayshuler,signet book,shewhoremembers,No_SUB,1980-1990,"(20, 25]","(18, 25]",sackville,new brunswick,canada


In [4]:
# Summarize the training frame: per-column dtype, non-null count, and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871393 entries, 0 to 871392
Data columns (total 17 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Unnamed: 0   871393 non-null  int64  
 1   ID           871393 non-null  object 
 2   User-ID      871393 non-null  object 
 3   Book-ID      871393 non-null  object 
 4   Book-Rating  871393 non-null  int64  
 5   Age          871393 non-null  float64
 6   Book-Title   871393 non-null  object 
 7   Book-Author  871386 non-null  object 
 8   Publisher    871393 non-null  object 
 9   Main_Title   871393 non-null  object 
 10  Sub_Title    871393 non-null  object 
 11  Pub_gb       871393 non-null  object 
 12  Age_group    871393 non-null  object 
 13  Age_group2   871393 non-null  object 
 14  city         857346 non-null  object 
 15  state        853108 non-null  object 
 16  country      860205 non-null  object 
dtypes: float64(1), int64(2), object(14)
memory usage: 113.0+ MB


In [5]:
# Feature matrix: drop the row ID, the book ID, the target column, the leftover
# index column, and Age_group2. For now only Age_group (the 5-year age binning)
# is used as the age-bucket feature.
X_train = train.drop(columns=['ID', 'Book-ID', 'Book-Rating', 'Age_group2', 'Unnamed: 0'])
# NOTE(review): `test` is not defined in any cell visible here — confirm it is
# loaded before this cell runs.
X_test = test.drop(columns=['ID', 'Book-Title'])
# Regression target.
y_train = train['Book-Rating']

In [8]:
from catboost import CatBoostRegressor, Pool

# Hyperparameters for the CatBoost regressor, gathered in one place so they can
# be inspected or logged alongside the model.
model_params = {
    'random_seed': 113,
    'l2_leaf_reg': 0.003426034644149707,
    'max_bin': 358,
    'subsample': 0.9974697184313627,
    'learning_rate': 0.009464402227606937,
    'max_depth': 11,
    'min_data_in_leaf': 139,
    'eval_metric': 'RMSE',
    'iterations': 8694,
    'task_type': 'GPU',
    'bootstrap_type': 'Poisson',
    'early_stopping_rounds': 100,
    'verbose': 500,
}

model = CatBoostRegressor(**model_params)

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score
# tqdm is used for the fold progress bar but is not imported in the visible cells.
from tqdm.auto import tqdm

k = 20 # a number of folds best is 20
skfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=113)

# Columns passed to CatBoost as categorical features (cast to int below).
# NOTE(review): 'City', 'State', 'Country' and 'Age_gb' do not appear in the
# columns printed by train.info() above, and the displayed values are strings —
# presumably a renaming / label-encoding step happens in a cell not shown here;
# confirm before running on a fresh kernel.
FEATURE = ['User-ID', 'Main_Title','Sub_Title','Book-Author','Publisher', 'City','State','Country','Age_gb']


def score_model(model,X_train, X_test, y_train, y_test,
           show_plot=True):
    """Print and return R2 / MSE / RMSE of ``model`` on a held-out set.

    Predictions are clipped to the [0, 10] range before scoring.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Unused; kept so existing call sites keep working.
        X_test: Features to predict on.
        y_train: Unused; kept so existing call sites keep working.
        y_test: True targets aligned with ``X_test``.
        show_plot: When truthy, draw a bar chart of up to 25 randomly sampled
            actual/predicted pairs and print up to 10 of them.

    Returns:
        Dict with keys ``test_score_r2``, ``test_score_mse`` and
        ``test_score_rmse``.
    """
    y_pred = np.clip(model.predict(X_test),0,10)

    # Compute each metric once and reuse it for printing and the return value.
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print(f"Test score: {r2}")
    print("MSE: ", mse)
    print("RMSE: ", rmse)

    all_pairs = pd.DataFrame({'Actual': y_test.tolist(), 'Predicted': y_pred.tolist()})
    # Cap the sample sizes so small evaluation sets do not raise.
    predictions_comparision = all_pairs.sample(min(25, len(all_pairs)))
    if show_plot:
        predictions_comparision.plot(kind="bar", figsize=(12,8),title="Actual vs predicted values")
        print(predictions_comparision.sample(min(10, len(predictions_comparision))))

    return {
        "test_score_r2" : r2,
        "test_score_mse" : mse,
        "test_score_rmse" : rmse
        }


# Out-of-fold predictions for the training rows (filled fold by fold) and the
# running sum of test-set predictions (averaged after the loop).
y_valid_pred = (0*y_train).astype(float)
y_test_pred = 0

# Prepare the test features once: same columns as the training matrix, with the
# categorical columns cast to int. Working on a copy leaves X_test untouched.
# The original referenced `x_test`, which is not defined in the visible cells;
# `X_test` from the feature-split cell is assumed to be the intended frame.
X_test_model = X_test[X_train.columns].copy()
X_test_model[FEATURE] = X_test_model[FEATURE].astype('int')

for i, (train_index, test_index) in tqdm(enumerate(skfold.split(X_train, y_train)), total=k):
    # Copy the slices so the dtype casts below do not write into a view of X_train.
    X_train_fold = X_train.iloc[train_index].copy()
    X_valid_fold = X_train.iloc[test_index].copy()
    y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    X_train_fold[FEATURE] = X_train_fold[FEATURE].astype('int')
    X_valid_fold[FEATURE] = X_valid_fold[FEATURE].astype('int')

    train_pool = Pool(data=X_train_fold, label=y_train_fold, cat_features=FEATURE)
    valid_pool = Pool(data=X_valid_fold, label=y_valid_fold, cat_features=FEATURE)

    print( "\nFold ", i)

    fit_model = model.fit(train_pool,
                          eval_set=valid_pool,
                          use_best_model=True
                          )
    print( "  N trees = ", model.tree_count_ )

    score_model(fit_model, X_train_fold, X_valid_fold, y_train_fold, y_valid_fold, show_plot=True)

    # Store this fold's clipped validation predictions as out-of-fold predictions.
    y_valid_pred.iloc[test_index] = np.clip(fit_model.predict(valid_pool),0.0,10.0)

    # Predict value Clipping
    y_test_pred +=  np.clip(fit_model.predict(X_test_model),0.0,10.0)

y_test_pred /= k  # Average test set predictions
