In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV

from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

In [29]:
df = pd.read_csv('input_data.csv', index_col=0)
print(df.shape)

df = pd.get_dummies(df, columns=['Category', 'Installs','Content Rating'], drop_first=True)

print(df.shape)
df.dropna(inplace=True)
print(df.shape)

(1187210, 8)
(1187210, 59)
(1187198, 59)


In [9]:
X = df.drop(['Rating'], axis=1).values
y = df.Rating.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2) 
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=2)

print(X_train.shape, X_val.shape, X_test.shape)
print(y_train.shape, y_val.shape, y_test.shape)

(759806, 58) (189952, 58) (237440, 58)
(759806,) (189952,) (237440,)


In [10]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [11]:
#최종모델과 성능을 비교할 기준모델
y_pred = [y_train.mean()] * len(y_train)

print(f'base model RMSE : {mean_squared_error(y_train, y_pred)**0.5}')
#base model RMSE score: 0.6895426039410417

base model RMSE : 0.6895426039410417


## 다양한 회귀모델 비교
+ 학습 : train 데이터셋(1338134개) 사용
+ 평가
    + validation 데이터셋(334534개)으로 모델 성능 측정
    + test 데이터셋(418168개)으로 best model의 성능 측정
    + 지표는 RMSE, R2 score 사용
        + RMSE : 0에 가까울수록 좋은 성능
        + R2 score : 1에 가까울수록 좋은 성능

In [12]:
#모델별 cv 점수 저장할 list
score_list = [['model', 'train_RMSE', 'train_R2', 'val_RMSE', 'val_R2', 'cv_score'],
             ['LinearRegression'], 
             ['ElasticNet'], 
             ['Decision Tree'], 
             ['Random Forest'], 
             ['GBM'], 
             ['LGBM'], 
             ['XGB'],
             ['Bagging']]

In [13]:
#LinearRegression
lr = LinearRegression(n_jobs=8)
lr.fit(X_train, y_train)
y_train_pred = lr.predict(X_train)

#train
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
r2_train = r2_score(y_train, y_train_pred)

print(f"LinearRegression train RMSE : {rmse_train}")
print(f"LinearRegression train R2 : {r2_train}\n")

#validation
y_val_pred = lr.predict(X_val)

mse_test = mean_squared_error(y_val, y_val_pred)
rmse_test = mse_train ** 0.5
r2_test = r2_score(y_val, y_val_pred)

print(f"LinearRegression val RMSE: {rmse_test}")
print(f"LinearRegression val R2: {r2_test}\n")

# cv
scores = cross_val_score(lr, X, y, scoring='neg_mean_squared_error', cv=10, n_jobs=8)
rmse_scores = (-scores) ** 0.5
cv_score = rmse_scores.mean()

print(f"LinearRegression cross validation RMSE: {cv_score}")

score_list[1].append(rmse_train)
score_list[1].append(r2_train)
score_list[1].append(rmse_test)
score_list[1].append(r2_test)
score_list[1].append(cv_score)

LinearRegression train RMSE : 0.6480135101947617
LinearRegression train R2 : 0.11682673937793897

LinearRegression val RMSE: 0.6480135101947617
LinearRegression val R2: 0.1190980383561937

LinearRegression cross validation RMSE: 0.6488470589866715


In [15]:
#ElasticNet
elastic = ElasticNet(random_state=2)
elastic.fit(X_train, y_train)
y_train_pred = elastic.predict(X_train)

#train
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
r2_train = r2_score(y_train, y_train_pred)

print(f"ElasticNet train RMSE : {rmse_train}")
print(f"ElasticNet train R2 : {r2_train}\n")

#validation
y_val_pred = elastic.predict(X_val)

mse_test = mean_squared_error(y_val, y_val_pred)
rmse_test = mse_train ** 0.5
r2_test = r2_score(y_val, y_val_pred)

print(f"ElasticNet val RMSE: {rmse_test}")
print(f"ElasticNet val R2: {r2_test}\n")

# cv
scores = cross_val_score(elastic, X, y, scoring='neg_mean_squared_error', cv=10, n_jobs=8)
rmse_scores = (-scores) ** 0.5
cv_score = rmse_scores.mean()

print(f"ElasticNet cross validation RMSE: {cv_score}")

score_list[2].append(rmse_train)
score_list[2].append(r2_train)
score_list[2].append(rmse_test)
score_list[2].append(r2_test)
score_list[2].append(cv_score)

ElasticNet train RMSE : 0.6895426039410417
ElasticNet train R2 : 0.0

ElasticNet val RMSE: 0.6895426039410417
ElasticNet val R2: -9.849174564147134e-06

ElasticNet cross validation RMSE: 0.6898189747445775


In [16]:
#Decision Tree
dt = DecisionTreeRegressor(max_depth = 8, max_features = 'sqrt')
dt.fit(X_train, y_train)
y_train_pred = dt.predict(X_train)

#train
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
r2_train = r2_score(y_train, y_train_pred)

print(f"Decision Tree train RMSE : {rmse_train}")
print(f"Decision Tree train R2 : {r2_train}\n")

#validation
y_val_pred = dt.predict(X_val)

mse_test = mean_squared_error(y_val, y_val_pred)
rmse_test = mse_train ** 0.5
r2_test = r2_score(y_val, y_val_pred)

print(f"Decision Tree val RMSE: {rmse_test}")
print(f"Decision Tree val R2: {r2_test}\n")

# cv
scores = cross_val_score(dt, X, y, scoring='neg_mean_squared_error', cv=10, n_jobs=8)
rmse_scores = (-scores) ** 0.5
cv_score = rmse_scores.mean()

print(f"Decision Tree cross validation RMSE: {cv_score}")

score_list[3].append(rmse_train)
score_list[3].append(r2_train)
score_list[3].append(rmse_test)
score_list[3].append(r2_test)
score_list[3].append(cv_score)

Decision Tree train RMSE : 0.6632883260861491
Decision Tree train R2 : 0.07470013592828761

Decision Tree val RMSE: 0.6632883260861491
Decision Tree val R2: 0.0756181161351841

Decision Tree cross validation RMSE: 0.666572074796214


In [17]:
#Random Forest
rf = RandomForestRegressor(max_depth = 20, n_estimators = 100)
rf.fit(X_train,y_train)
y_train_pred = rf.predict(X_train)

#train
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
r2_train = r2_score(y_train, y_train_pred)

print(f"Random Forest train RMSE : {rmse_train}")
print(f"Random Forest train R2 : {r2_train}\n")

#validation
y_val_pred = rf.predict(X_val)

mse_test = mean_squared_error(y_val, y_val_pred)
rmse_test = mse_train ** 0.5
r2_test = r2_score(y_val, y_val_pred)

print(f"Random Forest val RMSE: {rmse_test}")
print(f"Random Forest val R2: {r2_test}\n")

# cv
scores = cross_val_score(rf, X, y, scoring='neg_mean_squared_error', cv=10, n_jobs=8)
rmse_scores = (-scores) ** 0.5
cv_score = rmse_scores.mean()

print(f"Random Forest cross validation RMSE: {cv_score}")

score_list[4].append(rmse_train)
score_list[4].append(r2_train)
score_list[4].append(rmse_test)
score_list[4].append(r2_test)
score_list[4].append(cv_score)

Random Forest train RMSE : 0.5690372232384866
Random Forest train R2 : 0.31898113309930065

Random Forest val RMSE: 0.5690372232384866
Random Forest val R2: 0.16481227060532433

Random Forest cross validation RMSE: 0.6307684282781312


In [18]:
#GBM
gbm = GradientBoostingRegressor(learning_rate=0.1, max_depth=8, n_estimators=30, random_state=2)
gbm.fit(X_train,y_train)
y_train_pred = gbm.predict(X_train)

#train
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
r2_train = r2_score(y_train, y_train_pred)

print(f"GBM train RMSE : {rmse_train}")
print(f"GBM train R2 : {r2_train}\n")

#validation
y_val_pred = gbm.predict(X_val)

mse_test = mean_squared_error(y_val, y_val_pred)
rmse_test = mse_train ** 0.5
r2_test = r2_score(y_val, y_val_pred)

print(f"GBM val RMSE: {rmse_test}")
print(f"GBM val R2: {r2_test}\n")

# cv
scores = cross_val_score(gbm, X, y, scoring='neg_mean_squared_error', cv=10, n_jobs=8)
rmse_scores = (-scores) ** 0.5
cv_score = rmse_scores.mean()

print(f"GBM cross validation RMSE: {cv_score}")

score_list[5].append(rmse_train)
score_list[5].append(r2_train)
score_list[5].append(rmse_test)
score_list[5].append(r2_test)
score_list[5].append(cv_score)

GBM train RMSE : 0.6293961231878967
GBM train R2 : 0.16684478341118836

GBM val RMSE: 0.6293961231878967
GBM val R2: 0.16221125303647121

GBM cross validation RMSE: 0.6328415487456673


In [19]:
#lgbm 
lgbm = LGBMRegressor(random_state=2, n_jobs=8)
lgbm.fit(X_train, y_train)
y_train_pred = lgbm.predict(X_train)

#train
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
r2_train = r2_score(y_train, y_train_pred)

print(f"LGBM train RMSE : {rmse_train}")
print(f"LGBM train R2 : {r2_train}\n")

#validation
y_val_pred = lgbm.predict(X_val)

mse_test = mean_squared_error(y_val, y_val_pred)
rmse_test = mse_train ** 0.5
r2_test = r2_score(y_val, y_val_pred)

print(f"LGBM val RMSE: {rmse_test}")
print(f"LGBM val R2: {r2_test}\n")

# cv
scores = cross_val_score(lgbm, X, y, scoring='neg_mean_squared_error', cv=10, n_jobs=8)
rmse_scores = (-scores) ** 0.5
cv_score = rmse_scores.mean()

print(f"LGBM cross validation RMSE: {cv_score}")

score_list[6].append(rmse_train)
score_list[6].append(r2_train)
score_list[6].append(rmse_test)
score_list[6].append(r2_test)
score_list[6].append(cv_score)

LGBM train RMSE : 0.6279584241448293
LGBM train R2 : 0.17064670828835682

LGBM val RMSE: 0.6279584241448293
LGBM val R2: 0.17097708117025168

LGBM cross validation RMSE: 0.6298078058056982


In [20]:
#XGB 
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
y_train_pred = xgb.predict(X_train)

#train
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
r2_train = r2_score(y_train, y_train_pred)

print(f"XGB train RMSE : {rmse_train}")
print(f"XGB train R2 : {r2_train}\n")

#validation
y_val_pred = xgb.predict(X_val)

mse_test = mean_squared_error(y_val, y_val_pred)
rmse_test = mse_train ** 0.5
r2_test = r2_score(y_val, y_val_pred)

print(f"XGB val RMSE: {rmse_test}")
print(f"XGB val R2: {r2_test}\n")

# cv
scores = cross_val_score(xgb, X, y, scoring='neg_mean_squared_error', cv=10, n_jobs=8)
rmse_scores = (-scores) ** 0.5
cv_score = rmse_scores.mean()

print(f"XGB cross validation RMSE: {cv_score}")

score_list[7].append(rmse_train)
score_list[7].append(r2_train)
score_list[7].append(rmse_test)
score_list[7].append(r2_test)
score_list[7].append(cv_score)

XGB train RMSE : 0.6215052573153973
XGB train R2 : 0.18760469616736453

XGB val RMSE: 0.6215052573153973
XGB val R2: 0.1757911744168451

XGB cross validation RMSE: 0.6275753145752293


In [21]:
#BaggingRegressor 
bagging = BaggingRegressor()
bagging.fit(X_train, y_train)
y_train_pred = xgb.predict(X_train)

#train
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
r2_train = r2_score(y_train, y_train_pred)

print(f"Bagging train RMSE : {rmse_train}")
print(f"Bagging train R2 : {r2_train}\n")

#validation
y_val_pred = bagging.predict(X_val)

mse_test = mean_squared_error(y_val, y_val_pred)
rmse_test = mse_train ** 0.5
r2_test = r2_score(y_val, y_val_pred)

print(f"Bagging val RMSE: {rmse_test}")
print(f"Bagging val R2: {r2_test}\n")

# cv
scores = cross_val_score(bagging, X, y, scoring='neg_mean_squared_error', cv=10, n_jobs=8)
rmse_scores = (-scores) ** 0.5
cv_score = rmse_scores.mean()

print(f"Bagging cross validation RMSE: {cv_score}")

score_list[8].append(rmse_train)
score_list[8].append(r2_train)
score_list[8].append(rmse_test)
score_list[8].append(r2_test)
score_list[8].append(cv_score)

Bagging train RMSE : 0.6215052573153973
Bagging train R2 : 0.18760469616736453

Bagging val RMSE: 0.6215052573153973
Bagging val R2: 0.03604685660826956

Bagging cross validation RMSE: 0.6785742238820952


In [23]:
pd.DataFrame(score_list[1:], columns=score_list[0])

Unnamed: 0,model,train_RMSE,train_R2,val_RMSE,val_R2,cv_score
0,LinearRegression,0.648014,0.116827,0.648014,0.119098,0.648847
1,ElasticNet,0.689543,0.0,0.689543,-1e-05,0.689819
2,Decision Tree,0.663288,0.0747,0.663288,0.075618,0.666572
3,Random Forest,0.569037,0.318981,0.569037,0.164812,0.630768
4,GBM,0.629396,0.166845,0.629396,0.162211,0.632842
5,LGBM,0.627958,0.170647,0.627958,0.170977,0.629808
6,XGB,0.621505,0.187605,0.621505,0.175791,0.627575
7,Bagging,0.621505,0.187605,0.621505,0.036047,0.678574


### Best model
+ RandomForest, LGBM, XGB 세 모델의 성능이 유사하게 좋아보임
+ 세 모델 모두 baseline model의 성능(0.6895426039410417)보다 좋은 결과
+ 성능확인까지 걸린 시간이 각각 약 50분, 30초, 5분  
    => 시간이 가장 적게 걸린 LGBM 모델 선택  
    => RandomizedSearchCV를 이용해 하이퍼파라미터 튜닝

In [24]:
model = LGBMRegressor(random_state=2, n_jobs=8)
param_distribution = {"n_estimators": [15, 30, 50], 
                      "max_depth": [10, 30, 50], 
                      'learning_rate':[0.001, 0.005, 0.01, 0.1],
                      'normalize':[True, False]}

random_search = RandomizedSearchCV(model, param_distributions=param_distribution, 
                                 scoring='neg_mean_squared_error', n_jobs=8, cv=10, verbose=1,
                                 n_iter=2000, refit=True, random_state=2)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_

print("Best params : {}".format(random_search.best_params_))
print("Best RMSE : {}".format((-random_search.best_score_)**0.5))

model = random_search.best_estimator_

#Best params : {'n_estimators': 50, 'max_depth': 20, 'learning_rate': 0.1}
#Best RMSE : 0.47677192300834276

Fitting 10 folds for each of 72 candidates, totalling 720 fits
Best params : {'normalize': True, 'n_estimators': 50, 'max_depth': 30, 'learning_rate': 0.1}
Best RMSE : 0.6313311130844016


## 최종 모델 성능 평가
+ test 데이터셋 사용

In [26]:
y_test_pred = model.predict(X_test)

mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = mse_train ** 0.5
r2_test = r2_score(y_test, y_test_pred)

print(f"best model test RMSE: {rmse_test}")
print(f"best model test R2: {r2_test}\n")

best model test RMSE: 0.6215052573153973
best model test R2: 0.15971898762037473

