<a href="https://colab.research.google.com/github/yujinc129-oss/Drama_data_project/blob/main/notebooks/05_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [108]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn. ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import PolynomialFeatures

In [140]:
X = pd.read_json('https://github.com/yujinc129-oss/Drama_data_project/raw/refs/heads/main/data/processed/X')
y = pd.read_json('https://github.com/yujinc129-oss/Drama_data_project/raw/refs/heads/main/data/processed/y')

In [141]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2177 entries, 0 to 2176
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                2177 non-null   int64  
 1   action             2177 non-null   int64  
 2   comedy             2177 non-null   int64  
 3   drama              2177 non-null   int64  
 4   etc                2177 non-null   int64  
 5   food               2177 non-null   int64  
 6   hist_war           2177 non-null   int64  
 7   music              2177 non-null   int64  
 8   romance            2177 non-null   int64  
 9   sf                 2177 non-null   int64  
 10  society            2177 non-null   int64  
 11  sports             2177 non-null   int64  
 12  thriller           2177 non-null   int64  
 13  youth              2177 non-null   int64  
 14  friday             2177 non-null   int64  
 15  monday             2177 non-null   int64  
 16  saturday           2177 

In [142]:
#변수 제거
X= X.drop(columns=['gender_Male', 'gender_Female','role_Main Role','role_Support Role','etc','CJ'])

In [143]:
#train/test split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

#Linear regression

In [8]:
linear = LinearRegression()

In [11]:
cv = cross_validate(linear, X_tr, y_tr, return_train_score=True)

In [14]:
print('train_score : ',cv['train_score'].mean())
print('test_score : ',cv['test_score'].mean())

train_score :  0.22560566805125498
test_score :  0.186926613640018


#Polynoial regression

In [16]:
poly = PolynomialFeatures(degree=2, include_bias=False)

In [17]:
poly_reg = Pipeline([('poly',poly),('linear',linear)])

In [18]:
cv =cross_validate(poly_reg, X_tr, y_tr, n_jobs=-1, return_train_score=True)

In [19]:
print('train_score : ',cv['train_score'].mean())
print('test_score : ',cv['test_score'].mean())

train_score :  0.7625879952039346
test_score :  -633.8871701049375


In [30]:
param = {'poly__degree' : np.arange(2,5)}

gv = GridSearchCV(poly_reg, param, n_jobs=-1, return_train_score=True)

gv.fit(X_tr, y_tr)

In [31]:
gv.best_params_

{'poly__degree': np.int64(2)}

In [32]:
gv.best_score_

np.float64(-633.8871701049375)

#Ridge regression

In [33]:
from sklearn.linear_model import Ridge

In [35]:
ridge = Ridge()

In [37]:
poly_reg = Pipeline([('poly',poly), ('ridge',ridge)])

In [38]:
# cross valid
cv = cross_validate(poly_reg, X_tr, y_tr, n_jobs=-1, return_train_score=True)

In [39]:
print('train_score : ',cv['train_score'].mean())
print('test_score : ',cv['test_score'].mean())

train_score :  0.6781730436239048
test_score :  0.4919294278648426


In [42]:
#grid search
param ={'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

gv = GridSearchCV(poly_reg, param, n_jobs=-1, return_train_score=True)

gv.fit(X_tr, y_tr)

In [43]:
gv.best_params_

{'ridge__alpha': 1}

In [47]:
gv.cv_results_

{'mean_fit_time': array([0.07151423, 0.0701901 , 0.07191453, 0.06983318, 0.0750504 ,
        0.07430048, 0.06483598]),
 'std_fit_time': array([0.00186041, 0.00266272, 0.00161743, 0.00133723, 0.0052931 ,
        0.00324736, 0.01169376]),
 'mean_score_time': array([0.00812454, 0.00867834, 0.00769596, 0.00778589, 0.00830193,
        0.00776505, 0.00733104]),
 'std_score_time': array([0.00089254, 0.00164332, 0.00044474, 0.0002944 , 0.00051101,
        0.00042876, 0.0013952 ]),
 'param_ridge__alpha': masked_array(data=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
              mask=[False, False, False, False, False, False, False],
        fill_value=1e+20),
 'params': [{'ridge__alpha': 0.001},
  {'ridge__alpha': 0.01},
  {'ridge__alpha': 0.1},
  {'ridge__alpha': 1},
  {'ridge__alpha': 10},
  {'ridge__alpha': 100},
  {'ridge__alpha': 1000}],
 'split0_test_score': array([0.31257001, 0.4215187 , 0.47341869, 0.49574413, 0.44069822,
        0.31979935, 0.23390353]),
 'split1_test_score': array(

# Lasso Regression

In [48]:
from sklearn.linear_model import Lasso

In [49]:
poly_lasso= Pipeline([('poly',poly), ('lasso',Lasso())])

In [50]:
#cross valid

cv=cross_validate(poly_lasso, X_tr, y_tr, return_train_score=True)

print('train_score : ',cv['train_score'].mean())
print('test_score : ',cv['test_score'].mean())

train_score :  0.22248506395787232
test_score :  0.186810422902477


In [79]:
#grid search

param ={'lasso__alpha': np.logspace(-6, -1, 10)
        }

gv = GridSearchCV(poly_lasso, param, n_jobs=-1, return_train_score=True)

gv.fit(X_tr, y_tr)

  model = cd_fast.enet_coordinate_descent(


In [80]:
gv.best_params_

{'lasso__alpha': np.float64(0.0001668100537200059)}

In [81]:
gv.best_score_

np.float64(0.4801523438310006)

In [82]:
print(gv.cv_results_['mean_train_score'])
print(gv.cv_results_['mean_test_score'])

[0.70760511 0.70670119 0.70257075 0.69019595 0.65888562 0.56130266
 0.37959186 0.2486143  0.23526838 0.22572395]
[0.29108176 0.33693716 0.41300585 0.47828637 0.48015234 0.41151259
 0.2785374  0.18183876 0.18581559 0.18677177]


#ElasticNet

In [72]:
from sklearn.linear_model import ElasticNet

In [74]:
pipe_ela = Pipeline([('poly',poly), ('ela',ElasticNet())])

In [83]:
#GridSearch
params={'ela__alpha':np.logspace(-6, -1, 10)}
gv = GridSearchCV(pipe_ela, params, n_jobs=-1, return_train_score = True)

gv.fit(X_tr, y_tr)

  model = cd_fast.enet_coordinate_descent(


In [84]:
gv.best_params_

{'ela__alpha': np.float64(0.0001668100537200059)}

In [85]:
gv.best_score_

np.float64(0.48812440952327735)

In [86]:
print(gv.cv_results_['mean_train_score'])
print(gv.cv_results_['mean_test_score'])

[0.70769031 0.70710839 0.70462713 0.69625771 0.67775927 0.62048837
 0.47629877 0.29487576 0.24193559 0.23001211]
[0.28627587 0.3330747  0.39186424 0.46198431 0.48812441 0.45639684
 0.34692486 0.2117622  0.18277652 0.18718751]


#SVR

In [87]:
from sklearn.svm import SVR

In [None]:
pipe_svr = Pipeline([('poly',poly), ('svr',SVR())])

#GridSearch
param = [
    {
        'svr__kernel': ['rbf'],
        'svr__C': [0.1, 1, 10, 100],
        'svr__gamma': [0.001, 0.01, 0.1, 1]
    },
    {
        'svr__kernel': ['linear'],
        'svr__C': [0.1, 1, 10, 100]
    },
    {
        'svr__kernel': ['poly'],
        'svr__C': [0.1, 1, 10],
        'svr__degree': [2, 3] # poly 커널은 degree를 튜닝
    }]
gv = GridSearchCV(pipe_ela, params, n_jobs=-1, return_train_score = True)

gv.fit(X_tr, y_tr)



In [90]:
results_df = pd.DataFrame(gv.cv_results_)

# 원하는 컬럼만 선택하여 정렬할 수도 있습니다.
results_df = results_df.sort_values(by='rank_test_score')
print(results_df[[
    'params',
    'mean_test_score',
    'mean_train_score',
]].head())

# CSV 파일로 저장 (파일명에 모델과 날짜를 포함하면 좋습니다)
from datetime import datetime
today_str = datetime.now().strftime('%Y%m%d')
results_df.to_csv(f'svr_tuning_results_{today_str}.csv', index=False)

                                   params  mean_test_score  mean_train_score
4   {'ela__alpha': 0.0001668100537200059}         0.488124          0.677759
3   {'ela__alpha': 4.641588833612782e-05}         0.461984          0.696258
5   {'ela__alpha': 0.0005994842503189409}         0.456397          0.620488
2  {'ela__alpha': 1.2915496650148827e-05}         0.391864          0.704627
6   {'ela__alpha': 0.0021544346900318843}         0.346925          0.476299


#Graident Boosting

In [92]:
from sklearn.ensemble import GradientBoostingRegressor

In [98]:
pipe_gb = Pipeline([('poly',poly), ('gb',GradientBoostingRegressor())])

params={'gb__n_estimators': [200],
    'gb__learning_rate': [0.1],
        'gb__max_depth':[3,4,5],
        'gb__subsample': [0.8, 0.9]}

gv = GridSearchCV(pipe_gb, params, n_jobs=-1, return_train_score = True)

gv.fit(X_tr, y_tr)

  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


In [99]:
gv.best_params_

{'gb__learning_rate': 0.1,
 'gb__max_depth': 5,
 'gb__n_estimators': 200,
 'gb__subsample': 0.9}

In [100]:
gv.best_score_

np.float64(0.7397920580045497)

In [101]:
print(gv.cv_results_['mean_train_score'])
print(gv.cv_results_['mean_test_score'])

[0.80640734 0.7985832  0.90813622 0.90364147 0.96105552 0.95823404]
[0.59947109 0.60883014 0.69462947 0.6969149  0.73497577 0.73979206]


#StackingRegressor

In [104]:
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor

In [106]:
# 1. 1단계 모델(Base Models) 정의
# 이전에 찾은 최적의 하이퍼파라미터를 사용합니다.
estimators = [
    ('rf', RandomForestRegressor(n_estimators=600, max_depth=18, min_samples_leaf=1, random_state=42, n_jobs=-1)),
    ('xgb', XGBRegressor(n_estimators=600, gamma=0.01, lmbda=0.05, max_depth=13, learning_rate=0.05, colsample_bytree=0.8, subsample=0.9, random_state=42))
]

# 2. 2단계 모델(Meta-Model) 및 StackingRegressor 생성
# final_estimator에는 2단계 모델을, cv는 교차 검증 폴드 수를 지정합니다.
stacking_model = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge(), # 2단계 모델로 Ridge 회귀 사용
    cv=5, # 데이터 유출 방지를 위한 내부 교차 검증 횟수
    n_jobs=-1
)

# 3. 모델 학습
# fit 한 줄이면 내부적으로 복잡한 교차 검증과 학습이 모두 이루어집니다.
print("스태킹 모델 학습을 시작합니다...")
stacking_model.fit(X_tr, y_tr)
print("학습 완료!")

# 4. 성능 평가
train_score = stacking_model.score(X_tr, y_tr)
test_score = stacking_model.score(X_tr, y_tr)

print(f"\n스태킹 모델 Train Score: {train_score:.6f}")
print(f"스태킹 모델 Test Score: {test_score:.6f}")
print(f"Train-Test 격차: {train_score - test_score:.6f}")

스태킹 모델 학습을 시작합니다...


  y = column_or_1d(y, warn=True)


학습 완료!

스태킹 모델 Train Score: 0.975901
스태킹 모델 Test Score: 0.975901
Train-Test 격차: 0.000000


In [110]:
scores = cross_val_score(stacking_model, X_tr, y_tr, cv=5, scoring='r2', n_jobs=-1)
print("교차 검증 완료!")

교차 검증 완료!


In [111]:
print(f"\n각 폴드별 검증 점수(R2): \n{scores}")
print(f"\n평균 검증 점수: {np.mean(scores):.6f}")
print(f"검증 점수 표준편차: {np.std(scores):.6f}")


각 폴드별 검증 점수(R2): 
[0.71892456 0.80914102 0.79300853 0.77785298 0.7699751 ]

평균 검증 점수: 0.773780
검증 점수 표준편차: 0.030530


#Randomforest

In [144]:
randomf = RandomForestRegressor()
cv = cross_validate(randomf, X_tr, y_tr, n_jobs=-1, return_train_score=True)

In [146]:
print('train_score : ',cv['train_score'].mean())
print('test_score : ',cv['test_score'].mean())

train_score :  0.9643876516972867
test_score :  0.7541632353164778


In [168]:

params = { 'n_estimators': [300, 500, 700],  # 이전 최적값으로 고정
    'max_depth': [10, 15, 20,25, 30], # 이전 최적값(18)보다 의도적으로 낮게 설정
    'min_samples_leaf': [2, 3, 4, 5],                 # 간격을 두고 3개 값 테스트
    #'min_samples_split' : [2, 5, 10],                # 기존 범위에서 일부만 선택
    #'max_features': ['sqrt']
           }

grid = GridSearchCV(randomf, params, n_jobs=-1, cv= 5, return_train_score=True,)
grid.fit(X_tr, y_tr)


  return fit_method(estimator, *args, **kwargs)


In [169]:
grid.best_params_

{'max_depth': 30, 'min_samples_leaf': 2, 'n_estimators': 700}

In [170]:
grid.best_score_

np.float64(0.6992935382906633)

In [171]:
results = pd.DataFrame(grid.cv_results_)
results['train-test'] = results['mean_train_score'] - results['mean_test_score']
results_10 = results[['param_max_depth', 'param_min_samples_leaf', 'param_n_estimators',
       'mean_train_score','mean_test_score', 'train-test']].sort_values(by=['mean_test_score','train-test'], ascending=[False, True]).head(10)


results_10

Unnamed: 0,param_max_depth,param_min_samples_leaf,param_n_estimators,mean_train_score,mean_test_score,train-test
50,30,2,700,0.926626,0.699294,0.227332
24,20,2,300,0.926424,0.698305,0.228119
36,25,2,300,0.926137,0.698094,0.228043
49,30,2,500,0.926046,0.697904,0.228142
38,25,2,700,0.926101,0.69736,0.22874
12,15,2,300,0.923458,0.697206,0.226252
26,20,2,700,0.926254,0.697199,0.229055
25,20,2,500,0.925806,0.695667,0.230139
14,15,2,700,0.923431,0.695532,0.227899
37,25,2,500,0.925923,0.694681,0.231242


param_max_depth            20.000000
param_min_samples_leaf      2.000000
param_n_estimators        500.000000
mean_train_score            0.926063
mean_test_score             0.697916
train-test                  0.228148
Name: 19, dtype: float64


In [172]:
history = pd.DataFrame(results_10.iloc[0])

In [174]:
history

Unnamed: 0,50
param_max_depth,30.0
param_min_samples_leaf,2.0
param_n_estimators,700.0
mean_train_score,0.926626
mean_test_score,0.699294
train-test,0.227332


In [175]:
best_model = grid.best_estimator_

# 2. 변수 중요도를 계산합니다.
importances = best_model.feature_importances_

# 3. 컬럼 이름은 학습 데이터(X_tr)에서 바로 가져옵니다.
feature_names = X_tr.columns

# 4. 컬럼 이름과 중요도를 합쳐서 보기 좋게 출력합니다.
feature_importances = pd.Series(importances, index=feature_names).sort_values(ascending=False)

print("--- 변수 중요도 (상위 20개) ---")
print(feature_importances)

--- 변수 중요도 (상위 20개) ---
start airing     0.144242
platform_tier    0.084486
season           0.082218
age              0.066795
sf               0.043416
drama            0.041071
comedy           0.040710
NETFLIX          0.040173
society          0.037999
thriller         0.033954
main_ratio       0.033597
hist_war         0.029410
action           0.028179
romance          0.027269
TVN              0.026010
friday           0.024003
youth            0.019905
tuesday          0.018144
JTBC             0.017254
saturday         0.016060
SBS              0.015783
monday           0.015467
KBS              0.014616
sunday           0.013263
wednesday        0.013226
thursday         0.012673
food             0.010802
WEB              0.009960
MBC              0.009501
sports           0.009096
OCN              0.008981
music            0.007291
JPP              0.004446
dtype: float64


#XGBOOST

In [176]:
from xgboost import XGBRegressor

In [177]:
xg = XGBRegressor()

In [178]:
cv = cross_validate(xg,X_tr, y_tr, n_jobs= -1, return_train_score=True )
print(cv['train_score'].mean())
print(cv['test_score'].mean())

0.9899096131324768
0.7715003252029419


In [183]:
params = {
    'learning_rate': [0.05],  # 이전 최적값으로 고정하여 탐색 시간 단축
    'max_depth': [10, 12, 14],      # 이전 최적값(12-14) 주변과 더 단순한 모델 가능성 탐색
    'n_estimators': [600],          # 고정
    'subsample': [0.9],             # 고정
    'colsample_bytree': [0.7, 0.8, 0.9], # 1 이하의 유효한 값으로 수정 및 탐색

    # 규제 파라미터의 영향력을 더 명확히 보기 위해 범위를 약간 넓힘
    'gamma': [0, 0.1, 0.2],
    'lambda': [0, 1, 2]
}

grid = GridSearchCV(xg, params, n_jobs=-1, cv= 5, return_train_score=True)
grid.fit(X_tr, y_tr)

In [184]:
grid.best_params_

{'colsample_bytree': 0.8,
 'gamma': 0,
 'lambda': 0,
 'learning_rate': 0.05,
 'max_depth': 10,
 'n_estimators': 600,
 'subsample': 0.9}

In [185]:
grid.best_score_

np.float64(0.8415792107582092)

In [186]:
results = pd.DataFrame(grid.cv_results_)
results['train-test'] = results['mean_train_score'] - results['mean_test_score']
results_10 = results[['param_colsample_bytree', 'param_learning_rate', 'param_max_depth',
        'param_subsample','mean_train_score', 'mean_test_score','train-test']].sort_values(by=['mean_test_score','train-test'], ascending=[False, True]).head(10)

results_10

Unnamed: 0,param_colsample_bytree,param_learning_rate,param_max_depth,param_subsample,mean_train_score,mean_test_score,train-test
27,0.8,0.05,10,0.9,0.999988,0.841579,0.158409
28,0.8,0.05,12,0.9,0.999991,0.839262,0.160729
31,0.8,0.05,12,0.9,0.999988,0.837873,0.162115
35,0.8,0.05,14,0.9,0.999988,0.835752,0.164236
32,0.8,0.05,14,0.9,0.99999,0.835651,0.164338
29,0.8,0.05,14,0.9,0.999994,0.835601,0.164393
0,0.7,0.05,10,0.9,0.999987,0.833297,0.166689
34,0.8,0.05,12,0.9,0.999986,0.833127,0.166858
30,0.8,0.05,10,0.9,0.999984,0.831569,0.168416
33,0.8,0.05,10,0.9,0.999982,0.831197,0.168785
