# 영화 관객수 예측

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv("./data/movies_train.csv")
test = pd.read_csv('./data/movies_test.csv')

# 배급사 전처리

In [3]:
train['distributor'] = train.distributor.str.replace("(주)", '')
test['distributor'] = test.distributor.str.replace("(주)", '')

In [4]:
def get_dis(x) :
    if 'CJ' in x or 'CGV' in x :
        return 'CJ'
    elif '쇼박스' in x :
        return '쇼박스'
    elif 'SK' in x :
        return 'SK'
    elif '리틀빅픽' in x :
        return '리틀빅픽처스'
    elif '스폰지' in x :
        return '스폰지'
    elif '싸이더스' in x :
        return '싸이더스'
    elif '에이원' in x :
        return '에이원'
    elif '마인스' in x :
        return '마인스'
    elif '마운틴픽' in x :
        return '마운틴픽처스'
    elif '디씨드' in x :
        return '디씨드'
    elif '드림팩트' in x :
        return '드림팩트'
    elif '메가박스' in x :
        return '메가박스'
    elif '마운틴' in x :
        return '마운틴'
    else :
        return x

In [5]:
train['distributor'] = train.distributor.apply(get_dis)
test['distributor'] = test.distributor.apply(get_dis)

In [6]:
train.groupby('genre').box_off_num.mean().sort_values()

genre
뮤지컬       6.627000e+03
다큐멘터리     6.717226e+04
서스펜스      8.261100e+04
애니메이션     1.819267e+05
멜로/로맨스    4.259680e+05
미스터리      5.275482e+05
공포        5.908325e+05
드라마       6.256898e+05
코미디       1.193914e+06
SF        1.788346e+06
액션        2.203974e+06
느와르       2.263695e+06
Name: box_off_num, dtype: float64

# 장르별 영화 관객수 평균값으로 랭크 인코딩

In [7]:
train.groupby('genre').box_off_num.mean().sort_values()

genre
뮤지컬       6.627000e+03
다큐멘터리     6.717226e+04
서스펜스      8.261100e+04
애니메이션     1.819267e+05
멜로/로맨스    4.259680e+05
미스터리      5.275482e+05
공포        5.908325e+05
드라마       6.256898e+05
코미디       1.193914e+06
SF        1.788346e+06
액션        2.203974e+06
느와르       2.263695e+06
Name: box_off_num, dtype: float64

In [8]:
train['genre_rank'] = train.genre.map({'뮤지컬' : 1, '다큐멘터리' : 2, '서스펜스' : 3, '애니메이션' : 4, '멜로/로맨스' : 5,
                                      '미스터리' : 6, '공포' : 7, '드라마' : 8, '코미디' : 9, 'SF' : 10, '액션' : 11, '느와르' : 12})
test['genre_rank'] = test.genre.map({'뮤지컬' : 1, '다큐멘터리' : 2, '서스펜스' : 3, '애니메이션' : 4, '멜로/로맨스' : 5,
                                      '미스터리' : 6, '공포' : 7, '드라마' : 8, '코미디' : 9, 'SF' : 10, '액션' : 11, '느와르' : 12})

In [9]:
tr_nm_rank = train.groupby('distributor').box_off_num.median().reset_index(name = 'num_rank').sort_values(by = 'num_rank')
tr_nm_rank

Unnamed: 0,distributor,num_rank
125,인피니티엔터테인먼트,2.0
51,고구마공작소,8.0
79,사람과 사람들,42.0
115,위드시네마,46.0
54,나우콘텐츠,54.0
...,...,...
128,전망좋은영화사,1214237.0
120,이십세기폭스코리아(),1422844.0
83,쇼박스,2138560.0
105,영구아트무비,2541603.0


In [10]:
tr_nm_rank['num_rank'] = [i + 1 for i in range(tr_nm_rank.shape[0])]
tr_nm_rank

Unnamed: 0,distributor,num_rank
125,인피니티엔터테인먼트,1
51,고구마공작소,2
79,사람과 사람들,3
115,위드시네마,4
54,나우콘텐츠,5
...,...,...
128,전망좋은영화사,148
120,이십세기폭스코리아(),149
83,쇼박스,150
105,영구아트무비,151


In [11]:
train = pd.merge(train, tr_nm_rank, how = 'left')

In [12]:
test = pd.merge(test, tr_nm_rank, how = 'left')

In [13]:
test.fillna(0, inplace = True)

# 모델링 데이터 구성
### 타겟 값인 관객수를 로그변환
### 상영등급은 더미 변수
### 출연 배우 수는 로그변환

In [41]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from ngboost import NGBRegressor

In [42]:
X = train[['num_rank', 'time', 'num_staff', 'num_actor', 'genre_rank', 'screening_rat']]
y = np.log1p(train.box_off_num)

In [43]:
X = pd.get_dummies(columns = ['screening_rat'], data = X)

In [44]:
X['num_actor'] = np.log1p(X['num_actor'])

In [45]:
target = test[['num_rank', 'time', 'num_staff', 'num_actor', 'genre_rank', 'screening_rat']]

In [46]:
target = pd.get_dummies(columns = ['screening_rat'], data = target)

In [47]:
target['num_actor'] = np.log1p(target['num_actor'])

In [48]:
kf = KFold(n_splits =  10, shuffle = True, random_state = 42)

In [49]:
gbm = GradientBoostingRegressor(random_state = 42)

In [50]:
rmse_list = []
gb_pred = np.zeros((test.shape[0]))
for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    gbm.fit(tr_x, tr_y)
    
    pred = np.expm1([0 if x < 0 else x for x in gbm.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in gbm.predict(target)])
    rmse = np.sqrt(mean_squared_error(val_y, pred))
    
    rmse_list.append(rmse)
    
    gb_pred += (sub_pred / 10)

In [51]:
np.mean(rmse_list)

1136799.2172945936

In [52]:
ngb = NGBRegressor(random_state = 518)

In [53]:
rmse_list = []
ngb_pred = np.zeros((test.shape[0]))
for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    ngb.fit(tr_x, tr_y)
    
    pred = np.expm1([0 if x < 0 else x for x in ngb.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in ngb.predict(target)])
    rmse = np.sqrt(mean_squared_error(val_y, pred))
    
    rmse_list.append(rmse)
    
    ngb_pred += (sub_pred / 10)

[iter 0] loss=2.6200 val_loss=0.0000 scale=1.0000 norm=2.9523
[iter 100] loss=1.9644 val_loss=0.0000 scale=2.0000 norm=2.7260
[iter 200] loss=1.5825 val_loss=0.0000 scale=2.0000 norm=2.1835
[iter 300] loss=1.4500 val_loss=0.0000 scale=1.0000 norm=1.0474
[iter 400] loss=1.3739 val_loss=0.0000 scale=0.5000 norm=0.5079
[iter 0] loss=1.4009 val_loss=0.0000 scale=1.0000 norm=1.0662
[iter 100] loss=1.3241 val_loss=0.0000 scale=1.0000 norm=1.0043
[iter 200] loss=1.2820 val_loss=0.0000 scale=0.5000 norm=0.4906
[iter 300] loss=1.2505 val_loss=0.0000 scale=1.0000 norm=0.9651
[iter 400] loss=1.2216 val_loss=0.0000 scale=1.0000 norm=0.9517
[iter 0] loss=1.2099 val_loss=0.0000 scale=1.0000 norm=0.9445
[iter 100] loss=1.1605 val_loss=0.0000 scale=0.5000 norm=0.4573
[iter 200] loss=1.1382 val_loss=0.0000 scale=0.2500 norm=0.2258
[iter 300] loss=1.1248 val_loss=0.0000 scale=0.2500 norm=0.2244
[iter 400] loss=1.1109 val_loss=0.0000 scale=0.2500 norm=0.2221
[iter 0] loss=1.1543 val_loss=0.0000 scale=1.0

In [54]:
np.mean(rmse_list)

1445085.6307961675

In [55]:
lgbm = LGBMRegressor(random_state = 518)

In [56]:
rmse_list = []
lgb_pred = np.zeros((test.shape[0]))
for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    lgbm.fit(tr_x, tr_y)
    
    pred = np.expm1([0 if x < 0 else x for x in lgbm.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in lgbm.predict(target)])
    rmse = np.sqrt(mean_squared_error(val_y, pred))
    
    rmse_list.append(rmse)
    
    lgb_pred += (sub_pred / 10)

In [57]:
np.mean(rmse_list)

1155679.240289382

In [58]:
xgb = XGBRegressor(random_state = 518)

In [59]:
rmse_list = []
xgb_pred = np.zeros((test.shape[0]))
for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    xgb.fit(tr_x, tr_y)
    
    pred = np.expm1([0 if x < 0 else x for x in xgb.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in xgb.predict(target)])
    rmse = np.sqrt(mean_squared_error(val_y, pred))
    
    rmse_list.append(rmse)
    
    xgb_pred += (sub_pred / 10)

In [60]:
cat = CatBoostRegressor(random_state = 518, silent = True)

In [61]:
rmse_list = []
cat_pred = np.zeros((test.shape[0]))
for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    cat.fit(tr_x, tr_y)
    
    pred = np.expm1([0 if x < 0 else x for x in cat.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in cat.predict(target)])
    rmse = np.sqrt(mean_squared_error(val_y, pred))
    
    rmse_list.append(rmse)
    
    cat_pred += (sub_pred / 10)

In [62]:
rf = RandomForestRegressor(random_state = 518)

In [63]:
rmse_list = []
rf_pred = np.zeros((test.shape[0]))
for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    rf.fit(tr_x, tr_y)
    
    pred = np.expm1([0 if x < 0 else x for x in rf.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in rf.predict(target)])
    rmse = np.sqrt(mean_squared_error(val_y, pred))
    
    rmse_list.append(rmse)
    
    rf_pred += (sub_pred / 10)

In [64]:
np.mean(rmse_list)

870860.0449658377

In [65]:
(xgb_pred + cat_pred + lgb_pred + rf_pred + gb_pred + ngb_pred) / 6

array([1.17486862e+06, 2.03849083e+06, 8.02574174e+05, 2.21538085e+06,
       1.85251960e+06, 5.63032078e+01, 1.65875705e+03, 3.06949317e+01,
       2.67927158e+03, 1.39187497e+02, 1.25984360e+05, 5.57432110e+05,
       2.44960028e+02, 2.09397117e+03, 4.20699745e+05, 1.82280893e+06,
       2.27597771e+03, 2.07013152e+06, 1.09492336e+05, 3.24728393e+05,
       3.65864978e+01, 1.79137898e+03, 8.08519383e+03, 9.66380052e+01,
       5.73856442e+01, 6.83021632e+05, 1.49781240e+03, 1.16718922e+06,
       8.31561710e+03, 4.77240125e+01, 8.04693915e+05, 8.54334510e+04,
       3.63831944e+03, 3.92341109e+03, 3.18652905e+03, 2.02489437e+03,
       1.30845960e+03, 2.97248447e+03, 2.34680268e+03, 1.36943078e+01,
       4.97629091e+01, 3.95404451e+04, 7.82133529e+05, 1.61557810e+05,
       2.35918395e+05, 7.34982090e+05, 3.83489129e+01, 5.59283736e+01,
       4.09062175e+04, 2.16713841e+02, 1.10714742e+03, 1.00799932e+06,
       3.57158534e+03, 7.58231577e+05, 1.15800423e+06, 1.31805363e+02,
      