# 다섯번째 모델의 검증

Key Attempts:
- 관객 수 라벨별 정확도 -> 어느 특정 라벨에서 정확도가 높았나?
- 감독별/배우별 정확도 -> 특정 감독/배우의 경우 정확도가 높나?
- 청불 특성이 주는 영향은 어느 정도인가?

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('./data/join_final_v5.csv', encoding='utf-8-sig', thousands=',')

## 1. 메디안 값 치환

In [4]:
# Build one median-audience lookup table per categorical feature (director,
# actor, distributor, genre): each table holds the category value in its own
# column and, in "view", the median audience count of the rows sharing it.
# NOTE(review): the medians are taken over every row of df, before the
# train/test split performed further down, so the encoded features also
# reflect the targets of rows that later land in the test set — confirm this
# is acceptable for the reported test accuracy.
director = df.groupby('director', as_index=False)['view'].median()
actor = df.groupby('actor', as_index=False)['view'].median()
distributor = df.groupby('distributor', as_index=False)['view'].median()
genre = df.groupby('genre', as_index=False)['view'].median()

In [5]:
# Replace each of the four categorical features with the median "view" of its
# category, taken from the lookup tables built in the previous cell.
# One vectorised map per column replaces the former row-by-row loop, which
# rescanned every lookup table for each row, wrote cells one at a time and
# shadowed the builtin `dir`. The columns now end up numeric instead of
# holding numbers inside an object column.
for col, table in (
    ('actor', actor),
    ('distributor', distributor),
    ('director', director),
    ('genre', genre),
):
    # Series indexed by category value -> median audience count.
    medians = table.set_index(col)['view']
    df[col] = df[col].map(medians)

## 2. 관객 수 범주형 변환

In [6]:
# Convert the raw audience count into eight ordinal classes (1 = fewer than
# 10,000 viewers, 8 = ten million or more). Every class covers
# [lower, upper): a count equal to an edge belongs to the class above it,
# exactly like the former `val < edge` chain.
view_edges = [
    float('-inf'), 10_000, 100_000, 500_000, 1_000_000,
    3_000_000, 7_000_000, 10_000_000, float('inf'),
]
# pd.cut bins the whole column at once instead of writing one cell per row.
# The int64 cast fails loudly on a missing count, whereas the old loop let
# NaN fall through to the `else` branch and silently labelled it class 8.
df['view'] = pd.cut(
    df['view'], bins=view_edges, labels=list(range(1, 9)), right=False
).astype('int64')

## 3. 원-핫 인코딩

In [7]:
# Separate the feature matrix (every column except "view") from the target
# (the audience class stored in "view").
X = df.drop(columns=['view'])
y = df['view']

In [8]:
X = pd.get_dummies(data = X, columns=['nation'], prefix='region')

In [9]:
X = pd.get_dummies(data = X, columns=['openMonth'], prefix='month')

## 4. 데이터 스플릿

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30 % of the rows for testing. Stratifying on y keeps the class
# proportions alike in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=13,
    test_size=0.3,
)

## 5. Normalise

In [11]:
# Fit a min-max scaler on the training rows only and rescale the first six
# feature columns in place. Note that the slice 0:6 covers six columns, not
# just runTm.
# NOTE(review): presumably these are runTm, genre, director, actor, is_adult
# and distributor, going by the column order used for the prediction frames
# later in the file — confirm against X_train.columns.
from sklearn.preprocessing import MinMaxScaler

mm = MinMaxScaler()
X_train.iloc[:,0:6] = mm.fit_transform(X_train.iloc[:, 0:6])

In [12]:
# Rescale the same six leading columns of the test rows with the scaler that
# was fitted on the training rows (transform only, no refit).
X_test.iloc[:,0:6] = mm.transform(X_test.iloc[:, 0:6])

## 6. Modelling

## 7. Boosting

In [14]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import time
import warnings

warnings.filterwarnings('ignore')

In [15]:
# Train the gradient-boosting classifier and time the fit itself. The timer
# is stopped right after fit() so that "Fit time" no longer also counts the
# two predict() calls that follow; perf_counter() is the clock intended for
# measuring elapsed intervals.
start_time = time.perf_counter()

gb_clf = GradientBoostingClassifier(random_state=13, n_estimators=100, learning_rate=0.01, max_depth=4)
gb_clf.fit(X_train, y_train)
gb_fit_seconds = time.perf_counter() - start_time

gb_pred = gb_clf.predict(X_test)        # predictions for the held-out rows
gb_train_pd = gb_clf.predict(X_train)   # predictions for the training rows

print("ACC : ", accuracy_score(y_test, gb_pred))
print("Fit time : ", gb_fit_seconds)

ACC :  0.7502448579823702
Fit time :  9.450073957443237


In [16]:
print("Train Accuracy : ", accuracy_score(y_train, gb_train_pd))
print("Test Accuracy : ", accuracy_score(y_test, gb_pred))

Train Accuracy :  0.8035264483627204
Test Accuracy :  0.7502448579823702


In [17]:
# Zero-based versions of the targets (each class value lowered by one); these
# are the labels handed to XGBClassifier below. y_train / y_test themselves
# are left untouched because the subtraction produces new Series.
y_train_copy = y_train - 1
y_test_copy = y_test - 1

In [18]:
from xgboost import XGBClassifier

# Train the XGBoost classifier on the zero-based labels and time the fit
# itself. The timer stops before predict() so "Fit time" reports fitting only
# (it previously included the test-set prediction as well).
start_time = time.perf_counter()
xgb = XGBClassifier(n_estimators=200, learning_rate=0.01, max_depth=3)
xgb.fit(X_train.values, y_train_copy.values)
xgb_fit_seconds = time.perf_counter() - start_time

print("Accuracy : ", accuracy_score(y_test_copy.values, xgb.predict(X_test.values)))
print('Fit time : ', xgb_fit_seconds)

Accuracy :  0.7507345739471106
Fit time :  4.066618204116821


In [19]:
print("Train Accuracy : ", accuracy_score(y_train_copy.values, xgb.predict(X_train.values)))
print("Test Accuracy : ", accuracy_score(y_test_copy.values, xgb.predict(X_test.values)))

Train Accuracy :  0.783375314861461
Test Accuracy :  0.7507345739471106


---

## 8. Conclusion

---

## 9. Prediction

---

## 10. 모델 결과 분석

In [20]:
movie_raw = pd.read_csv('./data/join_v5.csv', encoding='utf-8-sig', thousands=',')

In [21]:
movie_raw.head()

Unnamed: 0,movieCd,movieNm,showTm,nationNm,genres_1,directorNm,actor1_Nm,dist_companyNm,view,openMonth,is_adult
0,20040598,해리포터와 아즈카반의 죄수,136,2,1,알폰소 쿠아론,다니엘 래드클리프,워너브러더스 코리아(주),1865469,7,0
1,20030440,자토이치,119,3,2,기타노 다케시,기타노 다케시,(주)영화사 진진,24634,1,0
2,20040487,범죄의 재구성,116,1,3,최동훈,김윤석,(주)쇼박스,943241,4,1
3,20040462,빅 피쉬,125,2,1,팀 버튼,이완 맥그리거,팝엔터테인먼트,85594,3,0
4,20040559,나두야 간다,105,1,5,정연원,권용운,롯데엔터테인먼트,193554,6,0


In [22]:
xgb_train = xgb.predict(X_train.values)
xgb_test = xgb.predict(X_test.values)

In [23]:
# Undo the "minus one" applied to the labels before XGBoost training so the
# predictions are on the same scale as y_train / y_test again.
xgb_train_1 = xgb_train + 1
xgb_test_1 = xgb_test + 1

# Copies of the feature frames extended with the true class ("view_cat",
# aligned on the row index) and the predicted class ("xgb_pred", an array
# assigned positionally).
X_train_copy = X_train.assign(view_cat=y_train, xgb_pred=xgb_train_1)
X_test_copy = X_test.assign(view_cat=y_test, xgb_pred=xgb_test_1)

In [24]:
pred_df = pd.concat([X_train_copy, X_test_copy])

In [25]:
# Attach the features, true class and prediction to the raw movie records by
# joining on the row index (inner join, the pd.merge default).
# NOTE(review): this assumes row i of join_v5.csv describes the same movie as
# row i of join_final_v5.csv — confirm, since nothing here checks it.
new = pd.merge(movie_raw, pred_df, left_index=True, right_index=True)

In [27]:
new = new.drop(['region_1', 'region_2', 'region_3', 'month_1', 'month_2', 'month_3',
       'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10', 'month_11', 'month_12', 'is_adult_y'], axis=1)

In [28]:
# Flag every row whose predicted class equals its true class (1.0 = correct,
# 0.0 = wrong). A single vectorised comparison replaces the iterrows()/.loc
# loop that wrote one cell per row; the column stays float so the sums
# computed from it keep their previous format.
new['correct'] = (new['view_cat'] == new['xgb_pred']).astype(float)

### 10.1 전체 데이터의 관객 범주별 정확도

In [29]:
# Per audience class over all rows: number of correct predictions, number of
# movies, and their ratio (the class-wise accuracy).
view_cat = new.groupby('view_cat', as_index=False).agg({"correct": "sum", "movieCd": "count"})
view_cat['acc_rate'] = view_cat['correct'].div(view_cat['movieCd'])
view_cat

Unnamed: 0,view_cat,correct,movieCd,acc_rate
0,1,2205.0,2421,0.910781
1,2,1691.0,2063,0.81968
2,3,798.0,1143,0.698163
3,4,145.0,426,0.340376
4,5,346.0,521,0.664107
5,6,60.0,175,0.342857
6,7,11.0,32,0.34375
7,8,9.0,25,0.36


### 10.2 트레인 데이터의 관객 범주별 정확도

In [31]:
train_df = pd.merge(movie_raw, X_train_copy, left_index=True, right_index=True)

In [33]:
# Flag the training rows whose predicted class equals the true class
# (1.0 = correct, 0.0 = wrong).
# The comparison is made on train_df itself. The previous loop iterated over
# `new` (train + test rows), so each `.loc` write for a label that exists only
# in the test part appended a new, otherwise all-NaN row to train_df and
# turned its integer columns into floats.
train_df['correct'] = (train_df['view_cat'] == train_df['xgb_pred']).astype(float)

In [34]:
train_cat = train_df.groupby(['view_cat']).agg({"correct":"sum", "movieCd":"count"}).reset_index()
train_cat['acc_rate'] = train_cat['correct']/train_cat['movieCd']
train_cat

Unnamed: 0,view_cat,correct,movieCd,acc_rate
0,1.0,1546.0,1695,0.912094
1,2.0,1191.0,1444,0.824792
2,3.0,582.0,800,0.7275
3,4.0,102.0,298,0.342282
4,5.0,247.0,365,0.676712
5,6.0,49.0,122,0.401639
6,7.0,9.0,22,0.409091
7,8.0,6.0,18,0.333333


### 10.3 테스트 데이터의 관객 범주별 정확도

In [35]:
test_df = pd.merge(movie_raw, X_test_copy, left_index=True, right_index=True)

In [36]:
# Flag the test rows whose predicted class equals the true class
# (1.0 = correct, 0.0 = wrong).
# The comparison is made on test_df itself. The previous loop iterated over
# `new` (train + test rows), so each `.loc` write for a label that exists only
# in the training part appended a new, otherwise all-NaN row to test_df and
# turned its integer columns into floats.
test_df['correct'] = (test_df['view_cat'] == test_df['xgb_pred']).astype(float)

In [37]:
test_cat = test_df.groupby(['view_cat']).agg({"correct":"sum", "movieCd":"count"}).reset_index()
test_cat['acc_rate'] = test_cat['correct']/test_cat['movieCd']
test_cat

Unnamed: 0,view_cat,correct,movieCd,acc_rate
0,1.0,659.0,726,0.907713
1,2.0,500.0,619,0.807754
2,3.0,216.0,343,0.629738
3,4.0,43.0,128,0.335938
4,5.0,99.0,156,0.634615
5,6.0,11.0,53,0.207547
6,7.0,2.0,10,0.2
7,8.0,3.0,7,0.428571


### 10.4.1 전체 데이터 감독별 정확도

In [38]:
dir_group = new.groupby(['directorNm']).agg({"view":"median", "correct":"sum", "movieCd":"count"}).reset_index()
dir_group['acc_rate'] = dir_group['correct']/dir_group['movieCd']
dir_group

Unnamed: 0,directorNm,view,correct,movieCd,acc_rate
0,D.J. 카루소,576120.0,3.0,5,0.600
1,D.R. 후드,2263.0,1.0,1,1.000
2,E. 엘리아스 메리지,4918.0,1.0,1,1.000
3,F. 게리 그레이,800006.0,3.0,5,0.600
4,J 블레이크슨,81668.0,1.0,2,0.500
...,...,...,...,...,...
3869,히라카와 유이치로,4392.0,1.0,1,1.000
3870,히로키 류이치,2132.5,3.0,4,0.750
3871,히로타 유스케,23737.0,1.0,1,1.000
3872,히시다 마사카즈,15749.5,5.0,8,0.625


In [40]:
dir_group[dir_group['movieCd'] > 10].sort_values('acc_rate', ascending=False).head(10)

Unnamed: 0,directorNm,view,correct,movieCd,acc_rate
2909,조성규,1524.0,9.0,11,0.818182
2329,유야마 쿠니히코,288230.0,13.0,16,0.8125
87,고레에다 히로카즈,42286.0,9.0,12,0.75
2537,이준익,1648634.5,9.0,12,0.75
3809,홍상수,37122.0,15.0,21,0.714286
3617,프랑수아 오종,9504.0,9.0,13,0.692308
854,뤽 베송,44245.0,7.0,11,0.636364
814,론 하워드,213980.0,4.0,11,0.363636
1715,스티븐 스필버그,763078.0,3.0,11,0.272727
876,리들리 스콧,589899.0,3.0,13,0.230769


### 10.4.2 트레인 데이터 감독별 정확도

In [41]:
dir_train = train_df.groupby(['directorNm']).agg({"view":"median", "correct":"sum", "movieCd":"count"}).reset_index()
dir_train['acc_rate'] = dir_train['correct']/dir_train['movieCd']

Unnamed: 0,directorNm,view,correct,movieCd,acc_rate
0,D.J. 카루소,576120.0,3.0,5,0.600000
1,D.R. 후드,2263.0,1.0,1,1.000000
2,E. 엘리아스 메리지,4918.0,1.0,1,1.000000
3,F. 게리 그레이,800006.0,3.0,5,0.600000
4,J 블레이크슨,81668.0,1.0,2,0.500000
...,...,...,...,...,...
3048,히라야나기 아츠코,11464.0,0.0,1,0.000000
3049,히라야마 미호,140395.0,1.0,1,1.000000
3050,히라오 타카유키,23057.0,1.0,1,1.000000
3051,히로키 류이치,2132.5,3.0,4,0.750000


In [55]:
dir_train[dir_train['movieCd'] > 5].sort_values('acc_rate', ascending=False).head(10)

Unnamed: 0,directorNm,view,correct,movieCd,acc_rate
1386,신보 아키유키,11302.0,7.0,7,1.0
449,대런 린 보우즈만,201911.5,6.0,6,1.0
2075,자비에 돌란,12549.0,6.0,7,0.857143
156,김기덕,44900.0,6.0,7,0.857143
1356,스티븐 소더버그,32288.0,5.0,6,0.833333
1053,박흥식,221147.0,5.0,6,0.833333
341,나카시마 테츠야,12891.5,5.0,6,0.833333
2304,조성규,1524.0,7.0,9,0.777778
1841,유야마 쿠니히코,300477.0,10.0,13,0.769231
634,로베르트 슈벤트케,282707.0,5.0,7,0.714286


### 10.4.3 테스트 데이터 감독별 정확도

In [43]:
dir_test = test_df.groupby(['directorNm']).agg({"view":"median", "correct":"sum", "movieCd":"count"}).reset_index()
dir_test['acc_rate'] = dir_test['correct']/dir_test['movieCd']

Unnamed: 0,directorNm,view,correct,movieCd,acc_rate
0,J.C 챈더,7694.0,0.0,1,0.000000
1,J.J. 에이브럼스,507889.0,1.0,3,0.333333
2,M. 나이트 샤말란,295058.5,0.0,2,0.000000
3,P.B. 셰므란,18312.0,1.0,1,1.000000
4,R. 엘리스 프레이저,4032.0,1.0,1,1.000000
...,...,...,...,...,...
1576,히라바야시 이사무,15870.0,2.0,2,1.000000
1577,히라카와 유이치로,4392.0,1.0,1,1.000000
1578,히로타 유스케,23737.0,1.0,1,1.000000
1579,히시다 마사카즈,129965.0,0.0,1,0.000000


In [52]:
dir_test[dir_test['movieCd']>3].sort_values('acc_rate', ascending=False).head(10)

Unnamed: 0,directorNm,view,correct,movieCd,acc_rate
344,뤽 베송,22379.0,5.0,5,1.0
1148,조 라이트,177118.5,4.0,4,1.0
1013,이준익,1590774.0,6.0,7,0.857143
34,고레에다 히로카즈,35223.0,4.0,5,0.8
1462,프랑수아 오종,8215.5,6.0,8,0.75
1358,클린트 이스트우드,151226.5,3.0,4,0.75
1069,장률,2301.0,3.0,4,0.75
1548,홍상수,34728.0,3.0,4,0.75
805,야마모토 야스이치로,152483.5,4.0,6,0.666667
987,이와이 슌지,12060.0,3.0,5,0.6


### 10.5.1 전체 데이터 장르별 정확도

In [56]:
genre_group = new.groupby(['genres_1']).agg({"view":"median","correct":"sum", "movieCd":"count"}).reset_index()
genre_group['acc_rate'] = genre_group['correct']/genre_group['movieCd']
genre_group['view'] = genre_group['view'].astype(int)
genre_group

Unnamed: 0,genres_1,view,correct,movieCd,acc_rate
0,1,13326,1850.0,2372,0.779933
1,2,148406,639.0,957,0.667712
2,3,49803,299.0,394,0.758883
3,5,31655,552.0,715,0.772028
4,7,17246,423.0,540,0.783333
5,8,74600,258.0,328,0.786585
6,10,7010,313.0,338,0.926036
7,11,81724,160.0,240,0.666667
8,12,39946,771.0,922,0.836226


### 10.5.2 트레인 데이터 장르별 정확도

In [57]:
genre_train = train_df.groupby(['genres_1']).agg({"view":"median","correct":"sum", "movieCd":"count"}).reset_index()
genre_train['acc_rate'] = genre_train['correct']/genre_train['movieCd']
genre_train['view'] = genre_train['view'].astype(int)
genre_train

Unnamed: 0,genres_1,view,correct,movieCd,acc_rate
0,1.0,13733,1281.0,1628,0.786855
1,2.0,146697,468.0,678,0.690265
2,3.0,47226,220.0,285,0.77193
3,5.0,33054,367.0,476,0.771008
4,7.0,16423,314.0,392,0.80102
5,8.0,70448,200.0,246,0.813008
6,10.0,7720,226.0,243,0.930041
7,11.0,63278,106.0,155,0.683871
8,12.0,43431,550.0,661,0.832073


### 10.5.3 테스트 데이터 장르별 정확도

In [58]:
genre_test = test_df.groupby(['genres_1']).agg({"view":"median","correct":"sum", "movieCd":"count"}).reset_index()
genre_test['acc_rate'] = genre_test['correct']/genre_test['movieCd']
genre_test['view'] = genre_test['view'].astype(int)
genre_test

Unnamed: 0,genres_1,view,correct,movieCd,acc_rate
0,1.0,12678,569.0,744,0.764785
1,2.0,156744,171.0,279,0.612903
2,3.0,52268,79.0,109,0.724771
3,5.0,30069,185.0,239,0.774059
4,7.0,23555,109.0,148,0.736486
5,8.0,86579,58.0,82,0.707317
6,10.0,5654,87.0,95,0.915789
7,11.0,150832,54.0,85,0.635294
8,12.0,33668,221.0,261,0.846743


### 10.6.1 전체 데이터 개봉월별 정확도

In [63]:
month_group = new.groupby(['openMonth']).agg({"view":"median","correct":"sum", "movieCd":"count"}).reset_index()
month_group['acc_rate'] = month_group['correct']/month_group['movieCd']
month_group['view'] = month_group['view'].astype(int)
month_group

Unnamed: 0,openMonth,view,correct,movieCd,acc_rate
0,1,41439,403.0,520,0.775
1,2,51487,398.0,538,0.739777
2,3,23178,437.0,564,0.774823
3,4,29048,456.0,579,0.787565
4,5,18083,429.0,552,0.777174
5,6,30576,412.0,518,0.795367
6,7,24457,395.0,518,0.762548
7,8,36139,461.0,584,0.789384
8,9,26739,456.0,585,0.779487
9,10,19616,485.0,615,0.788618


### 10.6.2 트레인 데이터 개봉월별 정확도

In [64]:
month_train = train_df.groupby(['openMonth']).agg({"view":"median","correct":"sum", "movieCd":"count"}).reset_index()
month_train['acc_rate'] = month_train['correct']/month_train['movieCd']
month_train['view'] = month_train['view'].astype(int)
month_train

Unnamed: 0,openMonth,view,correct,movieCd,acc_rate
0,1.0,40361,290.0,367,0.790191
1,2.0,48223,287.0,390,0.735897
2,3.0,22294,306.0,398,0.768844
3,4.0,30961,307.0,390,0.787179
4,5.0,16507,305.0,382,0.798429
5,6.0,31081,296.0,358,0.826816
6,7.0,24485,274.0,357,0.767507
7,8.0,44231,326.0,409,0.797066
8,9.0,29841,336.0,432,0.777778
9,10.0,18113,338.0,428,0.78972


### 10.6.3 테스트 데이터 개봉월별 정확도

In [65]:
month_test = test_df.groupby(['openMonth']).agg({"view":"median","correct":"sum", "movieCd":"count"}).reset_index()
month_test['acc_rate'] = month_test['correct']/month_test['movieCd']
month_test['view'] = month_test['view'].astype(int)
month_test

Unnamed: 0,openMonth,view,correct,movieCd,acc_rate
0,1.0,42112,113.0,153,0.738562
1,2.0,52324,111.0,148,0.75
2,3.0,24580,131.0,166,0.789157
3,4.0,26202,149.0,189,0.78836
4,5.0,20838,124.0,170,0.729412
5,6.0,27988,116.0,160,0.725
6,7.0,24337,121.0,161,0.751553
7,8.0,24319,135.0,175,0.771429
8,9.0,18222,120.0,153,0.784314
9,10.0,24682,147.0,187,0.786096


### 10.7.1 전체 데이터 배급사별 정확도

In [67]:
dist_group = new.groupby(['dist_companyNm']).agg({"view":"median","correct":"sum", "movieCd":"count"}).reset_index()
dist_group['acc_rate'] = dist_group['correct']/dist_group['movieCd']
dist_group['view'] = dist_group['view'].astype(int)

In [78]:
dist_group[dist_group['movieCd']>30].sort_values('acc_rate', ascending=False).head(10)

Unnamed: 0,dist_companyNm,view,correct,movieCd,acc_rate
104,(주)소나무픽쳐스,1657,76.0,79,0.962025
364,오드,14459,41.0,44,0.931818
407,프리비젼엔터테인먼트,5179,61.0,66,0.924242
58,(주)마운틴픽쳐스,4030,72.0,78,0.923077
168,(주)영화사 백두대간,3989,54.0,59,0.915254
199,(주)인디스토리,2624,72.0,79,0.911392
36,(주)드림팩트엔터테인먼트,4401,45.0,50,0.9
363,예지림엔터테인먼트,13630,51.0,57,0.894737
177,(주)영화사오원,3989,40.0,45,0.888889
82,(주)박수엔터테인먼트,18017,95.0,108,0.87963


In [76]:
dist_group[dist_group['movieCd']>30].sort_values('acc_rate', ascending=False).tail(10)

Unnamed: 0,dist_companyNm,view,correct,movieCd,acc_rate
256,CJ ENM,430001,329.0,499,0.659319
125,(주)시너지하우스 (시너지),185331,27.0,41,0.658537
373,유니버설픽쳐스인터내셔널 코리아(유),173887,141.0,223,0.632287
379,이십세기폭스코리아(주),276435,115.0,182,0.631868
307,소니픽쳐스릴리징코리아,282862,111.0,177,0.627119
17,(주)넥스트엔터테인먼트월드(NEW),340936,112.0,181,0.618785
62,(주)메가박스,137322,47.0,76,0.618421
368,워너브러더스 코리아(주),528537,108.0,185,0.583784
108,(주)쇼박스,708108,113.0,206,0.548544
370,월트디즈니컴퍼니코리아 유한책임회사,969543,60.0,112,0.535714


### 10.7.2 트레인 데이터 배급사별 정확도

In [71]:
dist_train = train_df.groupby(['dist_companyNm']).agg({"view":"median","correct":"sum", "movieCd":"count"}).reset_index()
dist_train['acc_rate'] = dist_train['correct']/dist_train['movieCd']
dist_train['view'] = dist_train['view'].astype(int)

In [73]:
dist_train[dist_train['movieCd']>30].sort_values('acc_rate', ascending=False).head(10)

Unnamed: 0,dist_companyNm,view,correct,movieCd,acc_rate
33,(주)드림팩트엔터테인먼트,3525,32.0,33,0.969697
95,(주)소나무픽쳐스,1653,63.0,65,0.969231
150,(주)영화사 백두대간,4076,41.0,44,0.931818
177,(주)인디스토리,2624,53.0,57,0.929825
337,찬란,5544,46.0,50,0.92
356,프리비젼엔터테인먼트,5437,43.0,47,0.914894
54,(주)마운틴픽쳐스,3828,46.0,51,0.901961
147,(주)엣나인필름,14893,51.0,57,0.894737
77,(주)박수엔터테인먼트,17746,72.0,81,0.888889
322,와이드 릴리즈(주),6044,59.0,67,0.880597


### 10.7.3 테스트 데이터 배급사별 정확도

In [74]:
dist_test = test_df.groupby(['dist_companyNm']).agg({"view":"median","correct":"sum", "movieCd":"count"}).reset_index()
dist_test['acc_rate'] = dist_test['correct']/dist_test['movieCd']
dist_test['view'] = dist_test['view'].astype(int)

In [75]:
dist_test[dist_test['movieCd']>30].sort_values('acc_rate', ascending=False).head(10)

Unnamed: 0,dist_companyNm,view,correct,movieCd,acc_rate
252,팝엔터테인먼트,11681,44.0,50,0.88
230,와이드 릴리즈(주),10315,38.0,45,0.844444
111,(주)영화사 진진,9713,46.0,55,0.836364
81,(주)시네마서비스,526622,25.0,34,0.735294
163,CJ ENM,335258,93.0,148,0.628378
183,롯데엔터테인먼트,285558,60.0,100,0.6
235,유니버설픽쳐스인터내셔널 코리아(유),146531,40.0,68,0.588235
11,(주)넥스트엔터테인먼트월드(NEW),296748,33.0,57,0.578947
231,워너브러더스 코리아(주),311334,35.0,62,0.564516
238,이십세기폭스코리아(주),247279,27.0,48,0.5625


### 10.8.1 전체 데이터 배우별 정확도

In [79]:
new.head()

Unnamed: 0,movieCd,movieNm,showTm,nationNm,genres_1,directorNm,actor1_Nm,dist_companyNm,view,openMonth,is_adult_x,runTm,genre,director,actor,distributor,view_cat,xgb_pred,correct
0,20040598,해리포터와 아즈카반의 죄수,136,2,1,알폰소 쿠아론,다니엘 래드클리프,워너브러더스 코리아(주),1865469,7,0,0.422727,0.044669,0.084233,0.016127,0.24632,5,4,0.0
1,20030440,자토이치,119,3,2,기타노 다케시,기타노 다케시,(주)영화사 진진,24634,1,0,0.345455,1.0,0.000148,0.001578,0.003291,2,1,0.0
2,20040487,범죄의 재구성,116,1,3,최동훈,김윤석,(주)쇼박스,943241,4,1,0.331818,0.30265,0.535544,0.293501,0.330167,4,6,0.0
3,20040462,빅 피쉬,125,2,1,팀 버튼,이완 맥그리거,팝엔터테인먼트,85594,3,0,0.372727,0.044669,0.044767,0.005595,0.005163,2,2,1.0
4,20040559,나두야 간다,105,1,5,정연원,권용운,롯데엔터테인먼트,193554,6,0,0.281818,0.174298,0.017004,0.018691,0.165962,3,3,1.0


In [80]:
act_group = new.groupby(['actor1_Nm']).agg({"view":"median","correct":"sum", "movieCd":"count"}).reset_index()
act_group['acc_rate'] = act_group['correct']/act_group['movieCd']
act_group['view'] = act_group['view'].astype(int)

In [82]:
act_group[act_group['movieCd']>10].sort_values('acc_rate', ascending=False).head(10)

Unnamed: 0,actor1_Nm,view,correct,movieCd,acc_rate
1135,베네딕트 컴버배치,313022,10.0,11,0.909091
1740,엄상현,46717,23.0,26,0.884615
214,김서영,89150,10.0,12,0.833333
645,로버트 드 니로,17790,10.0,12,0.833333
1240,사무엘 L. 잭슨,11646,9.0,11,0.818182
427,니콜 키드먼,34119,13.0,16,0.8125
2603,줄리엣 비노쉬,5996,12.0,15,0.8
557,드웨인 존슨,1149628,10.0,13,0.769231
2534,조지 클루니,52800,10.0,13,0.769231
428,니콜라스 케이지,117267,16.0,21,0.761905


### 10.8.2 트레인 데이터 배우별 정확도

In [83]:
act_train = train_df.groupby(['actor1_Nm']).agg({"view":"median","correct":"sum", "movieCd":"count"}).reset_index()
act_train['acc_rate'] = act_train['correct']/act_train['movieCd']
act_train['view'] = act_train['view'].astype(int)

In [85]:
act_train[act_train['movieCd']>10].sort_values('acc_rate', ascending=False).head(10)

Unnamed: 0,actor1_Nm,view,correct,movieCd,acc_rate
1062,성룡,41574,11.0,11,1.0
1384,엄상현,57130,17.0,20,0.85
338,니콜 키드먼,18141,11.0,13,0.846154
444,드웨인 존슨,1057269,9.0,11,0.818182
339,니콜라스 케이지,44546,9.0,13,0.692308
578,리암 니슨,264439,11.0,16,0.6875
1967,조니 뎁,458243,7.0,11,0.636364
1926,제이슨 스타뎀,96338,8.0,13,0.615385
208,김윤석,2560649,7.0,12,0.583333
605,마동석,789651,7.0,12,0.583333


### 10.8.3 테스트 데이터 배우별 정확도

In [86]:
act_test = test_df.groupby(['actor1_Nm']).agg({"view":"median","correct":"sum", "movieCd":"count"}).reset_index()
act_test['acc_rate'] = act_test['correct']/act_test['movieCd']
act_test['view'] = act_test['view'].astype(int)

In [91]:
act_test[act_test['movieCd']>5].sort_values('acc_rate', ascending=False).head(10)

Unnamed: 0,actor1_Nm,view,correct,movieCd,acc_rate
1291,페넬로페 크루즈,20833,6.0,6,1.0
722,엄상현,31336,6.0,6,1.0
176,니콜라스 케이지,328626,7.0,8,0.875
1039,조니 뎁,456742,6.0,7,0.857143
1232,타카야마 미나미,420288,5.0,6,0.833333
261,로버트 드 니로,15375,4.0,6,0.666667
480,벤 스틸러,1513886,4.0,6,0.666667
551,설경구,1975583,4.0,6,0.666667
1331,하정우,3655261,4.0,6,0.666667
1005,제라드 버틀러,243592,5.0,8,0.625


### 10.9.1 전체 데이터 지역별 정확도

In [92]:
region_group = new.groupby(['nationNm']).agg({"view":"median","correct":"sum", "movieCd":"count"}).reset_index()
region_group['acc_rate'] = region_group['correct']/region_group['movieCd']
region_group['view'] = region_group['view'].astype(int)
region_group

Unnamed: 0,nationNm,view,correct,movieCd,acc_rate
0,1,63278,1446.0,1941,0.744977
1,2,78228,1512.0,2111,0.716248
2,3,11026,2307.0,2754,0.837691


### 10.9.2 트레인 데이터 지역별 정확도

In [93]:
region_train = train_df.groupby(['nationNm']).agg({"view":"median","correct":"sum", "movieCd":"count"}).reset_index()
region_train['acc_rate'] = region_train['correct']/region_train['movieCd']
region_train['view'] = region_train['view'].astype(int)
region_train

Unnamed: 0,nationNm,view,correct,movieCd,acc_rate
0,1.0,60944,1033.0,1361,0.759001
1,2.0,81694,1069.0,1470,0.727211
2,3.0,11000,1630.0,1933,0.843249


### 10.9.3 테스트 데이터 지역별 정확도

In [94]:
region_test = test_df.groupby(['nationNm']).agg({"view":"median","correct":"sum", "movieCd":"count"}).reset_index()
region_test['acc_rate'] = region_test['correct']/region_test['movieCd']
region_test['view'] = region_test['view'].astype(int)
region_test

Unnamed: 0,nationNm,view,correct,movieCd,acc_rate
0,1.0,70525,413.0,580,0.712069
1,2.0,68595,443.0,641,0.691108
2,3.0,11055,677.0,821,0.824604


---

## 11. 2022 개봉예정작 예측

In [99]:
coming = pd.read_excel('./data/comingsoon.xlsx')

In [100]:
# Build the model input for the upcoming titles.
# Column layout expected by the models (presumably identical to the training
# feature order — confirm against X_train.columns).
feature_columns = [
    'runTm', 'genre', 'director', 'actor', 'is_adult', 'distributor',
    'region_1', 'region_2', 'region_3',
    'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
    'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12',
]

# The audience count is dropped from `coming` itself; the title is only
# dropped from the model input so it stays available for the result table.
coming = coming.drop(columns=['view'])
coming_copy = coming.drop(columns=['movieNm'])
coming_copy = pd.get_dummies(coming_copy, columns=['nation'], prefix='region')
coming_copy = pd.get_dummies(coming_copy, columns=['openMonth'], prefix='month')

# reindex creates the dummy columns that this file has no category for and
# fills them with 0. This replaces a hand-maintained list of "missing"
# columns that were set to 0: that list would have zeroed out a real dummy
# column if the file ever contained one of those categories, and left NaN in
# any column missing from both the file and the list.
coming_copy = coming_copy.reindex(columns=feature_columns, fill_value=0)

In [101]:
# Replace each categorical value with the median "view" stored for it in the
# lookup tables built from the training data.
# Actors, directors and distributors missing from their table fall back to
# `unseen_median` — the same 1001 the former bare `except:` blocks assigned.
# Explicit dict lookups keep that fallback but no longer swallow unrelated
# errors, rescan a table for every row, or shadow the builtin `dir`.
unseen_median = 1001
for col, table in (
    ('actor', actor),
    ('director', director),
    ('distributor', distributor),
):
    medians = dict(zip(table[col], table['view']))
    coming_copy[col] = [medians.get(key, unseen_median) for key in coming_copy[col]]

# Genre deliberately has no fallback: an unknown genre still stops the cell
# with an error (now a KeyError naming the genre) instead of being imputed.
genre_medians = dict(zip(genre['genre'], genre['view']))
coming_copy['genre'] = [genre_medians[key] for key in coming_copy['genre']]

In [102]:
# Rescale the six leading feature columns with the scaler fitted on the
# training rows, then predict the audience class with both models.
scaled_head = mm.transform(coming_copy.iloc[:, 0:6])
coming_copy.iloc[:, 0:6] = scaled_head

# XGBoost was trained on labels lowered by one, so its output is raised by
# one to get back to the original class scale.
xgb_prediction = xgb.predict(coming_copy.values) + 1
gb_prediction = gb_clf.predict(coming_copy)

coming['xgb_pred'] = xgb_prediction
coming['gb_pred'] = gb_prediction

In [103]:
coming

Unnamed: 0,movieNm,runTm,nation,genre,director,actor,is_adult,distributor,openMonth,xgb_pred,gb_pred
0,존윅4,120,2,2,채드 스타헬스키,키아누 리브스,1,롯데엔터테인먼트,3,3,3
1,토르4,118,2,2,타이카 와이티티,크리스 헴스워스,0,월트디즈니컴퍼니코리아 유한책임회사,7,6,6
2,미션임파서블7,128,2,2,크리스토퍼 맥쿼리,톰 크루즈,0,롯데엔터테인먼트,7,6,6
3,스파이더맨: 어크로스 더 스파이더버스 (파트 원),117,2,12,조아킹 도스 산토스,샤메익 무어,0,소니픽쳐스릴리징코리아,6,1,1
4,블랙팬서2,135,2,2,라이언 쿠글러,레티티아 라이트,0,월트디즈니컴퍼니코리아 유한책임회사,11,1,1
5,아바타2,162,2,2,제임스 카메론,샘 워싱턴,0,월트디즈니컴퍼니코리아 유한책임회사,12,3,3
6,버즈 라이트이어,105,2,12,앤거스 맥클레인,크리스 에반스,0,월트디즈니컴퍼니코리아 유한책임회사,6,1,1
7,탑건2,130,2,2,조셉 코신스키,톰 크루즈,0,롯데엔터테인먼트,6,3,3
8,바빌로,112,2,1,데이미언 셔젤,토비 맥과이어,0,롯데엔터테인먼트,12,5,5
9,노프,110,2,8,조던 필,다니엘 칼루야,0,유니버설픽쳐스인터내셔널 코리아(유),7,5,5


---

## 12. 2022 개봉작 예측

In [104]:
released = pd.read_excel('./data/already.xlsx')

In [105]:
# Model input for the already-released titles: drop the title and the actual
# audience count, then one-hot encode nation and opening month. `released`
# itself is not modified because drop() returns a new frame.
released_copy = released.drop(columns=['movieNm', 'view'])
released_copy = pd.get_dummies(released_copy, columns=['nation'], prefix='region')
released_copy = pd.get_dummies(released_copy, columns=['openMonth'], prefix='month')

In [106]:
released_copy.head()

Unnamed: 0,runTm,genre,director,actor,is_adult,distributor,region_1,month_1,month_2,month_3,month_4,month_5
0,126,2,샘 레이미,베네딕트 컴버배치,0,월트디즈니컴퍼니코리아 유한책임회사,1,0,0,0,0,1
1,142,1,데이빗 예이츠,에디 레드메인,0,워너브러더스 코리아(주),1,0,0,0,1,0
2,176,2,맷 리브스,로버트 패틴슨,0,워너브러더스 코리아(주),1,0,0,1,0,0
3,110,12,가스 제닝스,매튜 매커너히,0,유니버설픽쳐스인터내셔널 코리아(유),1,1,0,0,0,0
4,116,2,루벤 플레셔,톰 홀랜드,0,소니픽쳐스릴리징코리아,1,0,1,0,0,0


In [107]:
# Column layout expected by the models (presumably identical to the training
# feature order — confirm against X_train.columns).
feature_columns = [
    'runTm', 'genre', 'director', 'actor', 'is_adult', 'distributor',
    'region_1', 'region_2', 'region_3',
    'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
    'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12',
]

# reindex creates the dummy columns that this file has no category for and
# fills them with 0. This replaces a hand-maintained list of "missing"
# columns that were set to 0: that list would have zeroed out a real dummy
# column if the file ever contained one of those categories, and left NaN in
# any column missing from both the file and the list.
# NOTE(review): the titles shown for this file are all coded nation == 1,
# while comparable foreign releases elsewhere in the notebook's data appear
# under nation code 2 — confirm the nation coding of already.xlsx matches the
# training data before trusting these predictions.
released_copy = released_copy.reindex(columns=feature_columns, fill_value=0)

In [108]:
# Replace each categorical value with the median "view" stored for it in the
# lookup tables built from the training data.
# Actors, directors and distributors missing from their table fall back to
# `unseen_median` — the same 1001 the former bare `except:` blocks assigned.
# Explicit dict lookups keep that fallback but no longer swallow unrelated
# errors, rescan a table for every row, or shadow the builtin `dir`.
unseen_median = 1001
for col, table in (
    ('actor', actor),
    ('director', director),
    ('distributor', distributor),
):
    medians = dict(zip(table[col], table['view']))
    released_copy[col] = [medians.get(key, unseen_median) for key in released_copy[col]]

# Genre deliberately has no fallback: an unknown genre still stops the cell
# with an error (now a KeyError naming the genre) instead of being imputed.
genre_medians = dict(zip(genre['genre'], genre['view']))
released_copy['genre'] = [genre_medians[key] for key in released_copy['genre']]

In [109]:
# Rescale the six leading feature columns with the scaler fitted on the
# training rows, then predict the audience class with both models.
scaled_head = mm.transform(released_copy.iloc[:, 0:6])
released_copy.iloc[:, 0:6] = scaled_head

# XGBoost was trained on labels lowered by one, so its output is raised by
# one to get back to the original class scale.
xgb_prediction = xgb.predict(released_copy.values) + 1
gb_prediction = gb_clf.predict(released_copy)

released['xgb_pred'] = xgb_prediction
released['gb_pred'] = gb_prediction

In [111]:
released['view_cat'] = ''

In [112]:
# Derive the true audience class of each released title from its actual
# audience count, using the same eight [lower, upper) ranges as the training
# target (1 = fewer than 10,000 viewers, 8 = ten million or more).
view_edges = [
    float('-inf'), 10_000, 100_000, 500_000, 1_000_000,
    3_000_000, 7_000_000, 10_000_000, float('inf'),
]
# pd.cut bins the whole column at once instead of writing one cell per row.
# The int64 cast fails loudly on a missing count, whereas the old loop let
# NaN fall through to the `else` branch and silently labelled it class 8.
released['view_cat'] = pd.cut(
    released['view'], bins=view_edges, labels=list(range(1, 9)), right=False
).astype('int64')

In [113]:
released

Unnamed: 0,movieNm,runTm,nation,genre,director,actor,is_adult,distributor,view,openMonth,xgb_pred,gb_pred,view_cat
0,닥터스트레인지,126,1,2,샘 레이미,베네딕트 컴버배치,0,월트디즈니컴퍼니코리아 유한책임회사,5860420,5,4,4,6
1,신비한 동물사전,142,1,1,데이빗 예이츠,에디 레드메인,0,워너브러더스 코리아(주),1194113,4,6,6,5
2,더 배트맨,176,1,2,맷 리브스,로버트 패틴슨,0,워너브러더스 코리아(주),904156,3,2,2,4
3,씽2게더,110,1,12,가스 제닝스,매튜 매커너히,0,유니버설픽쳐스인터내셔널 코리아(유),883795,1,2,2,4
4,언차티드,116,1,2,루벤 플레셔,톰 홀랜드,0,소니픽쳐스릴리징코리아,730924,2,5,5,4
5,모비우스,104,1,2,다니엘 에스피노사,자레드 레토,0,소니픽쳐스릴리징코리아,474560,3,3,3,3
6,수퍼 소닉2,122,1,12,제프 파울러,제임스 마스던,0,롯데엔터테인먼트,309989,4,3,3,3
7,나일강의 죽음,126,1,11,케네스 브래너,케네스 브래너,0,월트디즈니컴퍼니코리아 유한책임회사,232883,2,4,4,3
8,문폴,130,1,2,롤랜드 에머리히,할리 베리,0,(주)누리픽쳐스,195346,3,2,2,3
9,하우스오브구찌,158,1,11,리들리 스콧,아담 드라이버,0,유니버설픽쳐스인터내셔널 코리아(유),143605,1,2,2,3
