# 두번째 시도

Key Attempts:
- 감독, 배급사, 배우는 각 레이블의 관객수 중압값으로 치환
- 개봉월, 국가, 장르는 0,1 레이블 인코딩
- 감독, 배급사, 배우, 상영시간 normalise

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('./data/join_final_v2.csv', encoding='utf-8-sig', thousands=',')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6806 entries, 0 to 6805
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   runTm        6806 non-null   float64
 1   nation       6806 non-null   int64  
 2   genre        6806 non-null   int64  
 3   director     6806 non-null   object 
 4   actor        6806 non-null   object 
 5   is_adult     6806 non-null   float64
 6   distributor  6806 non-null   object 
 7   view         6806 non-null   int64  
 8   openMonth    6806 non-null   int64  
dtypes: float64(2), int64(4), object(3)
memory usage: 478.7+ KB


## 1. 메디안 값 치환

In [5]:
# 감독, 배우, 배급사 각각의 중앙 값 테이블 만들기
director = df.groupby('director').agg({"view":"median"}).reset_index()
actor = df.groupby('actor').agg({"view":"median"}).reset_index()
distributor = df.groupby('distributor').agg({"view":"median"}).reset_index()

In [6]:
director

Unnamed: 0,director,view
0,D.J. 카루소,576120.0
1,D.R. 후드,2263.0
2,E. 엘리아스 메리지,4918.0
3,F. 게리 그레이,800006.0
4,J 블레이크슨,81668.0
...,...,...
3869,히라카와 유이치로,4392.0
3870,히로키 류이치,2132.5
3871,히로타 유스케,23737.0
3872,히시다 마사카즈,15749.5


In [7]:
for idx, row in df.iterrows():
    act = row['actor']
    dist = row['distributor']
    dir = row['director']

    a_val = actor[actor['actor'] == act]['view'].values[0]
    df.loc[idx, 'actor'] = a_val

    dist_val = distributor[distributor['distributor'] == dist]['view'].values[0]
    df.loc[idx, 'distributor'] = dist_val

    dir_val = director[director['director'] == dir]['view'].values[0]
    df.loc[idx, 'director'] = dir_val

In [8]:
df.head()

Unnamed: 0,runTm,nation,genre,director,actor,is_adult,distributor,view,openMonth
0,136.0,2,1,954845.5,167141.0,0.0,528537.0,1865469,7
1,119.0,3,2,2674.0,17258.0,0.0,8048.0,24634,1
2,116.0,1,3,6065474.0,3024666.0,1.0,708108.5,943241,4
3,125.0,2,1,507938.5,58644.0,0.0,13547.0,85594,3
4,105.0,1,5,193554.0,193554.0,0.0,62026.0,193554,6


## 2. 관객 수 범주형 변환

In [9]:
for idx, row in df.iterrows():
    val = row['view']
    if val <10000:
        df.loc[idx, 'view'] = 1
    elif val <100000:
        df.loc[idx, 'view'] = 2
    elif val <200000:
        df.loc[idx, 'view'] = 3
    elif val <400000:
        df.loc[idx, 'view'] = 4
    elif val <600000:
        df.loc[idx, 'view'] = 5
    elif val <800000:
        df.loc[idx, 'view'] = 6
    elif val <1000000:
        df.loc[idx, 'view'] = 7
    else:
        df.loc[idx, 'view'] = 8

## 3. 라벨인코딩

In [10]:
y = df['view']
X = df.drop('view', axis=1)

In [11]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6806 entries, 0 to 6805
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   runTm        6806 non-null   float64
 1   nation       6806 non-null   int64  
 2   genre        6806 non-null   int64  
 3   director     6806 non-null   object 
 4   actor        6806 non-null   object 
 5   is_adult     6806 non-null   float64
 6   distributor  6806 non-null   object 
 7   openMonth    6806 non-null   int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 425.5+ KB


In [12]:
X = pd.get_dummies(data = X, columns=['nation'], prefix='region')

In [13]:
X = pd.get_dummies(data = X, columns=['openMonth'], prefix='month')

In [14]:
X = pd.get_dummies(data = X, columns=['genre'], prefix='genre')

In [15]:
X.columns

Index(['runTm', 'director', 'actor', 'is_adult', 'distributor', 'region_1',
       'region_2', 'region_3', 'month_1', 'month_2', 'month_3', 'month_4',
       'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10',
       'month_11', 'month_12', 'genre_1', 'genre_2', 'genre_3', 'genre_5',
       'genre_7', 'genre_8', 'genre_10', 'genre_11', 'genre_12'],
      dtype='object')

## 4. 데이터 스플릿

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13, stratify=y)

## 5. Normalise

In [17]:
X_train

Unnamed: 0,runTm,director,actor,is_adult,distributor,region_1,region_2,region_3,month_1,month_2,...,month_12,genre_1,genre_2,genre_3,genre_5,genre_7,genre_8,genre_10,genre_11,genre_12
619,111.0,219824.0,219824.0,0.0,374215.0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1314,78.0,214097.0,32517.5,0.0,16143.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
861,98.0,161342.5,146660.5,1.0,352226.5,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
5384,123.0,11067.0,11067.0,0.0,50196.0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4283,119.0,44770.5,49832.0,0.0,7213.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4209,104.0,47923.0,603634.0,0.0,528537.0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
726,110.0,1049616.5,446780.0,0.0,352226.5,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1414,166.0,477329.5,275944.5,0.0,367308.0,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,0
1523,100.0,60219.0,117006.0,0.0,361682.5,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [18]:
# train data의 runTm 컬럼만 normalisation fit 적용
from sklearn.preprocessing import MinMaxScaler

mm = MinMaxScaler()
X_train.iloc[:,0:5] = mm.fit_transform(X_train.iloc[:, 0:5])

In [19]:
X_train

Unnamed: 0,runTm,director,actor,is_adult,distributor,region_1,region_2,region_3,month_1,month_2,...,month_12,genre_1,genre_2,genre_3,genre_5,genre_7,genre_8,genre_10,genre_11,genre_12
619,0.309091,0.019324,0.021998,0.0,0.145299,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1314,0.159091,0.018818,0.003168,0.0,0.005895,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
861,0.250000,0.014160,0.014643,1.0,0.136739,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
5384,0.363636,0.000889,0.001012,0.0,0.019153,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4283,0.345455,0.003865,0.004909,0.0,0.002419,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4209,0.277273,0.004144,0.060582,0.0,0.205379,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
726,0.304545,0.092602,0.044814,0.0,0.136739,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1414,0.559091,0.042064,0.027640,0.0,0.142610,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,0
1523,0.259091,0.005230,0.011662,0.0,0.140420,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [20]:
# test 데이터의 runTm 컬럼도 같은 minMax scaler transform
X_test.iloc[:,0:5] = mm.transform(X_test.iloc[:, 0:5])

## 6. Modelling

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_clf = LogisticRegression(random_state=13, solver='liblinear')
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
lr_train = lr_clf.predict(X_train)

print("train accuracy : ", accuracy_score(y_train, lr_train))
print("test accuracy : ", accuracy_score(y_test, lr_pred))

train accuracy :  0.5226700251889169
test accuracy :  0.5176297747306562


In [22]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier(random_state=13, max_depth=8)
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
dt_train = dt_clf.predict(X_train)

print("train accuracy : ", accuracy_score(y_train, dt_train))
print("test accuracy : ", accuracy_score(y_test, dt_pred))

train accuracy :  0.8087741393786734
test accuracy :  0.7242899118511263


In [23]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=13, n_jobs=-1, n_estimators=100)
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
rf_train = rf_clf.predict(X_train)

print("train accuracy : ", accuracy_score(y_train, rf_train))
print("test accuracy : ", accuracy_score(y_test, rf_pred))

train accuracy :  0.9997900923593619
test accuracy :  0.7296767874632712


In [24]:
from lightgbm import LGBMClassifier

lgbm_clf = LGBMClassifier(random_state=13, n_jobs=-1, n_estimators=1000, num_leaves=64, boost_from_average = False)
lgbm_clf.fit(X_train, y_train)
lgbm_pred = lgbm_clf.predict(X_test)
lgbm_train = rf_clf.predict(X_train)

print("train accuracy : ", accuracy_score(y_train, lgbm_train))
print("test accuracy : ", accuracy_score(y_test, lgbm_pred))

train accuracy :  0.9997900923593619
test accuracy :  0.7321253672869735


---

## 7. Conclusion

- 첫 시도에 비해 중앙값으로 치환을 하니 확실히 성능이 좋아졌다.
- 다만 모델이 학습데이터에 과적합 된 성향이 보인다.