### 데이터 불러오기

In [1]:
import pandas as pd

train = pd.read_csv('./data/train.csv')
test  = pd.read_csv('./data/test.csv')
sub   = pd.read_csv('./data/sample_submission.csv')

In [29]:
train.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue'],
      dtype='object')

### 라벨링
#### --> original_language, production_companies, production_countries, spoken_languages

In [2]:
from sklearn.preprocessing import LabelEncoder

#### 1) original_language

In [3]:
print(len(train.original_language.value_counts()))
train.original_language.value_counts()

36


en    2575
fr      78
ru      47
es      43
hi      42
ja      37
it      24
cn      20
ko      20
zh      19
de      18
ta      16
sv       8
pt       6
nl       6
da       5
fa       5
ro       4
tr       3
hu       3
no       2
pl       2
te       2
fi       2
ml       2
mr       1
nb       1
vi       1
sr       1
cs       1
ar       1
he       1
id       1
el       1
ur       1
bn       1
Name: original_language, dtype: int64

In [4]:
print(len(test.original_language.value_counts()))
test.original_language.value_counts()

39


en    3776
fr     121
hi      76
ru      62
ja      53
es      52
it      32
de      31
ko      29
zh      27
cn      21
ta      15
sv      12
da      12
ml      10
pt       7
te       7
tr       6
ro       5
th       5
nl       5
he       5
no       3
pl       3
cs       2
sr       2
fi       2
xx       2
bm       2
bn       2
id       2
el       2
ur       1
kn       1
ka       1
hu       1
af       1
ca       1
is       1
Name: original_language, dtype: int64

In [5]:
label = LabelEncoder()
label.fit(train['original_language'])

LabelEncoder()

In [6]:
label.classes_

array(['ar', 'bn', 'cn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi',
       'fr', 'he', 'hi', 'hu', 'id', 'it', 'ja', 'ko', 'ml', 'mr', 'nb',
       'nl', 'no', 'pl', 'pt', 'ro', 'ru', 'sr', 'sv', 'ta', 'te', 'tr',
       'ur', 'vi', 'zh'], dtype=object)

In [7]:
label.transform(train['original_language'])

array([7, 7, 7, ..., 7, 7, 7], dtype=int64)

In [8]:
train['original_language'] = label.transform(train['original_language'])
train['original_language'][0:10]

0     7
1     7
2     7
3    13
4    18
5     7
6     7
7     7
8     7
9     7
Name: original_language, dtype: int64

#### but, train 데이터와 test 데이터의 original_language 의 갯수가 다르다. 
#### 즉, 각각 라벨링 했을 경우 정확하지 않을 수 있다.
#### train과 test를 같이 묶어서 데이터 전처리를 하고 분리하는 방법이 낫다.  

In [9]:
print(train.shape)
print(test.shape)

(3000, 23)
(4398, 22)


In [10]:
# test데이터에는 예측값인 revenue가 없다. 임시로 만들어 준다. 

test['revenue'] = 0 

In [11]:
print(train.shape)
print(test.shape)

(3000, 23)
(4398, 23)


In [12]:
# 위에서 라벨링을 해주었기에 여기서 다시 데이터를 불러온다.

train = pd.read_csv('./data/train.csv')
test  = pd.read_csv('./data/test.csv')

In [13]:
all = pd.concat([train, test], ignore_index = True )
print(all.shape)

(7398, 23)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [14]:
print(len(all['original_language'].value_counts()))

44


In [15]:
tr_lan = dict(train['original_language'].value_counts())
te_lan = dict(test['original_language'].value_counts())

In [16]:
print('train : ', len(tr_lan), '   test : ', len(te_lan))

train :  36    test :  39


In [17]:
tr_lan

{'en': 2575,
 'fr': 78,
 'ru': 47,
 'es': 43,
 'hi': 42,
 'ja': 37,
 'it': 24,
 'cn': 20,
 'ko': 20,
 'zh': 19,
 'de': 18,
 'ta': 16,
 'sv': 8,
 'pt': 6,
 'nl': 6,
 'da': 5,
 'fa': 5,
 'ro': 4,
 'tr': 3,
 'hu': 3,
 'no': 2,
 'pl': 2,
 'te': 2,
 'fi': 2,
 'ml': 2,
 'mr': 1,
 'nb': 1,
 'vi': 1,
 'sr': 1,
 'cs': 1,
 'ar': 1,
 'he': 1,
 'id': 1,
 'el': 1,
 'ur': 1,
 'bn': 1}

In [18]:
tr_lan = list(tr_lan.keys())
tr_lan

['en',
 'fr',
 'ru',
 'es',
 'hi',
 'ja',
 'it',
 'cn',
 'ko',
 'zh',
 'de',
 'ta',
 'sv',
 'pt',
 'nl',
 'da',
 'fa',
 'ro',
 'tr',
 'hu',
 'no',
 'pl',
 'te',
 'fi',
 'ml',
 'mr',
 'nb',
 'vi',
 'sr',
 'cs',
 'ar',
 'he',
 'id',
 'el',
 'ur',
 'bn']

In [19]:
te_lan = list(te_lan.keys())
te_lan

['en',
 'fr',
 'hi',
 'ru',
 'ja',
 'es',
 'it',
 'de',
 'ko',
 'zh',
 'cn',
 'ta',
 'sv',
 'da',
 'ml',
 'pt',
 'te',
 'tr',
 'ro',
 'th',
 'nl',
 'he',
 'no',
 'pl',
 'cs',
 'sr',
 'fi',
 'xx',
 'bm',
 'bn',
 'id',
 'el',
 'ur',
 'kn',
 'ka',
 'hu',
 'af',
 'ca',
 'is']

In [20]:
all_lan = tr_lan + te_lan
print(len(tr_lan), ' + ', len(te_lan), ' = ', '중복포함 ',len(all_lan))

36  +  39  =  중복포함  75


In [21]:
all_lan = list(set(all_lan))
len(all_lan)

44

#### 전체 데이터를 기준으로 라벨링 다시 하기

In [22]:
from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()
label.fit(all['original_language'])
print(len(label.classes_))
label.classes_

44


array(['af', 'ar', 'bm', 'bn', 'ca', 'cn', 'cs', 'da', 'de', 'el', 'en',
       'es', 'fa', 'fi', 'fr', 'he', 'hi', 'hu', 'id', 'is', 'it', 'ja',
       'ka', 'kn', 'ko', 'ml', 'mr', 'nb', 'nl', 'no', 'pl', 'pt', 'ro',
       'ru', 'sr', 'sv', 'ta', 'te', 'th', 'tr', 'ur', 'vi', 'xx', 'zh'],
      dtype=object)

In [23]:
all['original_language'] = label.transform(all['original_language'])
all.head(7)

Unnamed: 0,Keywords,belongs_to_collection,budget,cast,crew,genres,homepage,id,imdb_id,original_language,...,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title
0,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...","[{'id': 35, 'name': 'Comedy'}]",,1,tt2637294,10,...,/tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg,"[{'name': 'Paramount Pictures', 'id': 4}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",2/20/15,12314651.0,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2
1,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,2,tt0368933,10,...,/w9Z7A0GHEhIp7etpj0vyKOeU1Wx.jpg,"[{'name': 'Walt Disney Pictures', 'id': 2}]","[{'iso_3166_1': 'US', 'name': 'United States o...",8/6/04,95149435.0,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement
2,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...",,3300000,"[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...","[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,3,tt2582802,10,...,/lIv1QinFqz4dlp5U4lQ6HaiskOZ.jpg,"[{'name': 'Bold Films', 'id': 2266}, {'name': ...","[{'iso_3166_1': 'US', 'name': 'United States o...",10/10/14,13092000.0,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash
3,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...",,1200000,"[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...","[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,4,tt1821480,16,...,/aTXRaPrWSinhcmCrcfJK17urp3F.jpg,,"[{'iso_3166_1': 'IN', 'name': 'India'}]",3/9/12,16000000.0,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani
4,,,0,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...","[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,5,tt1380152,24,...,/m22s7zvkVFDU9ir56PiiqIEWFdT.jpg,,"[{'iso_3166_1': 'KR', 'name': 'South Korea'}]",2/5/09,3923970.0,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy
5,,,8000000,"[{'cast_id': 6, 'character': 'Pinocchio (voice...","[{'credit_id': '52fe46f49251416c9106558b', 'de...","[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",,6,tt0093743,10,...,/6IDqA1D2NBIVhzEEaMMRL28iBrq.jpg,,,8/6/87,3261638.0,83.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Pinocchio and the Emperor of the Night
6,,,14000000,"[{'cast_id': 23, 'character': 'Clyde', 'credit...","[{'credit_id': '52fe4981c3a368484e12ee29', 'de...","[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",http://www.thepossessionmovie.com/,7,tt0431021,10,...,/4QjzFuaZmB4btGnLwAgdp23BzIU.jpg,"[{'name': 'Ghost House Pictures', 'id': 768}, ...","[{'iso_3166_1': 'US', 'name': 'United States o...",8/30/12,85446075.0,92.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Fear The Demon That Doesn't Fear God,The Possession


In [24]:
print(len(all['original_language'].value_counts()))

44


### 데이터 조정 및 머신러닝

In [25]:
f_names = ['budget', 'popularity', 'original_language' ]

In [26]:
print(train.shape)
print(test.shape)

(3000, 23)
(4398, 22)


In [38]:
train[:1]

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651


In [41]:
train[-1:]

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
2999,3000,,35000000,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",http://www.abductionthefilm.com/,tt1600195,en,Abduction,A young man sets out to uncover the truth abou...,10.512109,...,9/22/11,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,They stole his life. He's taking it back.,Abduction,"[{'id': 591, 'name': 'cia'}, {'id': 822, 'name...","[{'cast_id': 2, 'character': 'Nathan Harper', ...","[{'credit_id': '5391990d0e0a260fb5001629', 'de...",82087155


In [46]:
train[train['id'] == 3000]

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
2999,3000,,35000000,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",http://www.abductionthefilm.com/,tt1600195,en,Abduction,A young man sets out to uncover the truth abou...,10.512109,...,9/22/11,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,They stole his life. He's taking it back.,Abduction,"[{'id': 591, 'name': 'cia'}, {'id': 822, 'name...","[{'cast_id': 2, 'character': 'Nathan Harper', ...","[{'credit_id': '5391990d0e0a260fb5001629', 'de...",82087155


In [48]:
test.head(2)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew
0,3001,"[{'id': 34055, 'name': 'Pokémon Collection', '...",0,"[{'id': 12, 'name': 'Adventure'}, {'id': 16, '...",http://www.pokemon.com/us/movies/movie-pokemon...,tt1226251,ja,ディアルガVSパルキアVSダークライ,Ash and friends (this time accompanied by newc...,3.851534,...,"[{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_...",7/14/07,90.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Somewhere Between Time & Space... A Legend Is ...,Pokémon: The Rise of Darkrai,"[{'id': 11451, 'name': 'pok√©mon'}, {'id': 115...","[{'cast_id': 3, 'character': 'Tonio', 'credit_...","[{'credit_id': '52fe44e7c3a368484e03d683', 'de..."
1,3002,,88000,"[{'id': 27, 'name': 'Horror'}, {'id': 878, 'na...",,tt0051380,en,Attack of the 50 Foot Woman,When an abused wife grows to giant size becaus...,3.559789,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",5/19/58,65.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A titanic beauty spreads a macabre wave of hor...,Attack of the 50 Foot Woman,"[{'id': 9748, 'name': 'revenge'}, {'id': 9951,...","[{'cast_id': 2, 'character': 'Nancy Fowler Arc...","[{'credit_id': '55807805c3a3685b1300060b', 'de..."


In [49]:
test.tail(2)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew
4396,7397,,2000000,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",,tt3235888,en,It Follows,"For 19-year-old Jay, fall should be about scho...",20.359336,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",2/4/15,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"It doesn't think, it doesn't feel, it doesn't ...",It Follows,"[{'id': 3713, 'name': 'chase'}, {'id': 6152, '...","[{'cast_id': 1, 'character': 'Jay Height', 'cr...","[{'credit_id': '537770b20e0a261431002299', 'de..."
4397,7398,,64000,"[{'id': 18, 'name': 'Drama'}]",,tt0056663,fr,Vivre sa vie: film en douze tableaux,Twelve episodic tales in the life of a Parisia...,11.30591,...,"[{'iso_3166_1': 'FR', 'name': 'France'}]",9/20/62,85.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,The many faces of a woman trying to find herse...,Vivre Sa Vie,"[{'id': 90, 'name': 'paris'}, {'id': 490, 'nam...","[{'cast_id': 8, 'character': 'Nana Kleinfranke...","[{'credit_id': '52fe4306c3a36847f80349a5', 'de..."


In [61]:
train.iloc[2999]

id                                                                    3000
belongs_to_collection                                                  NaN
budget                                                            35000000
genres                   [{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...
homepage                                  http://www.abductionthefilm.com/
imdb_id                                                          tt1600195
original_language                                                       en
original_title                                                   Abduction
overview                 A young man sets out to uncover the truth abou...
popularity                                                         10.5121
poster_path                               /cUT6NQP5LAJpmUoStGtXmvNt4zA.jpg
production_companies     [{'name': 'Lions Gate Films', 'id': 35}, {'nam...
production_countries     [{'iso_3166_1': 'US', 'name': 'United States o...
release_date             

In [69]:
x_train = all[0:3000]
x_test = all[3000:]

In [70]:
x_train = x_train[f_names]
x_test  = x_test[f_names]

label_name = 'revenue'
y_train = train[label_name]

In [71]:
x_train.head()

Unnamed: 0,budget,popularity,original_language
0,14000000,6.575393,10
1,40000000,8.248895,10
2,3300000,64.29999,10
3,1200000,3.174936,16
4,0,1.14807,24


In [72]:
x_test.tail()

Unnamed: 0,budget,popularity,original_language
4393,42000000,9.970359,10
4394,19000000,6.046516,10
4395,16000000,9.596883,10
4396,2000000,20.359336,10
4397,64000,11.30591,14


#### Linear Regression

In [73]:
from sklearn import linear_model

LR_model = linear_model.LinearRegression()
LR_model.fit(x_train, y_train)

predictions = LR_model.predict(x_test)
predictions[0:10]

array([ 1.81494503e+06, -3.08263420e+06,  8.61647107e+06,  2.86046835e+07,
        8.13385621e+05,  1.01814750e+07, -5.67151843e+05,  8.10885741e+07,
        4.93414530e+07,  2.59347124e+08])

In [74]:
sub['revenue'] = predictions
sub.to_csv('LR2.csv', index = False)

In [None]:
#                                        6.16543
# original_language 라벨링 + 모델 기본 : 6.03117

####   
#### DecisionTree Regressor 

In [75]:
from sklearn.tree import DecisionTreeRegressor

DT_model = DecisionTreeRegressor()
DT_model.fit(x_train, y_train)

predictions = DT_model.predict(x_test)
predictions[0:10]

array([2.1533900e+05, 5.2500000e+02, 1.6775000e+04, 5.0275800e+05,
       2.2606220e+06, 3.4585416e+07, 3.7659700e+05, 9.5714875e+07,
       5.9209000e+04, 6.6094078e+08])

In [76]:
sub['revenue'] = predictions
sub.to_csv('DT2.csv', index = False)

In [None]:
#                                         3.11207
# original_language 라벨링 + 모델 기본 :  3.18784

####   
#### RandomForest Regressor

In [78]:
from sklearn.ensemble import RandomForestRegressor

RF_model = RandomForestRegressor()
RF_model.fit(x_train, y_train)

predictions = RF_model.predict(x_test)
predictions[0:10]

array([1.55575881e+07, 1.38669170e+06, 1.42066228e+07, 9.50815320e+06,
       9.48620830e+06, 2.60908462e+07, 5.72903870e+06, 5.22927557e+07,
       4.05677780e+07, 3.20374460e+08])

In [79]:
sub['revenue'] = predictions
sub.to_csv('RF1.csv', index = False)

In [None]:
#                                        2.65834
# original_language 라벨링 + 모델 기본 : 2.65130


####   
#### Xgboost Regressor 

In [80]:
import xgboost as xgb

xgb_model = xgb.XGBRegressor()
xgb_model.fit(x_train, y_train)

predictions = xgb_model.predict(x_test)
predictions[0:10]

array([5.5947945e+06, 5.5947945e+06, 1.9142972e+07, 3.7316384e+07,
       5.5947945e+06, 2.2716292e+07, 5.5947945e+06, 6.2679552e+07,
       4.0635500e+07, 2.8384006e+08], dtype=float32)

In [81]:
sub['revenue'] = predictions
sub.to_csv('XGB2.csv', index = False)

In [None]:
#                                        2.79813
# original_language 라벨링 + 모델 기본 : 2.79577


In [67]:
# 리스트에서 중복값 세기 
                        
from collections import Counter

Counter(all_lan)

Counter({'en': 1,
         'it': 1,
         'de': 1,
         'pt': 1,
         'he': 1,
         'bm': 1,
         'zh': 1,
         'el': 1,
         'hu': 1,
         'vi': 1,
         'cs': 1,
         'pl': 1,
         'kn': 1,
         'nb': 1,
         'fr': 1,
         'ar': 1,
         'cn': 1,
         'xx': 1,
         'ru': 1,
         'id': 1,
         'ro': 1,
         'bn': 1,
         'ka': 1,
         'te': 1,
         'is': 1,
         'sr': 1,
         'mr': 1,
         'af': 1,
         'hi': 1,
         'es': 1,
         'tr': 1,
         'fi': 1,
         'ur': 1,
         'da': 1,
         'sv': 1,
         'ca': 1,
         'ko': 1,
         'th': 1,
         'ml': 1,
         'no': 1,
         'ta': 1,
         'nl': 1,
         'fa': 1,
         'ja': 1})

In [83]:
from collections import Counter

sk = [1,3,2,1,3,3,23,2,4,2,2,3]
Counter(sk)

Counter({1: 2, 3: 4, 2: 4, 23: 1, 4: 1})