# Random Forests Algorithm Adaptation

In [112]:
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [113]:
# Load the pickled movie table; judging by the file name and the i_* columns
# shown below, the genres are presumably already one-hot encoded.
# NOTE(review): unpickling can execute arbitrary code - only load a .pkl file
# that comes from a trusted source.
imdb5k = pd.read_pickle('imdb5000onehot.pkl')
# Preview the first rows as a quick sanity check of the load.
imdb5k.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,actor_1_name,...,i_Drama,i_Action,i_Documentary,i_Musical,i_History,i_Family,i_Fantasy,i_Game-Show,i_Sport,i_Biography
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,CCH Pounder,...,0,1,0,0,0,0,1,0,0,0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Johnny Depp,...,0,1,0,0,0,0,1,0,0,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Christoph Waltz,...,0,1,0,0,0,0,0,0,0,0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Tom Hardy,...,0,1,0,0,0,0,0,0,0,0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Doug Walker,...,0,0,1,0,0,0,0,0,0,0


In [114]:
# List every column name: the raw movie metadata followed by the i_* genre indicators.
imdb5k.columns

Index([u'color', u'director_name', u'num_critic_for_reviews', u'duration',
       u'director_facebook_likes', u'actor_3_facebook_likes', u'actor_2_name',
       u'actor_1_facebook_likes', u'gross', u'actor_1_name', u'movie_title',
       u'num_voted_users', u'cast_total_facebook_likes', u'actor_3_name',
       u'facenumber_in_poster', u'plot_keywords', u'movie_imdb_link',
       u'num_user_for_reviews', u'language', u'country', u'content_rating',
       u'budget', u'title_year', u'actor_2_facebook_likes', u'imdb_score',
       u'aspect_ratio', u'movie_facebook_likes', u'i_Sci-Fi', u'i_Crime',
       u'i_Romance', u'i_Animation', u'i_Music', u'i_Comedy', u'i_War',
       u'i_Horror', u'i_Film-Noir', u'i_Adventure', u'i_News', u'i_Reality-TV',
       u'i_Thriller', u'i_Western', u'i_Mystery', u'i_Short', u'i_Drama',
       u'i_Action', u'i_Documentary', u'i_Musical', u'i_History', u'i_Family',
       u'i_Fantasy', u'i_Game-Show', u'i_Sport', u'i_Biography'],
      dtype='object')

In [115]:
# print categoricals
# Columns stored with the generic 'object' dtype are the non-numeric fields
# (names, titles, links, ...); list them so we can decide which to encode.
for col in imdb5k.columns:
    if imdb5k[col].dtype == 'object':
        # Function-call form prints identically on Python 2 and also runs on Python 3.
        print(col)

color
director_name
actor_2_name
actor_1_name
movie_title
actor_3_name
plot_keywords
movie_imdb_link
language
country
content_rating


In [116]:
# Peek at the raw keyword strings (several keywords joined with '|', NaN when missing).
imdb5k['plot_keywords'].head()

0               avatar|future|marine|native|paraplegic
1    goddess|marriage ceremony|marriage proposal|pi...
2                  bomb|espionage|sequel|spy|terrorist
3    deception|imprisonment|lawlessness|police offi...
4                                                  NaN
Name: plot_keywords, dtype: object

In [117]:
# one-hot encode select categoricals
# Build the modelling frame: keep every non-object column as-is and expand only
# the object columns named in good_categories into dummy indicator columns.
# To include categoricals, list them here, e.g.
# good_categories = ['color', 'country', 'content_rating']
good_categories = []

# Collect the pieces in the original column order and concatenate once at the
# end, instead of re-building the frame on every loop iteration.
quant_pieces = []
for col in imdb5k.columns:
    if imdb5k[col].dtype != 'object':
        quant_pieces.append(imdb5k[col])
    elif col in good_categories:
        quant_pieces.append(pd.get_dummies(imdb5k[col]))

# pd.concat rejects an empty list, so fall back to an empty frame when no
# column qualified.
imdb_quant = pd.concat(quant_pieces, axis=1) if quant_pieces else pd.DataFrame()

In [118]:
# x y split
# Every column before the first genre indicator ('i_Sci-Fi') is a feature;
# that column and everything after it form the multi-label genre targets.
# .iloc replaces the .ix indexer (deprecated in pandas 0.20, removed in 1.0).
# The bounds are positional, which is how .ix interpreted these integers on
# string-labelled columns.
first_label_pos = list(imdb_quant.columns).index('i_Sci-Fi')
# Explicit copies so later in-place imputation does not write through to (or
# warn about) the parent frame imdb_quant.
imdb5k_X = imdb_quant.iloc[:, :first_label_pos].copy()
imdb5k_Y = imdb_quant.iloc[:, first_label_pos:].copy()

# impute
def impute(dataf):
    """Fill missing values in every column of ``dataf`` with that column's mean.

    The frame is modified in place and nothing is returned, so existing
    ``impute(frame)`` call sites keep working unchanged.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame whose columns all support ``.mean()`` (numeric columns).

    Notes
    -----
    Each filled column is assigned back to the frame rather than calling
    ``fillna(..., inplace=True)`` on the ``dataf[col]`` selection. That
    selection may be a temporary object, in which case the in-place fill
    never reaches ``dataf`` (this is always so under pandas copy-on-write).
    A column with no observed values has a NaN mean and is left unchanged.
    """
    for col in dataf.columns:
        dataf[col] = dataf[col].fillna(dataf[col].mean())
# Mean-impute features and labels in place.
# NOTE(review): the means are computed over all rows before the split, so
# held-out rows influence the values imputed into training rows - consider
# computing them on the training split only.
impute(imdb5k_X)
impute(imdb5k_Y)

# train test split
# random_state pins the shuffle so the same split (and therefore the scores
# below) can be reproduced on a re-run.
x_train, x_test, y_train, y_test = train_test_split(imdb5k_X, imdb5k_Y, stratify = imdb5k_Y, random_state=0)

In [119]:
# Grid-search how many features each split may consider (max_features) for a
# 100-tree random forest, using 5-fold cross-validation on the training split.
params = {
    'n_estimators': [100],
    # 'sqrt' is the explicit spelling of what 'auto' meant for classifiers;
    # 'auto' itself is rejected by scikit-learn >= 1.3.
    # list(range(...)) keeps the concatenation valid on Python 3, where
    # range() no longer returns a list.
    'max_features': ['sqrt', 'log2', None, 1] + list(range(5, 20, 5)),
    'n_jobs' : [-1]
}

# Fixed seed so repeated runs grow the same forests and report the same scores.
CV_rf = GridSearchCV(RandomForestClassifier(random_state=0), param_grid=params, cv=5, n_jobs=-1)
CV_rf.fit(x_train, y_train)
# Function-call print works identically on Python 2 (single argument) and Python 3.
print(CV_rf.best_params_)
print(CV_rf.best_score_)
# For multi-label targets the classifier score is subset accuracy: a row only
# counts as correct when every genre label matches.
print(CV_rf.score(x_test, y_test))

{'max_features': None, 'n_estimators': 100, 'n_jobs': -1}
0.486515071391
0.584456780333


In [120]:
# Hamming loss on the held-out split: the fraction of individual genre labels
# that the tuned forest predicts incorrectly.
hamming_loss(y_true=y_test, y_pred=CV_rf.predict(x_test))

0.04221313975477338

In [121]:
# Scratch check of how a max_features candidate list is assembled.
# print(...) and list(range(...)) make the cell run on Python 3 as well; the
# printed list is unchanged.
print(['auto', 'log2', None, 1] + list(range(5, 30, 5)))

['auto', 'log2', None, 1, 5, 10, 15, 20, 25]


In [122]:
# Tabulate each training feature next to its importance in the best forest
# selected by the grid search.
pd.DataFrame(dict(
    feature=x_train.columns,
    importance=CV_rf.best_estimator_.feature_importances_
))

Unnamed: 0,feature,importance
0,num_critic_for_reviews,0.066859
1,duration,0.107029
2,director_facebook_likes,0.056514
3,actor_3_facebook_likes,0.054535
4,actor_1_facebook_likes,0.048569
5,gross,0.074388
6,num_voted_users,0.069728
7,cast_total_facebook_likes,0.049365
8,facenumber_in_poster,0.03587
9,num_user_for_reviews,0.067889
