# Multilabel Random Forest

This notebook performs multilabel random forest classification on two feature sets: movie metadata features, and features generated from the movie posters through deep learning.

In [58]:
import pandas as pd
import sklearn as sk
from sklearn.metrics import hamming_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
import pickle
import sklearn.metrics as skmetrics
import numpy as np




In [46]:
# Bottleneck feature arrays for the train and validation splits, loaded
# from the .npy files saved alongside this notebook.
train_features, val_features = (
    np.load('bottleneck_features_train.npy'),
    np.load('bottleneck_features_validation.npy'),
)

In [3]:
# Multilabel arrays for the train and validation splits; train_multilabels
# is the target the bottleneck-feature forest is fitted on.
train_multilabels, val_multilabels = (
    np.load('train_multilabels.npy'),
    np.load('val_multilabels.npy'),
)

In [9]:
# Cleaned train / validation tables read from CSV. Each table holds the
# metadata columns followed by the label columns (the X/Y split is made
# at the 'Action' column).
clean_train, clean_val = (
    pd.read_csv("clean_train.csv"),
    pd.read_csv("clean_val.csv"),
)

In [10]:
# Remove the "id" column from both tables.
# The result is rebound instead of mutating in place, and errors="ignore"
# is passed, so re-running this cell after "id" is already gone does not
# raise (an in-place drop of a missing column fails on the second run).
clean_train = clean_train.drop("id", axis=1, errors="ignore")
clean_val = clean_val.drop("id", axis=1, errors="ignore")

In [11]:
# Split each table into features (X) and labels (Y).
# Every column before 'Action' is a feature; 'Action' and every column
# after it is a label.
# .iloc is used instead of the deprecated .ix indexer: the split point is a
# column position, so positional slicing is the explicit form of the same
# selection. The position is looked up once per table.
train_label_start = list(clean_train.columns).index('Action')
train_X = clean_train.iloc[:, :train_label_start]
train_Y = clean_train.iloc[:, train_label_start:]

val_label_start = list(clean_val.columns).index('Action')
test_X = clean_val.iloc[:, :val_label_start]
test_Y = clean_val.iloc[:, val_label_start:]


In [47]:
# Show the dimensions of the training bottleneck features before they are
# flattened for the random forest.
np.shape(train_features)

(9108, 7, 7, 512)

In [48]:
# Flatten each sample's feature map into one row so the array is 2-D
# (one row per sample), which is the layout passed to rf.fit.
# The row count is read from the array and -1 lets NumPy infer the row
# length, so the cell is not tied to the 9108 x 7 x 7 x 512 shape of one
# particular feature file.
train_features_reshaped = train_features.reshape(train_features.shape[0], -1)

In [49]:
# Show the dimensions of the validation bottleneck features before they
# are flattened for the random forest.
np.shape(val_features)

(1139, 7, 7, 512)

In [50]:
# Flatten each validation sample's feature map into one row (2-D array,
# one row per sample), the layout passed to rf.predict / rf.score.
# The row count is read from the array and -1 lets NumPy infer the row
# length, so the cell is not tied to the 1139 x 7 x 7 x 512 shape of one
# particular feature file.
val_features_reshaped = val_features.reshape(val_features.shape[0], -1)

## Random Forest on Bottleneck Features from Poster Images

In [51]:
# Random forest with 100 trees; n_jobs=-1 asks scikit-learn to parallelise
# across the available processors.
# NOTE(review): no random_state is passed, so the fitted forest (and the
# scores below) will presumably differ from run to run.
rf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
)

In [52]:
# Fit the forest on the flattened bottleneck features against the
# multilabel targets. fit() returns the estimator, which the notebook
# echoes as the cell output.
rf.fit(X=train_features_reshaped, y=train_multilabels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [56]:
# Report, in order: the forest's score on the training split, its score on
# the validation split, and the Hamming loss of its validation predictions.
# NOTE(review): rf was fitted on train_multilabels but is scored here
# against train_Y / test_Y taken from the CSV tables -- confirm that both
# label sources are aligned row for row and column for column.
print(rf.score(train_features_reshaped, train_Y))
print(rf.score(val_features_reshaped, test_Y))
print(hamming_loss(test_Y, rf.predict(val_features_reshaped)))

0.999231444884
0.00263388937665
0.132835820896


In [62]:
# Predict the validation labels once with the bottleneck-feature forest,
# then report several multilabel metrics computed on those predictions.
bottle_neck_preds = rf.predict(val_features_reshaped)

print('Performance of Bottleneck Random Forest Predictions on Validation Set')
print('hamming loss: %s' % skmetrics.hamming_loss(test_Y, bottle_neck_preds))
print('jaccard similarity: %s' % skmetrics.jaccard_similarity_score(test_Y, bottle_neck_preds))
print('zero one loss: %s' % skmetrics.zero_one_loss(test_Y, bottle_neck_preds))
print('exact match ratio: %s' % skmetrics.accuracy_score(test_Y, bottle_neck_preds))

Performance of Bottleneck Random Forest Predictions on Validation Set
hamming loss: 0.132835820896
jaccard similarity: 0.0126134035704
zero one loss: 0.997366110623
exact match ratio: 0.00263388937665


## Random Forest on Metadata

In [12]:
# Grid search over max_features for a 100-tree random forest on the
# metadata features, using 5-fold cross-validation.
# n_jobs=-1 appears twice: once in the grid (passed to each forest) and
# once on the search itself.
params = dict(
    n_estimators=[100],
    max_features=['auto', 'log2', None, 1, 3, 5, 7, 9],
    n_jobs=[-1],
)

CV_rf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=params,
    cv=5,
    n_jobs=-1,
)
CV_rf.fit(train_X, train_Y)

# Best parameter combination, its cross-validated score, and the score of
# the refitted best model on the validation split.
print(CV_rf.best_params_)
print(CV_rf.best_score_)
print(CV_rf.score(test_X, test_Y))

{'max_features': None, 'n_estimators': 100, 'n_jobs': -1}
0.0707070707071
0.0684811237928


In [14]:
# Feature importances of the best forest found by the grid search, one row
# per metadata column of train_X.
# The frame is the cell's last expression so the notebook renders it as a
# table. Writing it with to_csv("") used an empty file path and returned
# nothing to display.
pd.DataFrame(
    {
        'feature': train_X.columns,
        'importance': CV_rf.best_estimator_.feature_importances_,
    },
    columns=['feature', 'importance'],
)

Unnamed: 0,feature,importance
0,budget,0.066877
1,popularity,0.155417
2,rating,0.114598
3,revenue,0.051359
4,runtime,0.153397
5,vote_average,0.093953
6,vote_count,0.106557
7,votes,0.125159
8,year,0.132685


In [63]:
# Predict the validation labels once with the grid-searched metadata
# forest, then report several multilabel metrics on those predictions.
meta_preds = CV_rf.predict(test_X)

print('Performance of Metadata Random Forest Predictions on Validation Set')
print('hamming loss: %s' % skmetrics.hamming_loss(test_Y, meta_preds))
print('jaccard similarity: %s' % skmetrics.jaccard_similarity_score(test_Y, meta_preds))
print('zero one loss: %s' % skmetrics.zero_one_loss(test_Y, meta_preds))
print('exact match ratio: %s' % skmetrics.accuracy_score(test_Y, meta_preds))

Performance of Metadata Random Forest Predictions on Validation Set
hamming loss: 0.119710272169
jaccard similarity: 0.234351352481
zero one loss: 0.931518876207
exact match ratio: 0.0684811237928
