In [78]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from time import time
import numpy as np

In [79]:
movies_df = pd.read_csv('joined_movies_w_factors_and_genomes.csv')
print(movies_df.columns.values)
print(movies_df.dtypes)
movies_df

['title_script' 'genre_script' 'filename' ..., 'G1126' 'G1127' 'G1128']
title_script     object
genre_script     object
filename         object
title_p          object
movieId           int64
title            object
genres           object
X1              float64
X2              float64
X3              float64
X4              float64
X5              float64
X6              float64
X7              float64
X8              float64
X9              float64
X10             float64
X11             float64
X12             float64
X13             float64
X14             float64
X15             float64
X16             float64
X17             float64
X18             float64
X19             float64
X20             float64
X21             float64
X22             float64
X23             float64
                 ...   
G1099           float64
G1100           float64
G1101           float64
G1102           float64
G1103           float64
G1104           float64
G1105           float64
G1106           

Unnamed: 0,title_script,genre_script,filename,title_p,movieId,title,genres,X1,X2,X3,...,G1119,G1120,G1121,G1122,G1123,G1124,G1125,G1126,G1127,G1128
0,10 Things I Hate About You,"Comedy,Romance",10ThingsIHateAboutYou.txt,10 things i hate about you,2572,10 Things I Hate About You (1999),Comedy|Romance,-0.016455,0.018789,-0.013277,...,0.03600,0.02150,0.02275,0.01975,0.26425,0.09425,0.01425,0.01475,0.08425,0.01900
1,12 Years a Slave,Drama,12YearsaSlave.txt,12 years a slave,105844,12 Years a Slave (2013),Drama,0.008645,0.003144,0.001322,...,0.07650,0.04550,0.08550,0.06875,0.16950,0.11200,0.02500,0.05175,0.09950,0.02600
2,127 Hours,"Adventure,Drama,Thriller",127Hours.txt,127 hours,81562,127 Hours (2010),Adventure|Drama|Thriller,0.012059,-0.001237,-0.005137,...,0.06000,0.02250,0.01625,0.11225,0.16725,0.10250,0.02800,0.01700,0.18200,0.03775
3,1492: Conquest of Paradise,"Adventure,Drama",1492ConquestofParadise.txt,1492: conquest of paradise,8905,1492: Conquest of Paradise (1992),Adventure|Drama,-0.001974,-0.005688,-0.007619,...,0.09000,0.04075,0.02200,0.02625,0.21350,0.05575,0.02325,0.02800,0.07675,0.01575
4,15 Minutes,"Action,Crime,Thriller",15Minutes.txt,15 minutes,4167,15 Minutes (2001),Thriller,-0.011999,-0.005283,-0.017477,...,0.04275,0.01425,0.01225,0.03300,0.33225,0.04800,0.02475,0.01300,0.10300,0.01825
5,17 Again,"Comedy,Drama,Romance",17Again.txt,17 again,68135,17 Again (2009),Comedy|Drama,-0.009621,0.009796,0.000850,...,0.02575,0.01700,0.01575,0.01975,0.23350,0.05800,0.01500,0.00725,0.05775,0.01350
6,187,Drama,187.txt,187,1609,187 (One Eight Seven) (1997),Drama|Thriller,-0.004879,-0.009280,-0.012055,...,0.04350,0.01600,0.00975,0.05325,0.23750,0.08475,0.02025,0.00900,0.07475,0.01625
7,2001: A Space Odyssey,"Adventure,Sci-Fi",2001ASpaceOdyssey.txt,2001: a space odyssey,924,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi,0.040611,-0.009755,0.003340,...,0.04100,0.02650,0.02225,0.07375,0.16975,0.10350,0.08850,0.01375,0.05700,0.01875
8,2012,"Action,Adventure,Drama,Sci-Fi,Thriller",2012.txt,2012,72378,2012 (2009),Action|Drama|Sci-Fi|Thriller,-0.021496,0.002300,0.003894,...,0.04725,0.01900,0.01825,0.01425,0.26750,0.08300,0.02050,0.01175,0.10775,0.01800
9,25th Hour,"Crime,Drama",25thHour.txt,25th hour,5954,25th Hour (2002),Crime|Drama,0.020300,-0.008028,-0.020965,...,0.10450,0.02650,0.01925,0.17400,0.33475,0.21325,0.03325,0.01850,0.06600,0.02000


For some reason there are still movies without scripts and movies with NA's (check why the NAs are there!!!). Remove them here.

In [80]:
movies_df = movies_df[pd.notnull(movies_df["filename"])]
movies_df = movies_df[movies_df["filename"].str.contains(r"\.txt$")]
movies_df = movies_df[(movies_df["filename"] != "Apollo13.txt") & (movies_df["filename"] != "ScaryMovie2.txt")]
movies_df = movies_df[pd.notnull(movies_df["X1"])]
#by removing movies with NA genome, we are remove some movies (over a dozen)
#that would be good for regressing against factors X1
movies_df = movies_df[pd.notnull(movies_df["G1"])]
movies_df.reset_index(inplace=True)

In [81]:
script_files = ["raw/" + file for file in movies_df["filename"]]

Some stuff on CountVectorizer from the documentation:

"Convert a collection of text documents to a matrix of token counts"

Makes everything lowercase by default.

Default tokenizer: "The default regexp select tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator)."

Can use a built-in stop word list for English.

It cannot be parallelized, but you can use HashingVectorizer instead, which has some limitations.

In [82]:
vectorizer = CountVectorizer(input='filename',decode_error='ignore',stop_words='english', max_features=20000)

In [83]:
start = time()
features = vectorizer.fit_transform(script_files)
print(time()-start)

12.092567443847656


In [90]:
print(vectorizer.vocabulary_["great"])
print(movies_df.ix[484]["title"])
#great is 7769
#movie 496 is man on the moon
print(features[484,7769])
#we can verify that man of the moon has 17 "great"s

7769
Man on the Moon (1999)
17


In [117]:
forest = RandomForestRegressor(n_jobs=-1)

param_grid = {"max_features": [141,6666],
              "max_depth": [None],
              "n_estimators": [200,500,1000]}

#3-fold by default
grid_search = GridSearchCV(forest,param_grid=param_grid)
start = time()
grid_search.fit(features,movies_df["G106"])
print(time() - start)

144.9140660762787


In [118]:
#scores are R^2, higher is better
grid_search.grid_scores_



[mean: 0.35038, std: 0.04502, params: {'max_features': 141, 'max_depth': None, 'n_estimators': 200},
 mean: 0.35504, std: 0.03350, params: {'max_features': 141, 'max_depth': None, 'n_estimators': 500},
 mean: 0.37048, std: 0.04502, params: {'max_features': 141, 'max_depth': None, 'n_estimators': 1000},
 mean: 0.75457, std: 0.03180, params: {'max_features': 6666, 'max_depth': None, 'n_estimators': 200},
 mean: 0.74114, std: 0.05510, params: {'max_features': 6666, 'max_depth': None, 'n_estimators': 500},
 mean: 0.75149, std: 0.04604, params: {'max_features': 6666, 'max_depth': None, 'n_estimators': 1000}]

In [119]:
grid_search.best_params_

{'max_depth': None, 'max_features': 6666, 'n_estimators': 200}

In [120]:
importances = grid_search.best_estimator_.feature_importances_
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for f in range(25):
    print("%d. %s (%f)" % (f+1,vectorizer.get_feature_names()[indices[f]], importances[indices[f]]))

Feature ranking:
1. baseball (0.183727)
2. catcher (0.134344)
3. plate (0.093668)
4. batter (0.078203)
5. fastball (0.076632)
6. dugout (0.056950)
7. inning (0.051195)
8. pitcher (0.037296)
9. umpire (0.035708)
10. infield (0.028031)
11. batting (0.027920)
12. pitch (0.022207)
13. hitter (0.018340)
14. baseman (0.014520)
15. ballpark (0.012006)
16. pitches (0.009953)
17. mound (0.007726)
18. delivers (0.005698)
19. locker (0.005672)
20. team (0.005522)
21. dodger (0.004978)
22. outfield (0.004974)
23. stump (0.004921)
24. players (0.004508)
25. increases (0.004413)


In [116]:
movies_df.sort_values(by=["G106"],ascending=[0])

Unnamed: 0,index,title_script,genre_script,filename,title_p,movieId,title,genres,X1,X2,...,G1119,G1120,G1121,G1122,G1123,G1124,G1125,G1126,G1127,G1128
150,161,Bull Durham,"Comedy,Drama,Romance",BullDurham.txt,bull durham,3361,Bull Durham (1988),Comedy|Drama|Romance,0.011649,0.026957,...,0.04900,0.02725,0.02650,0.05150,0.22650,0.11125,0.02250,0.02000,0.11225,0.02650
280,300,Field of Dreams,"Drama,Family,Fantasy",FieldofDreams.txt,field of dreams,1302,Field of Dreams (1989),Children|Drama|Fantasy,-0.000976,0.042919,...,0.04750,0.02725,0.02725,0.03700,0.18975,0.09800,0.02675,0.02425,0.06825,0.01975
11,11,42,Drama,42.txt,42,101895,42 (2013),Drama,-0.004325,0.003346,...,0.04775,0.07500,0.13600,0.04525,0.24775,0.10000,0.01025,0.09900,0.08450,0.02200
715,769,Sugar,Drama,Sugar.txt,sugar,67223,Sugar (2008),Drama,0.002297,0.000050,...,0.02675,0.03275,0.01950,0.11725,0.32425,0.07175,0.02850,0.00725,0.06975,0.02000
184,199,Cobb,Drama,Cobb.txt,cobb,354,Cobb (1994),Drama,0.000235,-0.007836,...,0.07425,0.02250,0.02075,0.07675,0.35350,0.22800,0.02525,0.01125,0.08775,0.01750
519,558,Moneyball,Drama,Moneyball.txt,moneyball,89492,Moneyball (2011),Drama,0.009304,0.011955,...,0.13725,0.02450,0.03375,0.05900,0.24950,0.20650,0.01950,0.03075,0.09075,0.02250
480,517,Major League,Comedy,MajorLeague.txt,major league,4623,Major League (1989),Comedy,-0.008272,0.003835,...,0.05000,0.02400,0.03025,0.02725,0.11175,0.03800,0.02075,0.02150,0.10050,0.02375
621,668,"Replacements, The",Comedy,ReplacementsThe.txt,replacements,3861,"Replacements, The (2000)",Comedy,-0.023455,0.002488,...,0.03125,0.02075,0.01650,0.02025,0.13475,0.04150,0.01650,0.01325,0.09600,0.01975
213,230,"Damned United, The",Drama,DamnedUnitedThe.txt,damned united,68194,"Damned United, The (2009)",Drama,0.005242,0.000879,...,0.12350,0.05000,0.02975,0.10100,0.20125,0.10525,0.02475,0.02150,0.09225,0.02625
752,810,Tin Cup,"Comedy,Drama,Romance",TinCup.txt,tin cup,852,Tin Cup (1996),Comedy|Drama|Romance,-0.014728,0.015321,...,0.04350,0.02350,0.01650,0.03225,0.19200,0.08625,0.02025,0.01350,0.08375,0.01775
