In [92]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from time import time
import numpy as np
import warnings
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import wordnet
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")
import functools

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
movies_df = pd.read_csv('joined_movies_w_factors_and_genomes.csv')
print(movies_df.columns.values)
print(movies_df.dtypes)
movies_df

['title_script' 'genre_script' 'filename' ..., 'G1126' 'G1127' 'G1128']
title_script     object
genre_script     object
filename         object
title_p          object
movieId           int64
title            object
genres           object
X1              float64
X2              float64
X3              float64
X4              float64
X5              float64
X6              float64
X7              float64
X8              float64
X9              float64
X10             float64
X11             float64
X12             float64
X13             float64
X14             float64
X15             float64
X16             float64
X17             float64
X18             float64
X19             float64
X20             float64
X21             float64
X22             float64
X23             float64
                 ...   
G1099           float64
G1100           float64
G1101           float64
G1102           float64
G1103           float64
G1104           float64
G1105           float64
G1106           

Unnamed: 0,title_script,genre_script,filename,title_p,movieId,title,genres,X1,X2,X3,...,G1119,G1120,G1121,G1122,G1123,G1124,G1125,G1126,G1127,G1128
0,10 Things I Hate About You,"Comedy,Romance",10ThingsIHateAboutYou.txt,10 things i hate about you,2572,10 Things I Hate About You (1999),Comedy|Romance,-0.016455,0.018789,-0.013277,...,0.03600,0.02150,0.02275,0.01975,0.26425,0.09425,0.01425,0.01475,0.08425,0.01900
1,12 Years a Slave,Drama,12YearsaSlave.txt,12 years a slave,105844,12 Years a Slave (2013),Drama,0.008645,0.003144,0.001322,...,0.07650,0.04550,0.08550,0.06875,0.16950,0.11200,0.02500,0.05175,0.09950,0.02600
2,127 Hours,"Adventure,Drama,Thriller",127Hours.txt,127 hours,81562,127 Hours (2010),Adventure|Drama|Thriller,0.012059,-0.001237,-0.005137,...,0.06000,0.02250,0.01625,0.11225,0.16725,0.10250,0.02800,0.01700,0.18200,0.03775
3,1492: Conquest of Paradise,"Adventure,Drama",1492ConquestofParadise.txt,1492: conquest of paradise,8905,1492: Conquest of Paradise (1992),Adventure|Drama,-0.001974,-0.005688,-0.007619,...,0.09000,0.04075,0.02200,0.02625,0.21350,0.05575,0.02325,0.02800,0.07675,0.01575
4,15 Minutes,"Action,Crime,Thriller",15Minutes.txt,15 minutes,4167,15 Minutes (2001),Thriller,-0.011999,-0.005283,-0.017477,...,0.04275,0.01425,0.01225,0.03300,0.33225,0.04800,0.02475,0.01300,0.10300,0.01825
5,17 Again,"Comedy,Drama,Romance",17Again.txt,17 again,68135,17 Again (2009),Comedy|Drama,-0.009621,0.009796,0.000850,...,0.02575,0.01700,0.01575,0.01975,0.23350,0.05800,0.01500,0.00725,0.05775,0.01350
6,187,Drama,187.txt,187,1609,187 (One Eight Seven) (1997),Drama|Thriller,-0.004879,-0.009280,-0.012055,...,0.04350,0.01600,0.00975,0.05325,0.23750,0.08475,0.02025,0.00900,0.07475,0.01625
7,2001: A Space Odyssey,"Adventure,Sci-Fi",2001ASpaceOdyssey.txt,2001: a space odyssey,924,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi,0.040611,-0.009755,0.003340,...,0.04100,0.02650,0.02225,0.07375,0.16975,0.10350,0.08850,0.01375,0.05700,0.01875
8,2012,"Action,Adventure,Drama,Sci-Fi,Thriller",2012.txt,2012,72378,2012 (2009),Action|Drama|Sci-Fi|Thriller,-0.021496,0.002300,0.003894,...,0.04725,0.01900,0.01825,0.01425,0.26750,0.08300,0.02050,0.01175,0.10775,0.01800
9,25th Hour,"Crime,Drama",25thHour.txt,25th hour,5954,25th Hour (2002),Crime|Drama,0.020300,-0.008028,-0.020965,...,0.10450,0.02650,0.01925,0.17400,0.33475,0.21325,0.03325,0.01850,0.06600,0.02000


For some reason there are still movies without scripts and movies with NA's (check why the NAs are there!!!). Remove them here.

In [3]:
movies_df = movies_df[pd.notnull(movies_df["filename"])]
movies_df = movies_df[movies_df["filename"].str.contains(r"\.txt$")]
movies_df = movies_df[(movies_df["filename"] != "Apollo13.txt") & (movies_df["filename"] != "ScaryMovie2.txt")]
movies_df = movies_df[pd.notnull(movies_df["X1"])]
#by removing movies with NA genome, we are remove some movies (over a dozen)
#that would be good for regressing against factors X1
movies_df = movies_df[pd.notnull(movies_df["G1"])]
movies_df.reset_index(inplace=True)

In [4]:
script_files = ["raw/" + file for file in movies_df["filename"]]

Some stuff on CountVectorizer from the documentation:

"Convert a collection of text documents to a matrix of token counts"

Makes everything lowercase by default.

Default tokenizer: "The default regexp select tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator)."

Can use a built-in stop word list for English.

It cannot be parallelized, but you can use HashingVectorizer instead, which has some limitations (can't figure out what words are associated with each feature?). CountVectorizer is fast enough for our ~800 scripts.

In [119]:
#We want to do some stemming before building the bag-of-words.
#We do this by building a new class that inherits from CountVectorizer.
#This is the simplest thing to do, as it allows us to use all the nice
#features from CountVectorizer (preprocessing to remove uppercase and 
#punctuation, decent tokenizing, stop words) and just add the stemming
#step, rather than having to rewrite everything.

class StemmedCountVectorizer(CountVectorizer):
    def __init__(self, stemmer, *args, **kwargs):
        super(StemmedCountVectorizer,self).__init__(*args, **kwargs)
        self.stemmer = stemmer
    
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer,self).build_analyzer()
        return lambda doc:[self.stemmer.stem(w) for w in analyzer(doc)]
    
class LemmaCountVectorizer(CountVectorizer):
    def __init__(self, *args, **kwargs):
        super(LemmaCountVectorizer,self).__init__(*args, **kwargs)
        self.wnl = WordNetLemmatizer()
        
    #the lemmatizer always needs a part of speech
    #use pos_tag to get the part of speech and then lemmatize
    #memoize this since pos_tag is very sloow
    @functools.lru_cache(maxsize=100000)
    def pos_tag_and_lemmatize(self,word):
        pos_tag = nltk.pos_tag([word])
        pos = get_wordnet_pos(pos_tag[0][1])
        return self.wnl.lemmatize(word,pos)
    
    def build_analyzer(self):
        analyzer = super(LemmaCountVectorizer,self).build_analyzer()
        return lambda doc:[self.pos_tag_and_lemmatize(w) for w in analyzer(doc)]

#The parts of speech returned by pos_tag do not match the ones
#from wordnet. Convert them.
#This could probably be a lot more efficient.
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [120]:
#vectorizer = StemmedCountVectorizer(input='filename',decode_error='ignore',stop_words='english', max_features=10000, stemmer=PorterStemmer())
vectorizer = LemmaCountVectorizer(input='filename',decode_error='ignore',stop_words='english', max_features=10000)

In [121]:
start = time()
features = vectorizer.fit_transform(script_files)
print(time()-start)

34.94370460510254


In [123]:
print(vectorizer.vocabulary_["great"])
print(movies_df.ix[484]["title"])
#great is 3993
#movie 484 is man on the moon
print(features[484,4033])
#we can verify that man of the moon has 17 "great"s

4033
Man on the Moon (1999)
18


In [20]:
def random_forest_grid_search(target):
    forest = RandomForestRegressor(n_jobs=-1)

    param_grid = {"max_features": [100,3333],
                  "max_depth": [None],
                  "n_estimators": [200,500,1000]}

    #3-fold by default
    grid_search = GridSearchCV(forest,param_grid=param_grid)
    start = time()
    grid_search.fit(features,target)
    elapsed = time() - start
    print("Time to fit: %f" % elapsed)
    #print("Fit scores (R^2):")
    #print(grid_search.grid_scores_)
    return grid_search

In [10]:
def print_importances(rf_model,n = 25):
    importances = rf_model.feature_importances_
    indices = np.argsort(importances)[::-1] #[::-1] just reverses
    
    print("Feature ranking:")
    for f in range(25):
        print("%d. %s (%f)" % (f+1,vectorizer.get_feature_names()[indices[f]], importances[indices[f]]))

In [11]:
def elastic_net_grid_search(target, features):
    elastic = ElasticNet(normalize=True)

    param_grid_el = {"l1_ratio": [0.0,0.1,0.3,0.5,0.7,0.9,1.0],
                     "alpha": [1,1e-1,1e-2,1e-3,1e-4,1e-5]}

    grid_search_el = GridSearchCV(elastic,param_grid=param_grid_el,cv=10,n_jobs=36)
    start = time()
    # We will often get a warning that the fit did not converge,
    # because many of the grid values may be bad
    with warnings.catch_warnings():
        grid_search_el.fit(features,target)
    print(time() - start)
    return grid_search_el

In [12]:
#First run an random forest regression to find the N most important features,
#then use only these features in a elastic net fit.
#From experimentation (though only with one particular target),
#it seems like 20-100 are decent values for n (they produce the best scores
#for the elastic net part). 100 was the best, but 20 and 50
#were also decent. 10 and 200 were significantly worse.
def elastic_forest(target, n=30):
    rf_grid = random_forest_grid_search(target)

    importances = rf_grid.best_estimator_.feature_importances_
    indices = np.argsort(importances)[::-1][0:n] #[::-1] just reverses
    feature_names = [vectorizer.get_feature_names()[i] for i in indices]
    importances_dict = dict(zip(feature_names,importances[indices]))

    en_grid = elastic_net_grid_search(target, features[:,indices])
    coefs = dict(zip(feature_names,en_grid.best_estimator_.coef_))

    return (rf_grid,en_grid,importances_dict,coefs)

In [124]:
x4_fit = elastic_forest(movies_df["X2"])

Time to fit: 166.221263




2.9274404048919678


In [125]:
x4_fit[0].grid_scores_



[mean: 0.11387, std: 0.01591, params: {'max_depth': None, 'max_features': 100, 'n_estimators': 200},
 mean: 0.11377, std: 0.01569, params: {'max_depth': None, 'max_features': 100, 'n_estimators': 500},
 mean: 0.11552, std: 0.01371, params: {'max_depth': None, 'max_features': 100, 'n_estimators': 1000},
 mean: 0.16044, std: 0.01012, params: {'max_depth': None, 'max_features': 3333, 'n_estimators': 200},
 mean: 0.16346, std: 0.01149, params: {'max_depth': None, 'max_features': 3333, 'n_estimators': 500},
 mean: 0.16711, std: 0.01069, params: {'max_depth': None, 'max_features': 3333, 'n_estimators': 1000}]

In [126]:
sorted(x4_fit[2].items(), key = lambda x: -x[1])

[('fuck', 0.054850781623666474),
 ('potter', 0.010621330855795729),
 ('blood', 0.010142782592096243),
 ('broom', 0.010052971894180775),
 ('gun', 0.0092559381637273858),
 ('spell', 0.00832434736554955),
 ('body', 0.0079501563313197736),
 ('marry', 0.0079494400527964654),
 ('fuckin', 0.0052358163829878809),
 ('oh', 0.0043346439210582488),
 ('lime', 0.0039777632427616307),
 ('wait', 0.0038316668809804315),
 ('hand', 0.0037419249590590182),
 ('mayhem', 0.0036766077433422217),
 ('scream', 0.00357860564689934),
 ('fucker', 0.0034804136724760092),
 ('ask', 0.0034727568767437115),
 ('hey', 0.0034491445060173512),
 ('pussy', 0.0031532051303577824),
 ('nod', 0.0031365851120619655),
 ('clothes', 0.0029444739969021897),
 ('shit', 0.0028440260978972978),
 ('motherfucker', 0.0027141203948427345),
 ('human', 0.0026611389927100283),
 ('personal', 0.0024712106269434812),
 ('potion', 0.0024525304451895927),
 ('wizard', 0.0024037023123344168),
 ('box', 0.0023545094528469633),
 ('ll', 0.002227311593665476

In [127]:
sorted(x4_fit[3].items(), key = lambda x: -x[1])

[('spell', 0.00097064238886463252),
 ('marry', 0.00079752562591627019),
 ('potter', 0.00014232661062047217),
 ('wait', 8.1402844837560184e-05),
 ('ask', 5.8391724075111484e-05),
 ('nod', 3.8462105303540334e-05),
 ('oh', 7.9112957164473072e-06),
 ('box', 0.0),
 ('wizard', 0.0),
 ('shit', -0.0),
 ('ll', 0.0),
 ('take', 0.0),
 ('broom', 0.0),
 ('human', -0.0),
 ('hey', 0.0),
 ('personal', 0.0),
 ('clothes', 0.0),
 ('mayhem', -0.0),
 ('potion', 0.0),
 ('hand', -0.0),
 ('gun', -1.7625198901923125e-05),
 ('fuckin', -3.3097260757019857e-05),
 ('fucker', -3.3685175614689498e-05),
 ('blood', -7.7865151526705301e-05),
 ('body', -9.157694115680816e-05),
 ('fuck', -0.00010641758753083732),
 ('lime', -0.00012587928935720136),
 ('scream', -0.00014828840409440982),
 ('motherfucker', -0.00029465984430895864),
 ('pussy', -0.00057671940286379035)]

In [128]:
#Logistic regression sometimes outperforms random forest after we have done feature selection
print(x4_fit[0].best_score_)
print(x4_fit[1].best_score_)

0.167108429766
0.199927441271


In [190]:
movies_df.sort_values(by="X4")[["title","X4"]]

Unnamed: 0,title,X4
635,"Rocky Horror Picture Show, The (1975)",-0.054130
288,"Flintstones, The (1994)",-0.050798
656,Serial Mom (1994),-0.049507
183,Clueless (1995),-0.048725
210,"Crying Game, The (1992)",-0.047368
523,Moonstruck (1987),-0.047073
21,"Addams Family, The (1991)",-0.046600
585,"Piano, The (1993)",-0.046274
494,Mary Poppins (1964),-0.044215
740,Thelma & Louise (1991),-0.040924


In [None]:
#next steps: visualization

In [118]:
LemmaCountVectorizer(input='content',decode_error='ignore',stop_words='english', max_features=10000).build_analyzer()("laughing")

['laugh']