## Import stuff

In [1]:
# !pip install lda
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn import preprocessing as prep
from sklearn.feature_extraction import DictVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LassoLarsCV
from sklearn.linear_model import RidgeCV

import csv

from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn import tree

from sklearn.preprocessing import normalize
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
import warnings
import matplotlib.pyplot as plt

import lda

import pickle

%matplotlib inline

## import raw data

In [2]:
# Load the raw movie table, discard every row that has a missing value,
# and keep only the rows whose `language` column is exactly 'English'.
full = pd.read_csv('5000data.csv').dropna()
is_english = full['language'] == 'English'
full = full[is_english]
full.shape

(3598, 28)

### Text mining for plot-keywords

In [3]:
# Each `plot_keywords` entry is a '|'-separated string. Flatten all of them
# (row by row, left to right) and de-duplicate to obtain the vocabulary.
keyword_lists = full['plot_keywords'].str.split('|')
all_keywords = [keyword for keywords in keyword_lists for keyword in keywords]
vocab = list(set(all_keywords))
len(vocab)

6519

In [4]:
# Put the vocabulary into a deterministic (lexicographic) order; the set it
# was built from has no defined ordering.
vocab = sorted(vocab)

In [5]:
# Build one 0/1 indicator record per movie: every vocabulary word starts at 0
# and the words present in the movie's `plot_keywords` are switched to 1.
records = []
for _, row in full.iterrows():
    indicator = dict.fromkeys(vocab, 0)
    indicator.update((keyword, 1) for keyword in row['plot_keywords'].split('|'))
    records.append(indicator)

# One row per movie, one column per vocabulary word.
X = pd.DataFrame(records)

In [6]:
# Write the column labels of the indicator matrix to 'dictionary.csv' as a
# single CSV row.
mylist = list(X.columns)
# The context manager closes (and therefore flushes) the file. The original
# left the handle open, so the row could remain buffered and never reach disk.
with open('dictionary.csv', "w", newline="") as myfile:
    # NOTE(review): quotechar is the same character as the delimiter, so a
    # label containing a comma would be written ambiguously -- confirm whether
    # '"' was intended. Left unchanged to keep the file format identical.
    wr = csv.writer(myfile, delimiter=',', quotechar=',', quoting=csv.QUOTE_MINIMAL)
    wr.writerow(mylist)

83360

In [7]:
# Replace the DataFrame with its underlying NumPy array; the column labels are
# no longer reachable through X after this line, and re-running this cell on
# the array would fail because an ndarray has no `.values` attribute.
X = X.values

In [8]:
# Fit an LDA topic model on X with 30 topics, 2000 iterations and a fixed
# random_state of 1.
model = lda.LDA(n_topics=30, n_iter=2000, random_state=1)
model.fit(X)

# Keep a handle on the fitted topic/word array and report its Python type.
# NOTE(review): presumably one row per topic and one column per vocabulary
# word -- confirm against the lda package documentation.
topic_word = model.topic_word_
print("type(topic_word): {}".format(type(topic_word)))
# print("shape: {}".format(topic_word.shape))

type(topic_word): <class 'numpy.ndarray'>


In [9]:
# n = 10
# for i, topic_dist in enumerate(topic_word):
#     topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n+1):-1]
#     print('*Topic {}\n- {}'.format(i, ','.join(topic_words)))

*Topic 0
- president,hotel,photographer,investigation,farm,assassin,television,evil,satire,concert
*Topic 1
- cia,terrorist,spy,russian,assassin,rescue,rape,bomb,mission,british
*Topic 2
- police,detective,murder,heist,drugs,gangster,bank,prison,partner,criminal
*Topic 3
- island,monster,father daughter relationship,gold,alaska,forest,mountain,snow,food,survival
*Topic 4
- vomiting,coach,basketball,football,title directed by female,high school,college,halloween,teenage girl,two word title
*Topic 5
- soldier,revenge,ship,army,military,conspiracy,cult film,jew,california,colonel
*Topic 6
- new york city,love,writer,singer,reporter,band,actress,song,small town,magazine
*Topic 7
- battle,king,warrior,scientist,island,experiment,england,dragon,orphan,greek
*Topic 8
- friend,boy,baby,girl,restaurant,book,catholic,twin,france,soccer
*Topic 9
- death,vampire,murder,blood,1950s,hotel,independent film,team,revenge,artist
*Topic 10
- drugs,superhero,stripper,male objectification,undercover,cocain

In [10]:
# save the model
# A context manager guarantees the handle is closed and the pickle is fully
# flushed to disk; the original passed an anonymous open() that was never
# closed.
with open("lda.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [11]:
# Attach the fitted per-document topic weights to the movie table.
index = full.index
# Label the rows of doc_topic_ with full's index so they align with the
# movies on concatenation.
doc_topic = pd.DataFrame(model.doc_topic_, index=index)
# Derive the column names from the actual width instead of hard-coding 30, so
# this cell keeps working if the number of topics is changed.
topic_name = ['topic' + str(i) for i in range(doc_topic.shape[1])]
doc_topic.columns = topic_name
print(doc_topic.dropna().shape)
print(full.dropna().shape)
# `join_axes` was deprecated in pandas 0.25 and removed in 1.0; concatenating
# and then reindexing on full.index is the documented equivalent and also
# works on older pandas versions.
processed = pd.concat([full, doc_topic], axis=1).reindex(full.index).dropna()
print(processed.shape)
# movie_title = full.movie_title.tolist()
# for i in range(20):
#      print("{} (top topic: {})".format(movie_title[i], doc_topic[i].argmax()))

# The raw keyword string is not carried into `processed` beyond this point.
del processed['plot_keywords']

(3598, 30)
(3598, 28)
(3598, 58)


### Split genres

In [12]:
# One-hot encode the '|'-separated `genres` column into one float column
# (0.0 / 1.0) per genre label, appended at the end of `processed`.
genre_lists = processed['genres'].str.split('|')
# Distinct labels in order of first appearance (row by row, left to right);
# this fixes the order of the new columns.
genres = pd.Series(
    [label for labels in genre_lists for label in labels]
).drop_duplicates().tolist()
for genre in genres:
    # Test membership against the split labels, not the raw string: the
    # original `genre in row['genres']` was a substring test, so a label that
    # is contained in a longer one (e.g. 'Music' inside 'Musical') was flagged
    # for movies that do not carry it.
    # Plain column assignment also replaces DataFrame.set_value, which was
    # removed in pandas 1.0, and avoids the row-by-row Python loop.
    processed[genre] = genre_lists.apply(
        lambda labels, wanted=genre: float(wanted in labels)
    )
del processed['genres']

In [13]:
# Columns removed (by label) before the table is saved for modelling.
dropped_columns = [
    'facenumber_in_poster',
    'title_year',
    'Film-Noir',
    'color',
    'director_name',
    'movie_title',
    'actor_3_name',
    'actor_2_name',
    'actor_1_name',
    'movie_imdb_link',
    'content_rating',
    'aspect_ratio',
    'language',
    'country',
]
movie = processed.drop(dropped_columns, axis=1)

In [14]:
# movie = movie.dropna()
# Write the table to 'meaningful.csv' without the index column, then display
# its (rows, columns) shape.
movie.to_csv('meaningful.csv', index=False)
movie.shape

(3598, 64)

## further processing -- dealing with the mixture of continuous and discrete features

In [15]:
# Read 'meaningful.csv' back in. The file was written with index=False, so the
# frame gets a fresh default index here.
data = pd.read_csv('meaningful.csv')

### Standardize each continuous feature: $z = (x - \mu) / \sigma$

In [16]:
# Separate the regression target from the features: `pop` returns the 'gross'
# column and removes it from `data` in one step.
target = data.pop('gross')
# `.ix` was removed in pandas 1.0. Both slices are purely label-based, so
# `.loc` selects exactly the same columns:
#   - everything up to and including 'topic29' is treated as continuous,
#   - everything from 'Action' onwards is treated as discrete.
# NOTE(review): this relies on the column order of 'meaningful.csv' -- confirm
# 'Action' is the first indicator column after 'topic29'.
# The explicit copies make both frames independent of `data`, so the in-place
# scaling applied later does not operate on a slice of another frame.
data_continuous = data.loc[:, :'topic29'].copy()
data_discrete = data.loc[:, 'Action':].copy()
data_continuous.shape

(3598, 42)

In [17]:
# Snapshot the summary statistics of the continuous columns before scaling.
d1 = data_continuous.describe()

# Standardise every continuous column in turn, casting to float64 first.
col_names = list(data_continuous.columns)
for col in col_names:
    column_as_float = data_continuous[col].astype('float64')
    data_continuous[col] = prep.scale(column_as_float)

In [18]:
# Summary statistics of the continuous columns after standardisation
# (d1 holds the same summary taken before scaling).
d2 = data_continuous.describe()

### rescale to [-1, 1]

In [19]:
# Min-max rescale every column to [-1, 1], in place and step by step.
# 1) Shift each column so that its minimum becomes 0.
data_continuous -= data_continuous.min()
# 2) Divide by the column range; the minimum is already 0 after step 1, so
#    this equals dividing by the column maximum. Values now lie in [0, 1].
#    A constant column has range 0 and would produce NaN here.
data_continuous /= data_continuous.max() - data_continuous.min()
# 3) Stretch to [0, 2] ...
data_continuous *= 2
# 4) ... and shift down to [-1, 1].
data_continuous -= 1.0

In [20]:
# Reassemble the table column-wise: rescaled continuous columns first, then
# the discrete columns, with the target column last.
frames = [data_continuous, data_discrete, target]
newData = pd.concat(frames, axis=1)
newData.shape

(3598, 64)

In [21]:
# Write the reassembled table to 'rescaled.csv' without the index column.
newData.to_csv('rescaled.csv', index=False)

### Save the scaling statistics: mean/std of the raw features, min/max of the standardized features

In [22]:
# Collect the four statistics rows into one table with a 0..3 index:
#   row 0: mean of the unscaled columns        (from d1)
#   row 1: std of the unscaled columns         (from d1)
#   row 2: min of the standardised columns     (from d2)
#   row 3: max of the standardised columns     (from d2)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; a single
# pd.concat builds the same table and works on old and new pandas alike.
# NOTE(review): describe() reports the sample std (ddof=1), whereas
# sklearn's scale() divides by the population std (ddof=0) -- confirm which
# one consumers of these statistics expect.
d = pd.concat(
    [d1.loc[['mean', 'std']], d2.loc[['min', 'max']]],
    ignore_index=True,
)
d

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,num_voted_users,cast_total_facebook_likes,num_user_for_reviews,budget,actor_2_facebook_likes,...,topic20,topic21,topic22,topic23,topic24,topic25,topic26,topic27,topic28,topic29
0,167.959144,110.023624,822.84547,800.038911,8029.099222,107521.360756,11950.276265,341.292107,39724470.0,2100.710117,...,0.031027,0.030955,0.032679,0.031668,0.037181,0.033424,0.034188,0.035756,0.0314,0.03281
1,124.501918,22.198206,3111.794348,1929.937188,15780.513679,153724.786732,19411.4348,414.049811,43716750.0,4627.174791,...,0.068477,0.070052,0.07083,0.071616,0.087466,0.073814,0.075631,0.076858,0.072546,0.074268
2,-1.33317,-3.290076,-0.264465,-0.414599,-0.508869,-0.698946,-0.615716,-0.81473,-0.9087999,-0.454057,...,-0.27059,-0.263481,-0.284937,-0.267694,-0.282214,-0.283505,-0.286803,-0.302629,-0.260567,-0.273505
3,5.181691,9.911024,7.127797,11.504544,40.053116,10.294128,33.221107,11.398058,8.013501,29.157757,...,8.857867,8.659726,8.540259,8.460653,6.864398,8.184922,7.978207,7.830329,8.355887,8.143096


In [23]:
# Write the statistics table to 'statistics.csv' without the index column, so
# the row meaning is carried only by row position.
d.to_csv("statistics.csv", index = False)

In [24]:
# Display row 0 of the statistics table (the per-column means taken from d1).
d.loc[0]

num_critic_for_reviews       1.679591e+02
duration                     1.100236e+02
director_facebook_likes      8.228455e+02
actor_3_facebook_likes       8.000389e+02
actor_1_facebook_likes       8.029099e+03
num_voted_users              1.075214e+05
cast_total_facebook_likes    1.195028e+04
num_user_for_reviews         3.412921e+02
budget                       3.972447e+07
actor_2_facebook_likes       2.100710e+03
imdb_score                   6.427043e+00
movie_facebook_likes         9.491271e+03
topic0                       3.338895e-02
topic1                       3.495232e-02
topic2                       3.788054e-02
topic3                       3.114233e-02
topic4                       3.230733e-02
topic5                       3.219616e-02
topic6                       3.383067e-02
topic7                       3.334759e-02
topic8                       3.481336e-02
topic9                       3.347763e-02
topic10                      3.227722e-02
topic11                      3.464