# Import movie files

In [None]:
import pandas as pd
import datetime
import numpy as np
import re

In [None]:
movie_file = 'movies2000_2016.csv'
plot_file = 'plots2000_2016.csv'
actor_file = 'actors2000_2016.csv'
director_file = 'directors2000_2016.csv'

In [39]:
movie_df = pd.read_csv(movie_file)
plot_df = pd.read_csv(plot_file)
actor_df = pd.read_csv(actor_file)
director_df = pd.read_csv(director_file)

In [40]:
movie_df.columns = [cat.lower() for cat in movie_df.columns]
movie_df = movie_df[['\nrunning time\n', 'based on' ,'box office' ,'budget',
 'cinematography' ,'country' ,'directed by', 'distributed by',
 'edited by', 'language' , 'music by', 'narrated by',
 'original title', 'produced by' , 'screenplay by',
 'simplified', 'starring' ,'story by' ,'traditional' ,'written by', 'site',
 'title', 'studio' ,'revenues' ,'costs', 'release_dates' ,'release_year' ,'release_month',
 'release_day']]

movie_df.columns = ['length', 'based_on', 'box_office', 'budget', 'cinematography', 'country', 'director', 'distributor',
                   'editor', 'language', 'music', 'narration', 'original_title', 'producer', 'screenplay', 'simplified', 'starring', 'story', 
                   'traditional', 'written_by', 'site', 'title', 'studio', 'revenues' ,'costs', 'release_dates' ,'release_year' ,'release_month',
 'release_day']
print movie_df.columns.values
print "---------------------------------------------------------------------------"
print movie_df[:4]

['length' 'based_on' 'box_office' 'budget' 'cinematography' 'country'
 'director' 'distributor' 'editor' 'language' 'music' 'narration'
 'original_title' 'producer' 'screenplay' 'simplified' 'starring' 'story'
 'traditional' 'written_by' 'site' 'title' 'studio' 'revenues' 'costs'
 'release_dates' 'release_year' 'release_month' 'release_day']
---------------------------------------------------------------------------
        length                                        based_on  \
0  100 minutes  The Hundred and One Dalmatians\nby Dodie Smith   
1  103 minutes  The Hundred and One Dalmatians\nby Dodie Smith   
2  104 minutes                                             NaN   
3   82 minutes                                             NaN   

          box_office           budget cinematography        country  \
0  $183.6 million[1]      $85 million  Adrian Biddle  United States   
1  $320.6 million[2]   $75 million[1]  Adrian Biddle  United States   
2   $62.2 million[1]   $43 million[1

In [41]:
plot_df.columns = ['site', 'plot']
print plot_df.columns.values
print "---------------------------------------------------------------------------"
print plot_df[:2]

['site' 'plot']
---------------------------------------------------------------------------
                                                site  \
0  https://en.wikipedia.org/wiki/%22Crocodile%22_...   
1  https://en.wikipedia.org/wiki/%C3%86on_Flux_(f...   

                                                plot  
0  A year has passed since the events of Crocodil...  
1  In 2011, a deadly pathogenic virus has killed ...  


In [42]:
print actor_df.columns.values
print "---------------------------------------------------------------------------"
print actor_df[:2]

['actor_name' 'actor_name.1' 'movies' 'revenues' 'costs' 'gross_profit'
 'return' 'count']
---------------------------------------------------------------------------
     actor_name  actor_name.1  \
0  Jacki Weaver  Jacki Weaver   
1   Paul McGill   Paul McGill   

                                              movies     revenues       costs  \
0  [https://en.wikipedia.org/wiki/Magic_in_the_Mo...  301356377.0  70800000.0   
1   [https://en.wikipedia.org/wiki/Fame_(2009_film)]   77184633.0  18000000.0   

   gross_profit    return  count  
0    79878188.5  1.128223      5  
1    20592316.5  1.144018      1  


In [43]:
print director_df.columns.values
print "---------------------------------------------------------------------------"
print director_df[:2]

['0' 'director_name' 'movies' 'revenues' 'costs' 'gross_profit' 'return'
 'count']
---------------------------------------------------------------------------
               0  director_name  \
0            NaN            NaN   
1  Jamal Hill[1]  Jamal Hill[1]   

                                              movies  revenues      costs  \
0  [https://en.wikipedia.org/wiki/List_of_Marvel_...       0.0        0.0   
1  [https://en.wikipedia.org/wiki/Brotherly_Love_...  478595.0  1900000.0   

   gross_profit    return  count  
0           0.0       NaN      2  
1    -1660702.5 -0.874054      1  


In [44]:
movie_list_df = movie_df.merge(plot_df, on=[u'site'])

In [45]:
print movie_list_df.columns.values
print "---------------------------------------------------------------------------"
print movie_list_df[:2]

['length' 'based_on' 'box_office' 'budget' 'cinematography' 'country'
 'director' 'distributor' 'editor' 'language' 'music' 'narration'
 'original_title' 'producer' 'screenplay' 'simplified' 'starring' 'story'
 'traditional' 'written_by' 'site' 'title' 'studio' 'revenues' 'costs'
 'release_dates' 'release_year' 'release_month' 'release_day' 'plot']
---------------------------------------------------------------------------
        length                                        based_on  \
0  100 minutes  The Hundred and One Dalmatians\nby Dodie Smith   
1  103 minutes  The Hundred and One Dalmatians\nby Dodie Smith   

          box_office          budget cinematography        country  \
0  $183.6 million[1]     $85 million  Adrian Biddle  United States   
1  $320.6 million[2]  $75 million[1]  Adrian Biddle  United States   

        director           distributor          editor language  \
0     Kevin Lima  Buena Vista Pictures  Gregory Perler  English   
1  Stephen Herek  Buena Vista

# Clean data

### Replace nulls in text columns with empty string (otherwise sometimes causes error)

In [46]:
movie_list_df['based_on'].replace(to_replace=np.nan, value="", inplace=True)
movie_list_df['cinematography'].replace(to_replace=np.nan, value="", inplace=True)
movie_list_df['country'].replace(to_replace=np.nan, value="", inplace=True)
movie_list_df['director'].replace(to_replace=np.nan, value="", inplace=True)
movie_list_df['distributor'].replace(to_replace=np.nan, value="", inplace=True)
movie_list_df['editor'].replace(to_replace=np.nan, value="", inplace=True)
movie_list_df['language'].replace(to_replace=np.nan, value="", inplace=True)
movie_list_df['language'].replace(to_replace=np.nan, value="", inplace=True)
movie_list_df['music'].replace(to_replace=np.nan, value="", inplace=True)
movie_list_df['narration'].replace(to_replace=np.nan, value="", inplace=True)
movie_list_df['producer'].replace(to_replace=np.nan, value="", inplace=True)
movie_list_df['screenplay'].replace(to_replace=np.nan, value="", inplace=True)
movie_list_df['starring'].replace(to_replace=np.nan, value="", inplace=True)
movie_list_df['story'].replace(to_replace=np.nan, value="", inplace=True)
movie_list_df['written_by'].replace(to_replace=np.nan, value="", inplace=True)
movie_list_df['studio'].replace(to_replace=np.nan, value="", inplace=True)
movie_list_df['plot'].replace(to_replace=np.nan, value="", inplace=True)

### Convert to numeric values

#### Revenue

In [47]:
revenue_arr = []
for index, row in movie_list_df.iterrows():
    try:
        revenue = float(row['revenues'])
    except:
        revenue = np.nan
    revenue_arr.append(revenue)

movie_list_df['revenues_clean'] = revenue_arr
print movie_list_df['revenues_clean']

0       1.836000e+08
1       3.206000e+08
2       6.220000e+07
3       9.800000e+06
4       9.610000e+07
5       3.510000e+07
6       1.810000e+07
7       4.740000e+07
8       3.430000e+07
9       4.380500e+04
10      4.040000e+07
11      9.070000e+07
12      1.547197e+07
13      2.463650e+06
14      2.970000e+07
15      1.441000e+08
16      3.200000e+06
17      9.038321e+07
18      1.500000e+06
19      8.527517e+06
20      2.080000e+07
21      1.739594e+08
22      1.739594e+08
23      4.044301e+07
24      2.878026e+07
25      4.770000e+07
26      2.486000e+08
27      5.342529e+07
28      2.585062e+07
29      9.040000e+07
            ...     
3604    2.120000e+08
3605    4.520000e+07
3606    2.600000e+06
3607    1.624000e+08
3608    2.250000e+06
3609             NaN
3610    1.824000e+08
3611    2.964000e+08
3612    1.195000e+08
3613    2.920000e+07
3614    1.732000e+08
3615    1.680000e+07
3616    2.330000e+07
3617    1.547000e+08
3618    2.360000e+07
3619    6.300000e+06
3620    1.615

#### Length

In [48]:
length_arr = []
for index, row in movie_list_df.iterrows():
    try:
        length = re.sub("[^0-9]+", "", row['length'])
        length = int(length)
    except:
        length = np.nan
    length_arr.append(length)

movie_list_df['length_clean'] = length_arr
print movie_list_df['length_clean']

0        100
1        103
2        104
3         82
4        124
5        921
6       1171
7        122
8       1011
9        941
10       117
11       103
12       119
13       135
14      1171
15       115
16       112
17        93
18       103
19       133
20        90
21        98
22        98
23       107
24      1201
25       901
26       811
27       106
28        94
29        99
        ... 
3604    1231
3605     891
3606    1031
3607    1331
3608     128
3609     207
3610      87
3611    1271
3612    1071
3613     941
3614    1121
3615    1201
3616     921
3617    1281
3618     962
3619      92
3620    1181
3621    1051
3622     991
3623    1031
3624    1121
3625    1081
3626    1211
3627     921
3628     107
3629    1331
3630    1151
3631    1081
3632     102
3633    1284
Name: length_clean, dtype: object


#### Cost

In [49]:
cost_arr = []
for index, row in movie_list_df.iterrows():
    try:
        cost = float(row['costs'])
    except:
        cost = np.nan
    cost_arr.append(cost)

movie_list_df['costs_clean'] = cost_arr
print movie_list_df['costs_clean']

0        85000000.0
1        75000000.0
2        43000000.0
3         3400000.0
4        82000000.0
5        76000000.0
6        57000000.0
7        60000000.0
8         7000000.0
9               NaN
10       60000000.0
11       65000000.0
12       51000000.0
13       10000000.0
14       73000000.0
15       50000000.0
16       14000000.0
17       48000000.0
18         770000.0
19              NaN
20       10000000.0
21       30000000.0
22       30000000.0
23       65000000.0
24        7000000.0
25       15000000.0
26          60000.0
27       35000000.0
28       35000000.0
29       11000000.0
           ...     
3604     35000000.0
3605      5000000.0
3606     10000000.0
3607    108000000.0
3608      2000000.0
3609            NaN
3610     70000000.0
3611    110000000.0
3612    156000000.0
3613     25000000.0
3614     45000000.0
3615      8500000.0
3616      8500000.0
3617     44000000.0
3618      9900000.0
3619     10000000.0
3620     60000000.0
3621     40000000.0
3622      9000000.0


### View results

In [50]:
print movie_list_df.columns.values
print "---------------------------------------------------------------------------"
print movie_list_df[:2]

['length' 'based_on' 'box_office' 'budget' 'cinematography' 'country'
 'director' 'distributor' 'editor' 'language' 'music' 'narration'
 'original_title' 'producer' 'screenplay' 'simplified' 'starring' 'story'
 'traditional' 'written_by' 'site' 'title' 'studio' 'revenues' 'costs'
 'release_dates' 'release_year' 'release_month' 'release_day' 'plot'
 'revenues_clean' 'length_clean' 'costs_clean']
---------------------------------------------------------------------------
        length                                        based_on  \
0  100 minutes  The Hundred and One Dalmatians\nby Dodie Smith   
1  103 minutes  The Hundred and One Dalmatians\nby Dodie Smith   

          box_office          budget cinematography        country  \
0  $183.6 million[1]     $85 million  Adrian Biddle  United States   
1  $320.6 million[2]  $75 million[1]  Adrian Biddle  United States   

        director           distributor          editor language     ...      \
0     Kevin Lima  Buena Vista Picture

# Derive additional features

### Add release week and day of week

In [54]:
week_arr = []
day_of_week_arr = []
for index, row in movie_list_df.iterrows():
    try:
        release_year = int(row['release_year'])
        release_month = int(row['release_month'])
        release_day = int(row['release_day'].split("/")[1])
        release_week = datetime.date(release_year, release_month, release_day).isocalendar()[1]
        release_day_of_week = datetime.datetime.weekday(datetime.datetime.strptime(str(release_year)+"-"+str(release_month)+"-"+str(release_day), "%Y-%m-%d"))
    except:
        release_week = np.nan   
        release_day_of_week = np.nan
    day_of_week_arr.append(release_day_of_week)
    week_arr.append(release_week)

movie_list_df['release_week'] = week_arr
movie_list_df['release_day_of_week'] = day_of_week_arr

print movie_list_df[['release_week', 'release_day_of_week']]

      release_week  release_day_of_week
0             47.0                  2.0
1             48.0                  2.0
2             15.0                  4.0
3              9.0                  2.0
4             46.0                  4.0
5             26.0                  4.0
6             52.0                  0.0
7              NaN                  NaN
8             15.0                  4.0
9             41.0                  4.0
10            34.0                  4.0
11            32.0                  4.0
12             NaN                  NaN
13            40.0                  4.0
14            19.0                  4.0
15             6.0                  4.0
16            39.0                  4.0
17            42.0                  4.0
18            49.0                  6.0
19             4.0                  4.0
20             NaN                  NaN
21            22.0                  4.0
22            22.0                  4.0
23            32.0                  4.0


### Combine writing categories

In [55]:
movie_list_df['writer'] = movie_list_df['screenplay'] + movie_list_df['written_by']
print movie_list_df['writer']

0       Kristen Buckley\nBrian Regan\nBob Tzudiker\nNo...
1                                             John Hughes
2                                          Susannah Grant
3                                                 DJ Pooh
4                 Cormac Wibberley and Marianne Wibberley
5                                        Kenneth Lonergan
6                                               Ted Tally
7                                           Cameron Crowe
8               \n\n\nMary Harron\nGuinevere Turner\n\n\n
9                           Edward Bunker\nJohn Steppling
10                   \n\n\nWayne Beach\nSimon Barry\n\n\n
11                                        Allison Burnett
12          Andrew Scheinman\nAdam Scheinman\nTony Gilroy
13                                              Spike Lee
14               \n\n\nJ. D. Shapiro\nCorey Mandell\n\n\n
15                                             John Hodge
16                                          Jon Bernstein
17            

### Determine if adaptation

In [56]:
movie_list_df['adaptation'] = (movie_list_df['based_on'] == "").astype(int)
print movie_list_df['adaptation']

0       0
1       0
2       1
3       1
4       1
5       0
6       0
7       1
8       0
9       0
10      1
11      1
12      1
13      1
14      0
15      0
16      1
17      0
18      1
19      0
20      1
21      1
22      1
23      0
24      1
25      1
26      1
27      1
28      1
29      1
       ..
3604    0
3605    1
3606    1
3607    0
3608    0
3609    1
3610    1
3611    0
3612    0
3613    1
3614    0
3615    1
3616    0
3617    1
3618    1
3619    0
3620    0
3621    1
3622    0
3623    1
3624    0
3625    0
3626    0
3627    1
3628    1
3629    0
3630    1
3631    0
3632    0
3633    1
Name: adaptation, dtype: int64


### Add cast and crew

#### Actor
1. Retain top n results

2. Generate dummy variables 

3. Combine duplicate columns and merge in to main dataframe


In [60]:
no_top_actor = 20
actor_prefix = "actor_"
actor_col_arr = [actor_prefix + str(i) for i in range(no_top_actor)]
actor_arr = []
actor_master_list = set()
for index, row in movie_list_df.iterrows():
    actor_list = filter(None, row['starring'].split("\r"))[:no_top_actor]
    actor_list = [re.sub('\[[0-9]+\]',"",item) for item in actor_list]
    actor_list = [re.sub('[^A-Za-z0-9\s]+',"",item) for item in actor_list]
    for actor in actor_list:
        actor_master_list.add(actor)
    actor_list_len = len(actor_list)
    if actor_list_len < no_top_actor:
        for i in range(no_top_actor):
            if i >= actor_list_len:
                actor_list.append("") 
    actor_arr.append(actor_list)

actor_arr_tp = np.transpose(actor_arr)

for item in enumerate(actor_col_arr):
    movie_list_df[item[1]] = actor_arr_tp[item[0]]

print movie_list_df[actor_col_arr]

                                                actor_0 actor_1 actor_2  \
0     Glenn Close\nIoan Gruffudd\nAlice Evans\nTim M...                   
1     \n\n\nGlenn Close\nJeff Daniels\nJoely Richard...                   
2     \n\n\nSandra Bullock\nViggo Mortensen\nDominic...                   
3     \n\n\nBrian Hooks\nNBushe Wright\nFaizon Love\...                   
4     \n\n\nArnold Schwarzenegger\nMichael Rapaport\...                   
5     \n\n\nJune Foray\nKeith Scott\nPiper Perabo\nR...                   
6     \n\n\nMatt Damon\nPenlope Cruz\nHenry Thomas\n...                   
7     Billy Crudup\nFrances McDormand\nKate Hudson\n...                   
8     \n\n\nChristian Bale\nWillem Dafoe\nJared Leto...                   
9     Willem Dafoe\nEdward Furlong\nDanny Trejo\nJoh...                   
10    \n\n\nWesley Snipes\nAnne Archer\nMaury Chayki...                   
11    \n\n\nRichard Gere\nWinona Ryder\nAnthony LaPa...                   
12                       

In [61]:
actor_dummy_dup_df = pd.get_dummies(movie_list_df[actor_col_arr])
actor_dummy_dup_col_arr = list(actor_dummy_dup_df.columns.values)

print actor_dummy_dup_col_arr

['actor_0_', 'actor_0_\n\n\nAaron Eckhart\nBill Nighy\nYvonne Strahovski\nMiranda Otto\nSocratis Otto\nJai Courtney\nKevin Grevioux\n\n\n', 'actor_0_\n\n\nAaron Eckhart\nHilary Swank\nDelroy Lindo\nStanley Tucci\nTchky Karyo\nDJ Qualls\nBruce Greenwood\nAlfre Woodard\n\n\n', 'actor_0_\n\n\nAaron Eckhart\nMichelle Rodriguez\nRamon Rodriguez\nBridget Moynahan\nNeYo\nMichael Pea\n\n\n', 'actor_0_\n\n\nAaron Johnson\nChristopher MintzPlasse\nMark Strong\nChlo Grace Moretz\nNicolas Cage\n\n\n', 'actor_0_\n\n\nAaron Paul\nDominic Cooper\nImogen Poots\nRamn Rodrguez\nMichael Keaton\n\n\n', 'actor_0_\n\n\nAaron TaylorJohnson\nChristopher MintzPlasse\nChlo Grace Moretz\nClark Duke\nMorris Chestnut\nJim Carrey\n\n\n', 'actor_0_\n\n\nAaron TaylorJohnson\nKen Watanabe\nElizabeth Olsen\nJuliette Binoche\nSally Hawkins\nDavid Strathairn\nBryan Cranston\n\n\n', 'actor_0_\n\n\nAdam GreavesNeal\nSean Bean\nDavid Bradley\nLee Boardman\nJonathan Bailey\nDavid Burke\n\n\n', 'actor_0_\n\n\nAdam Sandler\nDa

In [62]:
actor_dummy_df = pd.DataFrame()

for unq_actor in actor_master_list:
    unq_actor_col_nme = actor_prefix + unq_actor

    unq_actor_col_arr = [actor_dup_dummy for actor_dup_dummy in actor_dummy_dup_col_arr if unq_actor in actor_dup_dummy]
    actor_dummy_df[unq_actor_col_nme] = actor_dummy_dup_df[unq_actor_col_arr].sum(axis=1)

actor_dummy_col_arr = list(actor_dummy_df.columns.values)

movie_list_df = movie_list_df.join(actor_dummy_df)
print movie_list_df[actor_dummy_col_arr]

      actor_Angelina Jolie\nDan Futterman\nIrrfan Khan\nArchie Panjabi\nWill Patton  \
0                                                   0.0                               
1                                                   0.0                               
2                                                   0.0                               
3                                                   0.0                               
4                                                   0.0                               
5                                                   0.0                               
6                                                   0.0                               
7                                                   0.0                               
8                                                   0.0                               
9                                                   0.0                               
10                                         

In [63]:
#test_actor = 'Chlo'
test_actor = 'Liam Neeson'
test_actor_dup_col_arr = [actor_dup_dummy for actor_dup_dummy in actor_dummy_dup_col_arr if test_actor in actor_dup_dummy]
test_actor_col_arr = [actor_dummy for actor_dummy in actor_dummy_col_arr if test_actor in actor_dummy]
print test_actor_dup_col_arr
print test_actor_col_arr

print actor_dummy_dup_df[test_actor_dup_col_arr]
print actor_dummy_df[test_actor_col_arr]

print [sum(actor_dummy_dup_df[item]) for item in test_actor_dup_col_arr]
print np.sum(actor_dummy_df[test_actor_col_arr])

['actor_0_\n\n\nChristian Bale\nMichael Caine\nLiam Neeson\nKatie Holmes\nGary Oldman\nCillian Murphy\nTom Wilkinson\nRutger Hauer\nKen Watanabe\nMorgan Freeman\n\n\n', 'actor_0_\n\n\nGeorgie Henley\nSkandar Keynes\nWilliam Moseley\nAnna Popplewell\nBen Barnes\nPeter Dinklage\nSergio Castellitto\nEddie Izzard\nLiam Neeson\n\n\n', 'actor_0_\n\n\nGeorgie Henley\nSkandar Keynes\nWilliam Moseley\nAnna Popplewell\nTilda Swinton\nJames McAvoy\nJim Broadbent\nLiam Neeson\n\n\n', 'actor_0_\n\n\nHarrison Ford\nLiam Neeson\nPeter Sarsgaard\n\n\n', 'actor_0_\n\n\nLiam Neeson\nBradley Cooper\nJessica Biel\nQuinton Jackson\nSharlto Copley\nBrian Bloom\nPatrick Wilson\n\n\n', 'actor_0_\n\n\nLiam Neeson\nDan Stevens\nDavid Harbour\nBrian Bradley\nBoyd Holbrook\n\n\n', 'actor_0_\n\n\nLiam Neeson\nDiane Kruger\nJanuary Jones\nAidan Quinn\nFrank Langella\n\n\n', 'actor_0_\n\n\nLiam Neeson\nEwan McGregor\nNatalie Portman\nJake Lloyd\nIan McDiarmid\nAnthony Daniels\nKenny Baker\nPernilla August\nFrank Oz\

#### Cinematographer
1. Retain top n results

2. Generate dummy variables 

3. Combine duplicate columns and merge in to main dataframe


In [None]:
no_top_cinematographer = 1
cinematographer_prefix = "cinematographer_"
cinematographer_col_arr = [cinematographer_prefix + str(i) for i in range(no_top_cinematographer)]
cinematographer_arr = []
cinematographer_master_list = set()
for index, row in movie_list_df.iterrows():
    cinematographer_list = filter(None, row['cinematography'].split("\r"))[:no_top_cinematographer]
    cinematographer_list = [re.sub('\[[0-9]+\]',"",item) for item in cinematographer_list]
    cinematographer_list = [re.sub('[^A-Za-z0-9\s]+',"",item) for item in cinematographer_list]
    for cinematographer in cinematographer_list:
        cinematographer_master_list.add(cinematographer)
    cinematographer_list_len = len(cinematographer_list)
    if cinematographer_list_len < no_top_cinematographer:
        for i in range(no_top_cinematographer):
            if i >= cinematographer_list_len:
                cinematographer_list.append("") 
    cinematographer_arr.append(cinematographer_list)

cinematographer_arr_tp = np.transpose(cinematographer_arr)

for item in enumerate(cinematographer_col_arr):
    movie_list_df[item[1]] = cinematographer_arr_tp[item[0]]

print movie_list_df[cinematographer_col_arr]

In [None]:
cinematographer_dummy_dup_df = pd.get_dummies(movie_list_df[cinematographer_col_arr])
cinematographer_dummy_dup_col_arr = list(cinematographer_dummy_dup_df.columns.values)

print cinematographer_dummy_dup_col_arr

In [None]:
cinematographer_dummy_df = pd.DataFrame()

for unq_cinematographer in cinematographer_master_list:
    unq_cinematographer_col_nme = cinematographer_prefix + unq_cinematographer

    unq_cinematographer_col_arr = [cinematographer_dup_dummy for cinematographer_dup_dummy in cinematographer_dummy_dup_col_arr if unq_cinematographer in cinematographer_dup_dummy]
    cinematographer_dummy_df[unq_cinematographer_col_nme] = cinematographer_dummy_dup_df[unq_cinematographer_col_arr].sum(axis=1)

cinematographer_dummy_col_arr = list(cinematographer_dummy_df.columns.values)

movie_list_df = movie_list_df.join(cinematographer_dummy_df)
print movie_list_df[cinematographer_dummy_col_arr]

#### Director
1. Retain top n results

2. Generate dummy variables 

3. Combine duplicate columns and merge in to main dataframe


In [None]:
no_top_director = 2
director_prefix = "director_"
director_col_arr = [director_prefix + str(i) for i in range(no_top_director)]
director_arr = []
director_master_list = set()
for index, row in movie_list_df.iterrows():
    director_list = filter(None, row['director'].split("\r"))[:no_top_director]
    director_list = [re.sub('\[[0-9]+\]',"",item) for item in director_list]
    director_list = [re.sub('[^A-Za-z0-9\s]+',"",item) for item in director_list]
    for director in director_list:
        director_master_list.add(director)
    director_list_len = len(director_list)
    if director_list_len < no_top_director:
        for i in range(no_top_director):
            if i >= director_list_len:
                director_list.append("") 
    director_arr.append(director_list)

director_arr_tp = np.transpose(director_arr)

for item in enumerate(director_col_arr):
    movie_list_df[item[1]] = director_arr_tp[item[0]]

print movie_list_df[director_col_arr]

In [None]:
director_dummy_dup_df = pd.get_dummies(movie_list_df[director_col_arr])
director_dummy_dup_col_arr = list(director_dummy_dup_df.columns.values)

print director_dummy_dup_col_arr

In [None]:
director_dummy_df = pd.DataFrame()

for unq_director in director_master_list:
    unq_director_col_nme = director_prefix + unq_director

    unq_director_col_arr = [director_dup_dummy for director_dup_dummy in director_dummy_dup_col_arr if unq_director in director_dup_dummy]
    director_dummy_df[unq_director_col_nme] = director_dummy_dup_df[unq_director_col_arr].sum(axis=1)

director_dummy_col_arr = list(director_dummy_df.columns.values)

movie_list_df = movie_list_df.join(director_dummy_df)
print movie_list_df[director_dummy_col_arr]

#### Distributor
1. Retain top n results

2. Generate dummy variables 

3. Combine duplicate columns and merge in to main dataframe


In [None]:
no_top_distributor = 1
distributor_prefix = "distributor_"
distributor_col_arr = [distributor_prefix + str(i) for i in range(no_top_distributor)]
distributor_arr = []
distributor_master_list = set()
for index, row in movie_list_df.iterrows():
    distributor_list = filter(None, row['distributor'].split("\r"))[:no_top_distributor]
    distributor_list = [re.sub('\[[0-9]+\]',"",item) for item in distributor_list]
    distributor_list = [re.sub('[^A-Za-z0-9\s]+',"",item) for item in distributor_list]
    for distributor in distributor_list:
        distributor_master_list.add(distributor)
    distributor_list_len = len(distributor_list)
    if distributor_list_len < no_top_distributor:
        for i in range(no_top_distributor):
            if i >= distributor_list_len:
                distributor_list.append("") 
    distributor_arr.append(distributor_list)

distributor_arr_tp = np.transpose(distributor_arr)

for item in enumerate(distributor_col_arr):
    movie_list_df[item[1]] = distributor_arr_tp[item[0]]

print movie_list_df[distributor_col_arr]

In [None]:
distributor_dummy_dup_df = pd.get_dummies(movie_list_df[distributor_col_arr])
distributor_dummy_dup_col_arr = list(distributor_dummy_dup_df.columns.values)

print distributor_dummy_dup_col_arr

In [None]:
distributor_dummy_df = pd.DataFrame()

for unq_distributor in distributor_master_list:
    unq_distributor_col_nme = distributor_prefix + unq_distributor

    unq_distributor_col_arr = [distributor_dup_dummy for distributor_dup_dummy in distributor_dummy_dup_col_arr if unq_distributor in distributor_dup_dummy]
    distributor_dummy_df[unq_distributor_col_nme] = distributor_dummy_dup_df[unq_distributor_col_arr].sum(axis=1)

distributor_dummy_col_arr = list(distributor_dummy_df.columns.values)

movie_list_df = movie_list_df.join(distributor_dummy_df)
print movie_list_df[distributor_dummy_col_arr]

#### Editor
1. Retain top n results

2. Generate dummy variables 

3. Combine duplicate columns and merge in to main dataframe


In [None]:
no_top_editor = 1
editor_prefix = "editor_"
editor_col_arr = [editor_prefix + str(i) for i in range(no_top_editor)]
editor_arr = []
editor_master_list = set()
for index, row in movie_list_df.iterrows():
    editor_list = filter(None, row['editor'].split("\r"))[:no_top_editor]
    editor_list = [re.sub('\[[0-9]+\]',"",item) for item in editor_list]
    editor_list = [re.sub('[^A-Za-z0-9\s]+',"",item) for item in editor_list]
    for editor in editor_list:
        editor_master_list.add(editor)
    editor_list_len = len(editor_list)
    if editor_list_len < no_top_editor:
        for i in range(no_top_editor):
            if i >= editor_list_len:
                editor_list.append("") 
    editor_arr.append(editor_list)

editor_arr_tp = np.transpose(editor_arr)

for item in enumerate(editor_col_arr):
    movie_list_df[item[1]] = editor_arr_tp[item[0]]

print movie_list_df[editor_col_arr]

In [None]:
editor_dummy_dup_df = pd.get_dummies(movie_list_df[editor_col_arr])
editor_dummy_dup_col_arr = list(editor_dummy_dup_df.columns.values)

print editor_dummy_dup_col_arr

In [None]:
editor_dummy_df = pd.DataFrame()

for unq_editor in editor_master_list:
    unq_editor_col_nme = editor_prefix + unq_editor

    unq_editor_col_arr = [editor_dup_dummy for editor_dup_dummy in editor_dummy_dup_col_arr if unq_editor in editor_dup_dummy]
    editor_dummy_df[unq_editor_col_nme] = editor_dummy_dup_df[unq_editor_col_arr].sum(axis=1)

editor_dummy_col_arr = list(editor_dummy_df.columns.values)

movie_list_df = movie_list_df.join(editor_dummy_df)
print movie_list_df[editor_dummy_col_arr]

#### Music
1. Retain top n results

2. Generate dummy variables 

3. Combine duplicate columns and merge in to main dataframe


In [None]:
no_top_music = 1
music_prefix = "music_"
music_col_arr = [music_prefix + str(i) for i in range(no_top_music)]
music_arr = []
music_master_list = set()
for index, row in movie_list_df.iterrows():
    music_list = filter(None, row['music'].split("\r"))[:no_top_music]
    music_list = [re.sub('\[[0-9]+\]',"",item) for item in music_list]
    music_list = [re.sub('[^A-Za-z0-9\s]+',"",item) for item in music_list]
    for music in music_list:
        music_master_list.add(music)
    music_list_len = len(music_list)
    if music_list_len < no_top_music:
        for i in range(no_top_music):
            if i >= music_list_len:
                music_list.append("") 
    music_arr.append(music_list)

music_arr_tp = np.transpose(music_arr)

for item in enumerate(music_col_arr):
    movie_list_df[item[1]] = music_arr_tp[item[0]]

print movie_list_df[music_col_arr]

In [None]:
music_dummy_dup_df = pd.get_dummies(movie_list_df[music_col_arr])
music_dummy_dup_col_arr = list(music_dummy_dup_df.columns.values)

print music_dummy_dup_col_arr

In [None]:
music_dummy_df = pd.DataFrame()

for unq_music in music_master_list:
    unq_music_col_nme = music_prefix + unq_music

    unq_music_col_arr = [music_dup_dummy for music_dup_dummy in music_dummy_dup_col_arr if unq_music in music_dup_dummy]
    music_dummy_df[unq_music_col_nme] = music_dummy_dup_df[unq_music_col_arr].sum(axis=1)

music_dummy_col_arr = list(music_dummy_df.columns.values)

movie_list_df = movie_list_df.join(music_dummy_df)
print movie_list_df[music_dummy_col_arr]

#### Producer
1. Retain top n results

2. Generate dummy variables 

3. Combine duplicate columns and merge in to main dataframe


In [None]:
no_top_producer = 4
producer_prefix = "producer_"
producer_col_arr = [producer_prefix + str(i) for i in range(no_top_producer)]
producer_arr = []
producer_master_list = set()
for index, row in movie_list_df.iterrows():
    producer_list = filter(None, row['producer'].split("\r"))[:no_top_producer]
    producer_list = [re.sub('\[[0-9]+\]',"",item) for item in producer_list]
    producer_list = [re.sub('[^A-Za-z0-9\s]+',"",item) for item in producer_list]
    for producer in producer_list:
        producer_master_list.add(producer)
    producer_list_len = len(producer_list)
    if producer_list_len < no_top_producer:
        for i in range(no_top_producer):
            if i >= producer_list_len:
                producer_list.append("") 
    producer_arr.append(producer_list)

producer_arr_tp = np.transpose(producer_arr)

for item in enumerate(producer_col_arr):
    movie_list_df[item[1]] = producer_arr_tp[item[0]]

print movie_list_df[producer_col_arr]

In [None]:
producer_dummy_dup_df = pd.get_dummies(movie_list_df[producer_col_arr])
producer_dummy_dup_col_arr = list(producer_dummy_dup_df.columns.values)

print producer_dummy_dup_col_arr

In [None]:
producer_dummy_df = pd.DataFrame()

for unq_producer in producer_master_list:
    unq_producer_col_nme = producer_prefix + unq_producer

    unq_producer_col_arr = [producer_dup_dummy for producer_dup_dummy in producer_dummy_dup_col_arr if unq_producer in producer_dup_dummy]
    producer_dummy_df[unq_producer_col_nme] = producer_dummy_dup_df[unq_producer_col_arr].sum(axis=1)

producer_dummy_col_arr = list(producer_dummy_df.columns.values)

movie_list_df = movie_list_df.join(producer_dummy_df)
print movie_list_df[producer_dummy_col_arr]

#### Writer
1. Retain top n results

2. Generate dummy variables 

3. Combine duplicate columns and merge in to main dataframe


In [None]:
no_top_writer = 3
writer_prefix = "writer_"
writer_col_arr = [writer_prefix + str(i) for i in range(no_top_writer)]
writer_arr = []
writer_master_list = set()
for index, row in movie_list_df.iterrows():
    writer_list = filter(None, row['writer'].split("\r"))[:no_top_writer]
    writer_list = [re.sub('\[[0-9]+\]',"",item) for item in writer_list]
    writer_list = [re.sub('[^A-Za-z0-9\s]+',"",item) for item in writer_list]
    for writer in writer_list:
        writer_master_list.add(writer)
    writer_list_len = len(writer_list)
    if writer_list_len < no_top_writer:
        for i in range(no_top_writer):
            if i >= writer_list_len:
                writer_list.append("") 
    writer_arr.append(writer_list)

writer_arr_tp = np.transpose(writer_arr)

for item in enumerate(writer_col_arr):
    movie_list_df[item[1]] = writer_arr_tp[item[0]]

print movie_list_df[writer_col_arr]

In [None]:
writer_dummy_dup_df = pd.get_dummies(movie_list_df[writer_col_arr])
writer_dummy_dup_col_arr = list(writer_dummy_dup_df.columns.values)

print writer_dummy_dup_col_arr

In [None]:
writer_dummy_df = pd.DataFrame()

for unq_writer in writer_master_list:
    unq_writer_col_nme = writer_prefix + unq_writer

    unq_writer_col_arr = [writer_dup_dummy for writer_dup_dummy in writer_dummy_dup_col_arr if unq_writer in writer_dup_dummy]
    writer_dummy_df[unq_writer_col_nme] = writer_dummy_dup_df[unq_writer_col_arr].sum(axis=1)

writer_dummy_col_arr = list(writer_dummy_df.columns.values)

movie_list_df = movie_list_df.join(writer_dummy_df)
print movie_list_df[writer_dummy_col_arr]

#### Story
1. Retain top n results

2. Generate dummy variables 

3. Combine duplicate columns and merge in to main dataframe


In [None]:
no_top_story = 1
story_prefix = "story_"
story_col_arr = [story_prefix + str(i) for i in range(no_top_story)]
story_arr = []
story_master_list = set()
for index, row in movie_list_df.iterrows():
    story_list = filter(None, row['story'].split("\r"))[:no_top_story]
    story_list = [re.sub('\[[0-9]+\]',"",item) for item in story_list]
    story_list = [re.sub('[^A-Za-z0-9\s]+',"",item) for item in story_list]
    for story in story_list:
        story_master_list.add(story)
    story_list_len = len(story_list)
    if story_list_len < no_top_story:
        for i in range(no_top_story):
            if i >= story_list_len:
                story_list.append("") 
    story_arr.append(story_list)

story_arr_tp = np.transpose(story_arr)

for item in enumerate(story_col_arr):
    movie_list_df[item[1]] = story_arr_tp[item[0]]

print movie_list_df[story_col_arr]

In [None]:
story_dummy_dup_df = pd.get_dummies(movie_list_df[story_col_arr])
story_dummy_dup_col_arr = list(story_dummy_dup_df.columns.values)

print story_dummy_dup_col_arr

In [None]:
story_dummy_df = pd.DataFrame()

for unq_story in story_master_list:
    unq_story_col_nme = story_prefix + unq_story

    unq_story_col_arr = [story_dup_dummy for story_dup_dummy in story_dummy_dup_col_arr if unq_story in story_dup_dummy]
    story_dummy_df[unq_story_col_nme] = story_dummy_dup_df[unq_story_col_arr].sum(axis=1)

story_dummy_col_arr = list(story_dummy_df.columns.values)

movie_list_df = movie_list_df.join(story_dummy_df)
print movie_list_df[story_dummy_col_arr]

#### Studio
1. Retain top n results

2. Generate dummy variables 

3. Combine duplicate columns and merge in to main dataframe


In [None]:
no_top_studio = 5
studio_prefix = "studio_"
studio_col_arr = [studio_prefix + str(i) for i in range(no_top_studio)]
studio_arr = []
studio_master_list = set()
for index, row in movie_list_df.iterrows():
    studio_list = filter(None, row['studio'].split("\r"))[:no_top_studio]
    studio_list = [re.sub('\[[0-9]+\]',"",item) for item in studio_list]
    studio_list = [re.sub('[^A-Za-z0-9\s]+',"",item) for item in studio_list]
    for studio in studio_list:
        studio_master_list.add(studio)
    studio_list_len = len(studio_list)
    if studio_list_len < no_top_studio:
        for i in range(no_top_studio):
            if i >= studio_list_len:
                studio_list.append("") 
    studio_arr.append(studio_list)

studio_arr_tp = np.transpose(studio_arr)

for item in enumerate(studio_col_arr):
    movie_list_df[item[1]] = studio_arr_tp[item[0]]

print movie_list_df[studio_col_arr]

In [None]:
studio_dummy_dup_df = pd.get_dummies(movie_list_df[studio_col_arr])
studio_dummy_dup_col_arr = list(studio_dummy_dup_df.columns.values)

print studio_dummy_dup_col_arr

In [None]:
studio_dummy_df = pd.DataFrame()

for unq_studio in studio_master_list:
    unq_studio_col_nme = studio_prefix + unq_studio

    unq_studio_col_arr = [studio_dup_dummy for studio_dup_dummy in studio_dummy_dup_col_arr if unq_studio in studio_dup_dummy]
    studio_dummy_df[unq_studio_col_nme] = studio_dummy_dup_df[unq_studio_col_arr].sum(axis=1)

studio_dummy_col_arr = list(studio_dummy_df.columns.values)

movie_list_df = movie_list_df.join(studio_dummy_df)
print movie_list_df[studio_dummy_col_arr]

### View results

In [None]:
print movie_list_df.columns.values
print "---------------------------------------------------------------------------"
print movie_list_df[:2]

# Decompose plots into topics using Non-Negative Matrix Factorization (NNMF), Latent Dirichlet Allocation (LDA)

In [None]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
no_word_features = 10000
no_topics = 100

## Clean up plots

In [None]:
print movie_list_df['plot'][1]
print "---------------------------------------------------------------------------"
print movie_list_df['plot'][4]

### Remove special characters

In [None]:
movie_list_df['plot_clean'] = movie_list_df['plot'].replace(to_replace='\[[0-9]+\]', value=" ", regex=True)
movie_list_df['plot_clean'].replace(to_replace='[^A-Za-z0-9]+', value=" ", inplace=True, regex=True)

### Use NLTK to remove proper nouns

### View results

In [None]:
print movie_list_df['plot_clean'][1]
print "---------------------------------------------------------------------------"
print movie_list_df['plot_clean'][4]

In [None]:
print movie_list_df.columns.values
print "---------------------------------------------------------------------------"
print movie_list_df[:2]

## Fit NNMF model

### Vectorize plots for NNMF using tf-idf
Max number of features is number of words for the "bag of words"

In [None]:
nnmf_tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_word_features, stop_words='english')
nnmf_tfidf = nnmf_tfidf_vectorizer.fit_transform(movie_list_df['plot_clean'])
nnmf_tfidf_feature_names = nnmf_tfidf_vectorizer.get_feature_names()
print nnmf_tfidf_feature_names

### Run NNMF model

In [None]:
nnmf_model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(nnmf_tfidf)
nnmf_W = nnmf_model.transform(nnmf_tfidf)
nnmf_H = nnmf_model.components_

### Add full list of NNMF topic scores to dataframe

In [None]:
movie_list_df['nnmf_topic_scores'] = nnmf_W.tolist()

### Add top n NNMF topics to dataframe

In [None]:
no_top_n_nnmf_topics = 5
nnmf_topic_col_arr = ["nnmf_topic_" + str(i) for i in range(no_top_n_nnmf_topics)]
top_n_nnmf_topic_arr = []
for index, row in movie_list_df.iterrows():
    top_n_nnmf_topic_arr.append(np.array(row['nnmf_topic_scores']).argsort()[-1*no_top_n_nnmf_topics:][::-1])

top_n_nnmf_topic_arr = np.transpose(top_n_nnmf_topic_arr)
    
for topic in enumerate(nnmf_topic_col_arr):
    movie_list_df[topic[1]] = top_n_nnmf_topic_arr[topic[0]]

### View results

In [None]:
print movie_list_df.columns.values
print "---------------------------------------------------------------------------"
print movie_list_df[:2]

## Fit LDA Model

### Vectorize plots for LDA using tf

Max number of features is number of words for the "bag of words".

LDA can only use raw term counts for LDA because it is a probabilistic graphical model


In [None]:
lda_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_word_features, stop_words='english')
lda_tf = lda_tf_vectorizer.fit_transform(movie_list_df['plot_clean'])
lda_tf_feature_names = lda_tf_vectorizer.get_feature_names()
print lda_tf_feature_names

### Run LDA model

In [None]:
lda_model = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(lda_tf)
lda_W = lda_model.transform(lda_tf)
lda_H = lda_model.components_

### Add LDA topics to main dataframe

In [None]:
movie_list_df['lda_topic_scores'] = lda_W.tolist()

### Add top n LDA topics to dataframe

In [None]:
no_top_n_lda_topics = 5
lda_topic_col_arr = ["lda_topic_" + str(i) for i in range(no_top_n_lda_topics)]
top_n_lda_topic_arr = []
for index, row in movie_list_df.iterrows():
    top_n_lda_topic_arr.append(np.array(row['lda_topic_scores']).argsort()[-1*no_top_n_lda_topics:][::-1])

top_n_lda_topic_arr = np.transpose(top_n_lda_topic_arr)
    
for topic in enumerate(lda_topic_col_arr):
    movie_list_df[topic[1]] = top_n_lda_topic_arr[topic[0]]

### View results

In [None]:
print movie_list_df.columns.values
print "---------------------------------------------------------------------------"
print movie_list_df[:2]

# Display plot model results
Will display top associated words, top movies for each topic

In [None]:
no_top_words = 100
no_top_documents = 5

def display_topics(H, W, feature_names, titles, plots, no_top_words, no_top_documents):
    for topic_idx, topic in enumerate(H):
        print "Topic %d:" % (topic_idx)
        print " ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]])
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print "\nMovie: " + titles[doc_index]
            print "Plot:\n" + plots[doc_index] + "\n"
        print "---------------------------------------------------------------------------"

### NNMF

In [None]:
display_topics(nnmf_H, nnmf_W, nnmf_tfidf_feature_names, movie_list_df['title'], movie_list_df['plot'], no_top_words, no_top_documents)

### LDA

In [None]:
display_topics(lda_H, lda_W, lda_tf_feature_names, movie_list_df['title'], movie_list_df['plot'], no_top_words, no_top_documents)

# Build revenue prediction model

## Create model input arrays

### Build movie feature data frame 

Use either NNMF or LDA topic as needed

In [None]:
movie_prediction_features = ['costs_clean', 'length_clean', 'release_week', 'release_day_of_week', 'adaptation']
revenue_column = 'revenues_clean'

#movie_prediction_features += nnmf_topic_col_arr
movie_prediction_features += lda_topic_col_arr

movie_prediction_features += actor_dummy_col_arr
movie_prediction_features += cinematographer_dummy_col_arr
movie_prediction_features += director_dummy_col_arr
movie_prediction_features += distributor_dummy_col_arr
movie_prediction_features += editor_dummy_col_arr
movie_prediction_features += music_dummy_col_arr
movie_prediction_features += producer_dummy_col_arr
movie_prediction_features += writer_dummy_col_arr
movie_prediction_features += story_dummy_col_arr
movie_prediction_features += studio_dummy_col_arr

# add revenue here for cleaning below; will be dropped after
movie_prediction_features.append(revenue_column)

movie_feature_df = movie_list_df[movie_prediction_features]
print movie_feature_df.columns.values

### Filter out null values

In [None]:
print "Number of rows in feature dataframe before change = " + str(len(movie_feature_df))

movie_feature_df = movie_feature_df.dropna()

print "Number of rows in feature dataframe after change = " + str(len(movie_feature_df))

### Build numpy arrays for features, revenue

In [None]:
revenue_actl = np.array(movie_feature_df[revenue_column]).flatten()
print revenue_actl

In [None]:
del movie_feature_df[revenue_column]
print movie_feature_df.columns.values

In [None]:
movie_feature_arr = movie_feature_df.as_matrix()
print "Number of rows in feature numpy array = " + str(len(movie_feature_arr))

### View results

In [None]:
print len(revenue_actl)
print revenue_actl

In [None]:
print "Number of features: " + str(len(movie_prediction_features))
print movie_prediction_features[:200]

In [None]:
print movie_feature_arr[:10]

### Split data set into training, validation, and test data sets

In [None]:
training_data, training_revenue = movie_feature_arr[:600], revenue_actl[:600]
validation_data, validation_revenue = movie_feature_arr[601:1000], revenue_actl[601:1000]
test_data, test_revenue = movie_feature_arr[1001:], revenue_actl[1001:]

## Functions to evaluate results

### Squared Error Loss

In [None]:
def GetSquaredErrorLoss(revenue_actl, revenue_pred):
    return sum((revenue_actl - revenue_pred)**2)/(1.0*len(revenue_actl))

In [None]:
print GetSquaredErrorLoss(revenue_actl, revenue_actl)

### Root Mean Squared Logarithmic Error (RMSLE)

In [None]:
def GetRMSLE(revenue_actl, revenue_pred):
    return ((1.0/len(revenue_actl)) * sum((np.log(revenue_pred) - np.log(revenue_actl))**2))**0.5

In [None]:
print GetRMSLE(revenue_actl, revenue_actl)

### Compare predicted and actual revenue

In [None]:
def ComparePredictedResults(revenue_actl, revenue_pred, no_of_rows):

    print "{: >20} {: >20}".format('Actual Revenue', 'Predicted Revenue')
    for i in range(no_of_rows):
        print "{: >20} {: >20}".format(str(revenue_actl[i]), str(revenue_pred[i]))

## Predict results of validation data set using training data

Predict using a variety of models using the following procedure

1. Fit model using "training" data set
2. Predict revenues of validation data set
3. Evaluate results

In [None]:
# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR

### Linear Regression

In [None]:
lm = LinearRegression()
lm.fit(training_data, training_revenue)

In [None]:
lm_validation_revenue_pred = np.round(lm.predict(validation_data))
lm_validation_revenue_pred[lm_validation_revenue_pred < 0] = 0
print lm_validation_revenue_pred

In [None]:
lm_error = GetSquaredErrorLoss(validation_revenue, lm_validation_revenue_pred)
print lm_error

In [None]:
ComparePredictedResults(validation_revenue, lm_validation_revenue_pred, 50)

### K Nearest Neighbors

k = 3

In [None]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(training_data, training_revenue)

In [None]:
knn_validation_revenue_pred = np.round(knn.predict(validation_data))
knn_validation_revenue_pred[knn_validation_revenue_pred < 0] = 0
print knn_validation_revenue_pred

In [None]:
knn_error = GetSquaredErrorLoss(validation_revenue, knn_validation_revenue_pred)
print knn_error

In [None]:
ComparePredictedResults(validation_revenue, knn_validation_revenue_pred, 50)

### Decision Tree

In [None]:
dt = DecisionTreeClassifier(random_state=0)
dt.fit(training_data, training_revenue)

In [None]:
dt_validation_revenue_pred = np.round(dt.predict(validation_data))
dt_validation_revenue_pred[dt_validation_revenue_pred < 0] = 0
print dt_validation_revenue_pred

In [None]:
dt_error = GetSquaredErrorLoss(validation_revenue, dt_validation_revenue_pred)
print dt_error

In [None]:
ComparePredictedResults(validation_revenue, dt_validation_revenue_pred, 50)

### Random Forest Classifier

In [None]:
rnfc = RandomForestClassifier(n_estimators=1000, max_features=1000)
rnfc.fit(training_data, training_revenue)

In [None]:
rnfc_validation_revenue_pred = np.round(rnfc.predict(validation_data))
rnfc_validation_revenue_pred[rnfc_validation_revenue_pred < 0] = 0
print rnfc_validation_revenue_pred

In [None]:
rnfc_error = GetSquaredErrorLoss(validation_revenue, rnfc_validation_revenue_pred)
print rnfc_error

In [None]:
ComparePredictedResults(validation_revenue, rnfc_validation_revenue_pred, 50)

### Decision Tree Regressor

In [None]:
dtr = DecisionTreeRegressor(max_features=1000)
dtr.fit(training_data, training_revenue)

In [None]:
dtr_validation_revenue_pred = np.round(dtr.predict(validation_data))
dtr_validation_revenue_pred[dtr_validation_revenue_pred < 0] = 0
print dtr_validation_revenue_pred

In [None]:
dtr_error = GetSquaredErrorLoss(validation_revenue, dtr_validation_revenue_pred)
print dtr_error

In [None]:
ComparePredictedResults(validation_revenue, dtr_validation_revenue_pred, 50)

### Random Forest Regressor

In [None]:
rnfr = RandomForestRegressor(n_estimators=1000, max_features=1000)
rnfr.fit(training_data, training_revenue)

In [None]:
rnfr_validation_revenue_pred = np.round(rnfr.predict(validation_data))
rnfr_validation_revenue_pred[rnfr_validation_revenue_pred < 0] = 0
print rnfr_validation_revenue_pred

In [None]:
rnfr_error = GetSquaredErrorLoss(validation_revenue, rnfr_validation_revenue_pred)
print rnfr_error

In [None]:
ComparePredictedResults(validation_revenue, rnfr_validation_revenue_pred, 50)

### AdaBoost Regressor

In [None]:
adbr = AdaBoostRegressor(n_estimators=1000)
adbr.fit(training_data, training_revenue)

In [None]:
adbr_validation_revenue_pred = np.round(adbr.predict(validation_data))
adbr_validation_revenue_pred[adbr_validation_revenue_pred < 0] = 0
print adbr_validation_revenue_pred

In [None]:
adbr_error = GetSquaredErrorLoss(validation_revenue, adbr_validation_revenue_pred)
print adbr_error

In [None]:
ComparePredictedResults(validation_revenue, adbr_validation_revenue_pred, 50)

### Bagging Regressor

In [None]:
bgr = BaggingRegressor(n_estimators = 1000)
bgr.fit(training_data, training_revenue)

In [None]:
bgr_validation_revenue_pred = np.round(bgr.predict(validation_data))
bgr_validation_revenue_pred[bgr_validation_revenue_pred < 0] = 0
print bgr_validation_revenue_pred

In [None]:
bgr_error = GetSquaredErrorLoss(validation_revenue, bgr_validation_revenue_pred)
print bgr_error

In [None]:
ComparePredictedResults(validation_revenue, bgr_validation_revenue_pred, 50)

### Gradient Boosting Regressor

In [None]:
gbr = GradientBoostingRegressor(n_estimators = 1000)
gbr.fit(training_data, training_revenue)

In [None]:
gbr_validation_revenue_pred = np.round(gbr.predict(validation_data))
gbr_validation_revenue_pred[gbr_validation_revenue_pred < 0] = 0
print gbr_validation_revenue_pred

In [None]:
gbr_error = GetSquaredErrorLoss(validation_revenue, gbr_validation_revenue_pred)
print gbr_error

In [None]:
ComparePredictedResults(validation_revenue, gbr_validation_revenue_pred, 100)

### Stochastic Gradient Descent Regressor

In [None]:
sgdr = SGDRegressor()
sgdr.fit(training_data, training_revenue)

In [None]:
sgdr_validation_revenue_pred = np.round(sgdr.predict(validation_data))
sgdr_validation_revenue_pred[sgdr_validation_revenue_pred < 0] = 0
print sgdr_validation_revenue_pred

In [None]:
sgdr_error = GetSquaredErrorLoss(validation_revenue, sgdr_validation_revenue_pred)
print sgdr_error

In [None]:
ComparePredictedResults(validation_revenue, sgdr_validation_revenue_pred, 50)

### Support Vector Machine Regressor

In [None]:
svmr = SVR()
svmr.fit(training_data, training_revenue)

In [None]:
svmr_validation_revenue_pred = np.round(svmr.predict(validation_data))
svmr_validation_revenue_pred[svmr_validation_revenue_pred < 0] = 0
print svmr_validation_revenue_pred

In [None]:
svmr_error = GetSquaredErrorLoss(validation_revenue, svmr_validation_revenue_pred)
print svmr_error

In [None]:
ComparePredictedResults(validation_revenue, svmr_validation_revenue_pred, 50)

### Plot comparison of actual vs predicted revenues for best model

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
plt.figure(figsize=(20, 20))    

#plt.plot(range(len(validation_revenue)), validation_revenue, 'o', ms=6, color='blue', label='Actual')
#plt.plot(range(len(gbr_validation_revenue_pred)), gbr_validation_revenue_pred, 'o', ms=6, color='red', label='Predicted')
#plt.plot(range(len(validation_revenue)), abs(validation_revenue - gbr_validation_revenue_pred), 'o', ms=6, color='blue', label='Actual')
data = abs(validation_revenue - gbr_validation_revenue_pred)

plt.hist(data, bins=100)

#plt.xlabel('Movie')
#plt.ylabel('Revenue')
#plt.legend()
plt.show()
plt.savefig('residuals.png')


# Interface Functions

### Get associated topic scores for input plot 

### Get prediction array for input movie  

### Get predicted revenue for input movie 