In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Initial Analysis to see if foreign success predictions will work

In [141]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
%matplotlib inline

In [136]:
df = pd.DataFrame([
   {
       "x1": 2,
       "genre": "G1,G2"
   },
  {
       "x1": 3,
       "genre": "G1,G3"
   },
   {
       "x1": 4,
       "genre": "G2,G3"
   }])


In [137]:
df

Unnamed: 0,genre,x1
0,"G1,G2",2
1,"G1,G3",3
2,"G2,G3",4


In [142]:
def split_genres(row):
    genres = row['genre'].split(',')
    for genre in genres:
        row[genre] = 1
    return pd.Series(row)

In [143]:
df_new = df.apply(split_genres,axis=1)

In [144]:
df_new

Unnamed: 0,G1,G2,G3,genre,x1
0,1.0,1.0,,"G1,G2",2
1,1.0,,1.0,"G1,G3",3
2,,1.0,1.0,"G2,G3",4


In [260]:
################## Make it this way (the fast way) for the blog post
# load data into list of lists and then put it into pandas
##################

allFiles = glob.glob("/Users/williamcosby/Documents/metis/Project_Luther/movie_*.csv")
movie_data = pd.DataFrame()
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None, header=0)
    list_.append(df)
movie_data_raw = pd.concat(list_)

In [261]:
# movie_data_2015_2016 = pd.read_csv('movie_data_2015_2016.csv')

In [262]:
movie_data_raw.head()

Unnamed: 0,MOVIE_NAME,DOMESTIC_OPENING,FOREIGN_TOTAL,BUDGET,GENRE,RELEASE_DATE
0,Marvel's The Avengers,207438708,896200000,220000000.0,Action / Adventure,2012-05-04 00:00:00
1,The Dark Knight Rises,160887295,636800000,250000000.0,Action Thriller,2012-07-20 00:00:00
2,The Hunger Games,152535747,286384032,78000000.0,Action / Adventure,2012-03-23 00:00:00
3,Skyfall,88364714,804200736,200000000.0,Action,2012-11-09 00:00:00
4,The Hobbit:An Unexpected Journey,84617303,718100000,0.0,Fantasy,2012-12-14 00:00:00


# Set the index to the movie name (dont need it in the analysis)

In [263]:
movie_data_raw = movie_data_raw.set_index(['MOVIE_NAME'])

# Find number of movies that have foreign gross info

In [264]:
# total number of movies
movie_data_raw.shape

(4256, 5)

In [265]:
# movies with foreign earnings information
movie_data_raw[movie_data_raw['FOREIGN_TOTAL']!=0].shape

(1528, 5)

# Subset to get just the movies with foreign earnings information

In [266]:
movie_data = movie_data_raw[movie_data_raw['FOREIGN_TOTAL']!=0]

In [267]:
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1528 entries, Marvel's The Avengers to Capture the Flag
Data columns (total 5 columns):
DOMESTIC_OPENING    1528 non-null int64
FOREIGN_TOTAL       1528 non-null int64
BUDGET              1527 non-null float64
GENRE               1528 non-null object
RELEASE_DATE        1528 non-null object
dtypes: float64(1), int64(2), object(2)
memory usage: 71.6+ KB


# Start some analysis

## Combine the genres so they arent "action / adventure"

In [268]:
def split_genres(row):
    genres=[]
    if '/' in row['GENRE']:
        genres = row['GENRE'].split('/')
    else:
        genres = row['GENRE'].split()
    for genre in genres:
        # need to strip the whitespace off the columns to avoid things like 'Action' and 'Action '
        row[genre.strip()] = 1
    return pd.Series(row)

In [269]:
movie_data_split = movie_data.apply(split_genres,axis=1)

In [270]:
movie_data_split.head(3)

Unnamed: 0_level_0,Action,Adventure,Animation,BUDGET,Comedy,Concert,Crime,DOMESTIC_OPENING,Documentary,Drama,...,Musical,Period,RELEASE_DATE,Romance,Romantic,Sci-Fi,Sports,Thriller,War,Western
MOVIE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Marvel's The Avengers,1.0,1.0,,220000000.0,,,,207438708,,,...,,,2012-05-04 00:00:00,,,,,,,
The Dark Knight Rises,1.0,,,250000000.0,,,,160887295,,,...,,,2012-07-20 00:00:00,,,,,1.0,,
The Hunger Games,1.0,1.0,,78000000.0,,,,152535747,,,...,,,2012-03-23 00:00:00,,,,,,,


## Now fill in the missing values with 0


In [271]:
movie_data = movie_data_split.fillna(0)

## Need to drop the 'GENRE' column now

In [272]:
movie_data.drop(['GENRE'],axis=1,inplace=True)

In [273]:
movie_data.columns

Index([u'Action', u'Adventure', u'Animation', u'BUDGET', u'Comedy', u'Concert',
       u'Crime', u'DOMESTIC_OPENING', u'Documentary', u'Drama',
       u'FOREIGN_TOTAL', u'Family', u'Fantasy', u'Foreign', u'Historical',
       u'Horror', u'IMAX', u'Music', u'Musical', u'Period', u'RELEASE_DATE',
       u'Romance', u'Romantic', u'Sci-Fi', u'Sports', u'Thriller', u'War',
       u'Western'],
      dtype='object')

In [274]:
movie_data.head(2)

Unnamed: 0_level_0,Action,Adventure,Animation,BUDGET,Comedy,Concert,Crime,DOMESTIC_OPENING,Documentary,Drama,...,Musical,Period,RELEASE_DATE,Romance,Romantic,Sci-Fi,Sports,Thriller,War,Western
MOVIE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Marvel's The Avengers,1.0,1.0,0.0,220000000.0,0.0,0.0,0.0,207438708,0.0,0.0,...,0.0,0.0,2012-05-04 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Dark Knight Rises,1.0,0.0,0.0,250000000.0,0.0,0.0,0.0,160887295,0.0,0.0,...,0.0,0.0,2012-07-20 00:00:00,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Will want to encode the dates as 1st quarter, 2nd quarter, 3rd quarter, 4th quarter
#### Do this by first transforming the column into strings "1","2","3","4", then apply pandas .get_dummies to make it one-hot

In [275]:
from pandas import DatetimeIndex
import dateutil.parser

In [276]:
# turn RELEASE_DATE into actual datetime
movie_data['RELEASE_DATE'] = movie_data['RELEASE_DATE'].apply(lambda x: dateutil.parser.parse(x))

In [277]:
movie_data['RELEASE_DATE'].dtype

dtype('<M8[ns]')

In [278]:
movie_data["RELEASE_QUARTER"] = movie_data["RELEASE_DATE"].apply(lambda x: x.quarter)

In [279]:
# now just want release quarter
movie_data.drop(['RELEASE_DATE'],axis=1,inplace=True)

#### Now vectorize release quarter column

In [280]:
movie_data = pd.get_dummies(movie_data,columns=["RELEASE_QUARTER"])

In [281]:
movie_data.head()

Unnamed: 0_level_0,Action,Adventure,Animation,BUDGET,Comedy,Concert,Crime,DOMESTIC_OPENING,Documentary,Drama,...,Romantic,Sci-Fi,Sports,Thriller,War,Western,RELEASE_QUARTER_1,RELEASE_QUARTER_2,RELEASE_QUARTER_3,RELEASE_QUARTER_4
MOVIE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Marvel's The Avengers,1.0,1.0,0.0,220000000.0,0.0,0.0,0.0,207438708,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
The Dark Knight Rises,1.0,0.0,0.0,250000000.0,0.0,0.0,0.0,160887295,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
The Hunger Games,1.0,1.0,0.0,78000000.0,0.0,0.0,0.0,152535747,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Skyfall,1.0,0.0,0.0,200000000.0,0.0,0.0,0.0,88364714,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
The Hobbit:An Unexpected Journey,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84617303,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Check the data for missing values, and other weird things

## Impute what I can into the raw data

### Check Domestic Openings

In [282]:
len(movie_data[movie_data["DOMESTIC_OPENING"] == 0])

0

## Check Budget 

In [283]:
# woah...missing a lot~~
len (movie_data[movie_data['BUDGET'] == 0])

774

In [284]:
movie_no_budget = movie_data[movie_data['BUDGET'] == 0]

In [285]:
movie_no_budget.head(10)

Unnamed: 0_level_0,Action,Adventure,Animation,BUDGET,Comedy,Concert,Crime,DOMESTIC_OPENING,Documentary,Drama,...,Romantic,Sci-Fi,Sports,Thriller,War,Western,RELEASE_QUARTER_1,RELEASE_QUARTER_2,RELEASE_QUARTER_3,RELEASE_QUARTER_4
MOVIE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Hobbit:An Unexpected Journey,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84617303,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Argo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19458109,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
The Campaign,0.0,0.0,0.0,0.0,1.0,0.0,0.0,26588460,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Hope Springs,0.0,0.0,0.0,0.0,1.0,0.0,0.0,14650121,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
The Lucky One,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22518358,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Project X,0.0,0.0,0.0,0.0,1.0,0.0,0.0,21051363,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
The Woman in Black,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20874072,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
The Devil Inside,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33732515,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
The Odd Life of Timothy Green,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10822903,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Beauty and the Beast (3D),0.0,0.0,1.0,0.0,0.0,0.0,0.0,17751905,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


# Note!!!!!! one reason for weird domest/foreign offset is if a movie is released somewhere like korea...it might do super well there but not well in america (which is considered domestic)

# Have option of ignoring films with foreign as the genre

In [286]:
movies_no_domestic = movie_data[movie_data["DOMESTIC_OPENING"] == 0]

In [287]:
movies_no_domestic

Unnamed: 0_level_0,Action,Adventure,Animation,BUDGET,Comedy,Concert,Crime,DOMESTIC_OPENING,Documentary,Drama,...,Romantic,Sci-Fi,Sports,Thriller,War,Western,RELEASE_QUARTER_1,RELEASE_QUARTER_2,RELEASE_QUARTER_3,RELEASE_QUARTER_4
MOVIE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


# Just get rid of entries with no budget info

In [288]:
movie_data = movie_data[movie_data['BUDGET'] != 0]

In [289]:
movie_data.shape

(754, 31)

In [290]:
len(movie_data[movie_data['Foreign'] == 1])

9

# Drop foreign movies because 

In [291]:
movie_data = movie_data[movie_data['Foreign'] == 0]

In [292]:
movie_data.shape

(745, 31)

# Get some plots the data

In [293]:
import sklearn
from sklearn.cross_validation import train_test_split,KFold,cross_val_score
from pandas.tools.plotting import scatter_matrix

### Drop na values (there is apparently 1)

In [294]:
movie_data.dropna(inplace=True)

In [295]:
movie_data.shape

(745, 31)

In [296]:
movie_features,movie_target = movie_data.drop('FOREIGN_TOTAL', axis=1),movie_data['FOREIGN_TOTAL']

## Want to get some plots...write these to a directory
#### currently without the extra actor/director/writer information...

In [298]:
column_names = list(movie_features.columns)

for column in column_names:
    try:
    #     print movie_features[column]
        # create plots of column vs foreign budget

        ## want to rescale budget and domestic fields to make the graphs look better and be 
        ## easier to understand
        if column == 'BUDGET':
            plt.scatter(movie_features[column]/float(1000000),movie_target/float(1000000),alpha=0.2)
        elif column == 'DOMESTIC_OPENING':
            plt.scatter(movie_features[column]/float(1000000),movie_target/float(1000000),alpha=0.2)
        else:
            plt.scatter(movie_features[column],movie_target/float(1000000),alpha=0.2)

        plt.xlabel(column)
        plt.ylabel('foreign gross (millions $)')

        path = '/Users/williamcosby/Documents/metis/Project_Luther/plots/'+str(column)+'.png'
        plt.savefig(path)
        plt.close() 
    except:
        print column
        print movie_features[column]

# Start some modeling
### Split the data into features and targets



In [299]:
from sklearn import linear_model
from sklearn.cross_validation import cross_val_predict, train_test_split

# Training, Testing, and holdout splits

#### nh means "not holdout"

### Make a holdout set from the data

In [349]:
movie_features_nh, movie_features_holdout, movie_target_nh, movie_target_holdout = train_test_split(movie_features,movie_target,
                                                            test_size=0.1)

### Now split the data into the train and test sets (within the non-holdout data) and create the model

In [350]:
movie_features_train, movie_features_test, movie_target_train, movie_target_test = train_test_split(
                                                            movie_features_nh,movie_target_nh,
                                                            test_size=0.3)

### Drop domestic opening

In [351]:
# try dropping domestic opening 
movie_features_train_nodom = movie_features_train.drop(['DOMESTIC_OPENING'],axis=1)
movie_features_test_nodom = movie_features_test.drop(['DOMESTIC_OPENING'],axis=1)

## Train lin reg without domestic opening

In [352]:
lr_nodom = linear_model.LinearRegression()
lr_nodom.fit(movie_features_train_nodom,movie_target_train)
print "vanilla lin reg score: ", lr_nodom.score(movie_features_test_nodom, movie_target_test)

vanilla lin reg score:  0.479908865453


## Train lin reg with domestic opening

In [353]:
lr = linear_model.LinearRegression()
lr.fit(movie_features_train,movie_target_train)
print "vanilla lin reg score: ", lr.score(movie_features_test, movie_target_test)
# sort features by coefficient
sorted_features = sorted(zip(list(movie_features_train.columns),lr.coef_),
                         key=lambda tup: abs(tup[1]),reverse=True) 

for feature in sorted_features:
    print(feature)

vanilla lin reg score:  0.708468586129
('War', -77701393.542315155)
('Historical', 47537688.510640897)
('Western', -36503247.228029147)
('Sports', -28884843.363440741)
('Comedy', -27233351.718280904)
('Period', 26861391.296647333)
('Animation', 26086113.521871332)
('Adventure', -24914351.922636937)
('RELEASE_QUARTER_1', -24801696.900545347)
('Family', 23448963.703135617)
('RELEASE_QUARTER_4', 19490275.850698773)
('Romantic', 13540693.294094682)
('Fantasy', 12302919.158234753)
('Crime', -11416411.942109365)
('Horror', -11179281.083517201)
('Documentary', -8734655.3789676763)
('Sci-Fi', 8653689.1514319591)
('Music', -7465543.4426283697)
('Thriller', -6127581.4844733197)
('RELEASE_QUARTER_2', 5488391.2437234614)
('Action', 4241574.6324213753)
('Musical', 2970132.2945143841)
('Romance', -2530998.5003666393)
('IMAX', -1254609.0890348423)
('Drama', -585263.57451717928)
('RELEASE_QUARTER_3', -176970.21885108948)
('DOMESTIC_OPENING', 3.7656464669853449)
('BUDGET', 0.98224097862839699)
('Concer

# Run on holdout set

In [354]:
# with domestic opening information
lr.score(movie_features_holdout, movie_target_holdout)

0.82025956632474772

In [355]:
# without domestic opening information
movie_features_holdout_nodom = movie_features_holdout.drop(['DOMESTIC_OPENING'],axis=1)
lr_nodom.score(movie_features_holdout_nodom,movie_target_holdout)

0.56729220424970483

# Random Forest

In [356]:
len(movie_features.columns)

30

In [357]:
from sklearn.ensemble import RandomForestRegressor

In [358]:
rf_nodom = RandomForestRegressor(n_estimators=1000,max_features=25)
rf_nodom.fit(movie_features_train_nodom,movie_target_train)
print "Random Forest result with no domestic information: ", rf_nodom.score(movie_features_test_nodom,movie_target_test)

rf = RandomForestRegressor(n_estimators=1000,max_features=25)
rf.fit(movie_features_train,movie_target_train)
print "Random Forest result with domestic information: ", rf.score(movie_features_test,movie_target_test)



Random Forest result with no domestic information:  0.299610490728
Random Forest result with domestic information:  0.669434349823


In [359]:
rf.feature_importances_

array([  5.54414419e-03,   1.05976039e-02,   5.55720092e-03,
         2.79110815e-01,   4.32284676e-03,   0.00000000e+00,
         2.98753371e-04,   6.37453080e-01,   1.31883814e-05,
         1.96536426e-03,   2.58458118e-03,   1.66350857e-03,
         0.00000000e+00,   2.65034710e-03,   2.78510207e-03,
         7.97645333e-07,   3.13897459e-04,   1.13654135e-03,
         1.15101437e-03,   3.29332404e-03,   4.13940050e-04,
         8.39912528e-03,   1.17866931e-05,   1.98211432e-03,
         1.70757471e-04,   2.53655641e-04,   5.72103226e-03,
         6.11657577e-03,   4.17197892e-03,   1.23169227e-02])

# Gradient Boost

In [360]:
from sklearn.ensemble import GradientBoostingRegressor

In [363]:
grad_boost = GradientBoostingRegressor(n_estimators=100,learning_rate=0.1,alpha=0.3,loss='huber')

grad_boost.fit(movie_features_train_nodom,movie_target_train)
grad_boost.score(movie_features_test_nodom,movie_target_test)

0.45241514643437525

In [364]:
grad_boost.score(movie_features_holdout_nodom, movie_target_holdout)

0.61666676026934897

# Grid Searching Parameters

In [368]:
from sklearn import grid_search

## Linear Regression

In [385]:
models = {}
models['lin_reg'] = linear_model.LinearRegression()
models['ridge'] = linear_model.Ridge()
models['lasso'] = linear_model.Lasso(alpha = 2) # ran this normally...alpha =2 kills more features than default
models['elastic'] = linear_model.ElasticNet()
models['elasticCV'] = linear_model.ElasticNetCV()
models['larscv'] = linear_model.LarsCV()
models['lassoCV'] = linear_model.LassoCV()

In [386]:
# and lol remember that dictionaries are NOT ordered
for name,model in models.iteritems():
    model.fit(movie_features_train_nodom,movie_target_train)
    print('Model: '+name)
    print('Score: ' + str(model.score(movie_features_test_nodom,movie_target_test)))
    # sort features by coefficient
#     sorted_features = sorted(zip(movie_features_train_nodom.columns,model.coef_),
#                              key=lambda tup: abs(tup[1]),reverse=True) 
#     # reverse makes it go from positive to negative...doesnt mean importance though~~~~~!!!!!
#     for feature in sorted_features:
#         print(feature)
#     print # get some spacing

Model: ridge
Score: 0.482811459515
Model: elastic
Score: 0.491718417505
Model: lin_reg
Score: 0.479908865453
Model: lassoCV
Score: 0.488099925535
Model: lasso
Score: 0.479908982932
Model: larscv
Score: 0.502916085048
Model: elasticCV
Score: 0.488644763113




In [378]:
lasso = linear_model.Lasso()
# want to see the effect of normalizing and different parameters
# np.logspace selects numbers evenly from -4 to -0.1 (on log scale, goes from 10^4 to 10^0.1)
parameters = {'normalize':(True,False),'alpha':np.logspace(-4,-.1,30)}
grid_searcher = grid_search.GridSearchCV(lasso,parameters)
grid_searcher.fit(movie_features_train_nodom,movie_target_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'normalize': (True, False), 'alpha': array([  1.00000e-04,   1.36296e-04,   1.85766e-04,   2.53191e-04,
         3.45089e-04,   4.70342e-04,   6.41057e-04,   8.73734e-04,
         1.19086e-03,   1.62310e-03,   2.21222e-03,   3.01516e-03,
         4.10954e-03,   5.60113e-03,   7.63411e-03...    1.68883e-01,   2.30181e-01,   3.13727e-01,   4.27597e-01,
         5.82797e-01,   7.94328e-01])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [372]:
grid_searcher.best_params_

{'alpha': 0.79432823472428149, 'normalize': True}

In [374]:
best_lasso_model = grid_searcher.best_estimator_

In [375]:
best_lasso_model.score(movie_features_test_nodom,movie_target_test)

0.47990904000030549

# 