In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import pickle

In [2]:
crew_roi_data = pd.read_csv('../../Sample_Data/For_Modeling/person.averageroi.tsv', sep='\t')

In [3]:
crew_roi_list = crew_roi_data.values.tolist()
crew_roi_dict = {}
for crew_roi in crew_roi_list:
    crew_roi_dict[crew_roi[0]] = float(crew_roi[1])
crew_roi_dict

{'nm0000019': 0.4,
 'nm0000033': 4.016666666666667,
 'nm0000037': 2.8346456692913384,
 'nm0000040': 2.88679301148413,
 'nm0000041': 1.18193164,
 'nm0000076': 0.017371875,
 'nm0000092': 4.977836388888888,
 'nm0000095': 1.2701804863360395,
 'nm0000101': 5.096687227777778,
 'nm0000104': 0.13028013333333333,
 'nm0000106': 0.8718122666666667,
 'nm0000108': 1.2417243166727063,
 'nm0000110': 1.3562651878787877,
 'nm0000114': 0.4425489230769231,
 'nm0000116': 2.0830875269588143,
 'nm0000118': 19.547559438527585,
 'nm0000120': 6.0181163333333325,
 'nm0000123': 0.9971593522702936,
 'nm0000126': 5.969355989878542,
 'nm0000127': 6.041677589885234,
 'nm0000128': 0.18667186666666666,
 'nm0000129': 1.37784555,
 'nm0000131': 1.0980764722222225,
 'nm0000141': 0.19426600000000002,
 'nm0000142': 2.150714227002354,
 'nm0000149': 0.8128798080246913,
 'nm0000154': 3.783474509444445,
 'nm0000158': 5.76495395,
 'nm0000160': 2.799975666666666,
 'nm0000163': 1.6716688181818182,
 'nm0000165': 1.1105531280168492,

In [4]:
filename = '../../Sample_Data/Processed/movie.crews.combined.tsv'
movie_data = pd.read_csv(filename, sep='\t')
movie_data.head()

Unnamed: 0,tconst,directors,writers,persons,ROI_domestic
0,tt0011549,nm0587754,"nm0137414,nm0806061","nm0587754,nm0137414,nm0806061",30.0
1,tt0016641,"nm0629243,nm0102643,nm0127511,nm0169905,nm0002271","nm0908753,nm0558923,nm0933133,nm0385012,nm0129721","nm0629243,nm0102643,nm0127511,nm0169905,nm0002...",2.307692
2,tt0019729,nm0064600,"nm0332539,nm0396876,nm0322299,nm0049898","nm0064600,nm0332539,nm0396876,nm0322299,nm0049898",7.387863
3,tt0024034,nm0045800,"nm0416861,nm0786827,nm0740622,nm0093456","nm0045800,nm0416861,nm0786827,nm0740622,nm0093456",3.275626
4,tt0024548,nm0792514,"nm0922213,nm0857710,nm0109148","nm0792514,nm0922213,nm0857710,nm0109148",10.0


In [5]:
movie_data['directors'] = movie_data['directors'].str.split(',')
movie_data['writers'] = movie_data['writers'].str.split(',')
movie_data.head()

Unnamed: 0,tconst,directors,writers,persons,ROI_domestic
0,tt0011549,[nm0587754],"[nm0137414, nm0806061]","nm0587754,nm0137414,nm0806061",30.0
1,tt0016641,"[nm0629243, nm0102643, nm0127511, nm0169905, n...","[nm0908753, nm0558923, nm0933133, nm0385012, n...","nm0629243,nm0102643,nm0127511,nm0169905,nm0002...",2.307692
2,tt0019729,[nm0064600],"[nm0332539, nm0396876, nm0322299, nm0049898]","nm0064600,nm0332539,nm0396876,nm0322299,nm0049898",7.387863
3,tt0024034,[nm0045800],"[nm0416861, nm0786827, nm0740622, nm0093456]","nm0045800,nm0416861,nm0786827,nm0740622,nm0093456",3.275626
4,tt0024548,[nm0792514],"[nm0922213, nm0857710, nm0109148]","nm0792514,nm0922213,nm0857710,nm0109148",10.0


In [6]:
def set_value(row, assigned_value):
    average_roi_list = []
    for person in row:
        if person in assigned_value:
            average_roi_list.append(assigned_value[person])
        else:
            average_roi_list.append(0)
    return np.mean(average_roi_list)

In [7]:
movie_data['director_mean_roi'] = movie_data['directors'].apply(set_value, args =(crew_roi_dict, ))
movie_data.head()

Unnamed: 0,tconst,directors,writers,persons,ROI_domestic,director_mean_roi
0,tt0011549,[nm0587754],"[nm0137414, nm0806061]","nm0587754,nm0137414,nm0806061",30.0,30.0
1,tt0016641,"[nm0629243, nm0102643, nm0127511, nm0169905, n...","[nm0908753, nm0558923, nm0933133, nm0385012, n...","nm0629243,nm0102643,nm0127511,nm0169905,nm0002...",2.307692,2.307692
2,tt0019729,[nm0064600],"[nm0332539, nm0396876, nm0322299, nm0049898]","nm0064600,nm0332539,nm0396876,nm0322299,nm0049898",7.387863,7.387863
3,tt0024034,[nm0045800],"[nm0416861, nm0786827, nm0740622, nm0093456]","nm0045800,nm0416861,nm0786827,nm0740622,nm0093456",3.275626,3.275626
4,tt0024548,[nm0792514],"[nm0922213, nm0857710, nm0109148]","nm0792514,nm0922213,nm0857710,nm0109148",10.0,10.0


In [8]:
movie_data['writer_mean_roi'] = movie_data['writers'].apply(set_value, args =(crew_roi_dict, ))
movie_data.head()

Unnamed: 0,tconst,directors,writers,persons,ROI_domestic,director_mean_roi,writer_mean_roi
0,tt0011549,[nm0587754],"[nm0137414, nm0806061]","nm0587754,nm0137414,nm0806061",30.0,30.0,30.0
1,tt0016641,"[nm0629243, nm0102643, nm0127511, nm0169905, n...","[nm0908753, nm0558923, nm0933133, nm0385012, n...","nm0629243,nm0102643,nm0127511,nm0169905,nm0002...",2.307692,2.307692,2.56359
2,tt0019729,[nm0064600],"[nm0332539, nm0396876, nm0322299, nm0049898]","nm0064600,nm0332539,nm0396876,nm0322299,nm0049898",7.387863,7.387863,7.387863
3,tt0024034,[nm0045800],"[nm0416861, nm0786827, nm0740622, nm0093456]","nm0045800,nm0416861,nm0786827,nm0740622,nm0093456",3.275626,3.275626,3.275626
4,tt0024548,[nm0792514],"[nm0922213, nm0857710, nm0109148]","nm0792514,nm0922213,nm0857710,nm0109148",10.0,10.0,10.0


In [9]:
# # movie_data['historic_mean_roi'] = np.where((movie_data['ROI_domestic']==movie_data['director_mean_roi']), \
#                                            np.where((movie_data['ROI_domestic']==movie_data['writer_mean_roi']), 0, movie_data['writer_mean_roi']), \
#                                            np.where((movie_data['ROI_domestic']==movie_data['writer_mean_roi']), movie_data['director_mean_roi'], (movie_data['director_mean_roi']+movie_data['writer_mean_roi'])/2))
movie_data['historic_mean_roi'] = np.where((movie_data['ROI_domestic']==movie_data['director_mean_roi']), \
                                           0, \
                                           np.where((movie_data['ROI_domestic']==movie_data['writer_mean_roi']), 0, (movie_data['director_mean_roi']+movie_data['writer_mean_roi'])/2))
movie_data.head()

Unnamed: 0,tconst,directors,writers,persons,ROI_domestic,director_mean_roi,writer_mean_roi,historic_mean_roi
0,tt0011549,[nm0587754],"[nm0137414, nm0806061]","nm0587754,nm0137414,nm0806061",30.0,30.0,30.0,0.0
1,tt0016641,"[nm0629243, nm0102643, nm0127511, nm0169905, n...","[nm0908753, nm0558923, nm0933133, nm0385012, n...","nm0629243,nm0102643,nm0127511,nm0169905,nm0002...",2.307692,2.307692,2.56359,0.0
2,tt0019729,[nm0064600],"[nm0332539, nm0396876, nm0322299, nm0049898]","nm0064600,nm0332539,nm0396876,nm0322299,nm0049898",7.387863,7.387863,7.387863,0.0
3,tt0024034,[nm0045800],"[nm0416861, nm0786827, nm0740622, nm0093456]","nm0045800,nm0416861,nm0786827,nm0740622,nm0093456",3.275626,3.275626,3.275626,0.0
4,tt0024548,[nm0792514],"[nm0922213, nm0857710, nm0109148]","nm0792514,nm0922213,nm0857710,nm0109148",10.0,10.0,10.0,0.0


In [10]:
movie_data.drop(movie_data[movie_data['historic_mean_roi']==0].index,inplace=True)

In [11]:
len(movie_data)

2444

## Combining with genre

In [12]:
filename = '../../Sample_Data/Processed/roi.genresplitted.tsv'
df = pd.read_csv(filename, sep='\t')
df.head()

Unnamed: 0,tconst,name,date,prod_budget,domestic_gross,worldwide_gross,isAdult,startYear,runtimeMinutes,Action,...,Adult,War,Biography,Western,Sport,Music,News,Film-Noir,ROI_domestic,ROI_worldwide
0,tt4154796,Avengers: Endgame,2019,400000000,858373000,2797800564,0,2019,181,1,...,0,0,0,0,0,0,0,0,2.145933,6.994501
1,tt1298650,Pirates of the Caribbean: On Stranger Tides,2011,379000000,241063875,1045663875,0,2011,136,1,...,0,0,0,0,0,0,0,0,0.636052,2.759008
2,tt2395427,Avengers: Age of Ultron,2015,365000000,459005868,1403013963,0,2015,141,1,...,0,0,0,0,0,0,0,0,1.25755,3.843874
3,tt4154756,Avengers: Infinity War,2018,300000000,678815482,2048359754,0,2018,149,1,...,0,0,0,0,0,0,0,0,2.262718,6.827866
4,tt0974015,Justice League,2017,300000000,229024295,655945209,0,2017,120,1,...,0,0,0,0,0,0,0,0,0.763414,2.186484


In [13]:
df.drop(columns=['name', 'date', 'ROI_domestic', 'ROI_worldwide', 'prod_budget', 'domestic_gross', 'worldwide_gross', 'startYear', 'runtimeMinutes'], inplace=True)
movie_data.drop(columns=['directors', 'writers', 'persons', 'historic_mean_roi'], inplace=True)
movie_data = movie_data.join(df.set_index('tconst'), on='tconst', how='left')
movie_data.head()

Unnamed: 0,tconst,ROI_domestic,director_mean_roi,writer_mean_roi,isAdult,Action,Adventure,Drama,Fantasy,Sci-Fi,...,Musical,Documentary,Adult,War,Biography,Western,Sport,Music,News,Film-Noir
7,tt0036628,1.482423,2.554624,1.912688,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8,tt0037884,8.8,6.275,7.958333,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
10,tt0038499,3.4,3.314286,3.586458,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
11,tt0038589,1.39884,2.554624,1.457637,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
14,tt0041866,3.375,2.916093,3.222031,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [14]:
len(movie_data)

2478

In [15]:
movie_data.drop_duplicates()
len(movie_data)

2478

In [33]:
movie_data.to_csv('../../Sample_Data/For_Modeling/movie.persons.roi.genres.tsv', sep='\t', index=False)

## Classification

In [16]:
X = movie_data.iloc[:, 2:].values.tolist()
y = movie_data['ROI_domestic'].values.tolist()

In [17]:
def test_model(X_train, X_test, y_train, y_test, clf, cv_parameter=None):
    best_param = ''
    if cv_parameter != None:
        grid = GridSearchCV(clf, cv_parameter, scoring='recall')
        grid.fit(X_train, y_train)
        best_param = grid.best_params_
        clf = grid.best_estimator_
    else:
        clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('===========================')
    print('The metric of {} with {} is:'.format(clf.__class__.__name__, best_param))
    precision, recall, fscore, w = precision_recall_fscore_support(y_test, y_pred, average='binary')
    print(precision, recall, fscore)
    print('The accuracy of {} is:'.format(clf.__class__.__name__))
    print(accuracy_score(y_test, y_pred))
    print('===========================')
    return {'recall': recall, 'precision':precision, 'f1_score': fscore}, clf

In [18]:
class Model:
    
    def __init__(self, model, cv_grid):
        self.model = model
        self.cv_grid = cv_grid

def test_threshold(X, y, threshold=2):
    y = [int(i<threshold) for i in y]
    print('There are {}% of movies in the dataset with ROI < {}'.format(np.sum(y)/len(y)*100, threshold))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    max_fscore = 0
    best_scores = None
    best_clf = None
    
    clfs = [Model(svm.SVC(), {'kernel': ['linear', 'rbf']}),
            Model(LogisticRegression(), None),
            Model(RandomForestClassifier(), {"min_samples_split":[i*10 for i in range(6,11,1)]})]
    
    for clf in clfs:
        scores, test_clf = test_model(X_train, X_test, y_train, y_test, clf.model, clf.cv_grid)
        if scores['f1_score'] > max_fscore:
            max_fscore = scores['f1_score']
            best_scores = scores
            best_clf = test_clf
    return {'clf': best_clf, 'scores': best_scores}

In [19]:
X = movie_data.iloc[:, 2:].values.tolist()
y = movie_data['ROI_domestic'].values.tolist()
clf = {}

In [20]:
clf[1.25] = test_threshold(X, y, 1.25)

There are 50.48426150121066% of movies in the dataset with ROI < 1.25
The metric of SVC with {'kernel': 'linear'} is:
0.6584615384615384 0.8392156862745098 0.7379310344827585
The accuracy of SVC is:
0.6935483870967742
The metric of LogisticRegression with  is:
0.6523076923076923 0.8313725490196079 0.7310344827586207
The accuracy of LogisticRegression is:
0.6854838709677419


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


The metric of RandomForestClassifier with {'min_samples_split': 60} is:
0.7622950819672131 0.7294117647058823 0.7454909819639278
The accuracy of RandomForestClassifier is:
0.7439516129032258


In [21]:
clf[1.5] = test_threshold(X, y, 1.5)

There are 57.546408393866024% of movies in the dataset with ROI < 1.5
The metric of SVC with {'kernel': 'linear'} is:
0.7026315789473684 0.9206896551724137 0.7970149253731343
The accuracy of SVC is:
0.7258064516129032
The metric of LogisticRegression with  is:
0.6953125 0.9206896551724137 0.7922848664688428
The accuracy of LogisticRegression is:
0.717741935483871


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


The metric of RandomForestClassifier with {'min_samples_split': 60} is:
0.8068965517241379 0.8068965517241379 0.8068965517241379
The accuracy of RandomForestClassifier is:
0.7741935483870968


In [22]:
clf[1.75] = test_threshold(X, y, 1.75)

There are 63.19612590799032% of movies in the dataset with ROI < 1.75
The metric of SVC with {'kernel': 'linear'} is:
0.743142144638404 0.9371069182389937 0.8289290681502086
The accuracy of SVC is:
0.7520161290322581
The metric of LogisticRegression with  is:
0.7274939172749392 0.940251572327044 0.8203017832647462
The accuracy of LogisticRegression is:
0.7358870967741935
The metric of RandomForestClassifier with {'min_samples_split': 60} is:
0.8237082066869301 0.8522012578616353 0.8377125193199383
The accuracy of RandomForestClassifier is:
0.7883064516129032


In [23]:
clf[2] = test_threshold(X, y, 2)

There are 68.48264729620662% of movies in the dataset with ROI < 2
The metric of SVC with {'kernel': 'linear'} is:
0.7780373831775701 0.9568965517241379 0.8582474226804124
The accuracy of SVC is:
0.7782258064516129
The metric of LogisticRegression with  is:
0.7726218097447796 0.9568965517241379 0.8549422336328625
The accuracy of LogisticRegression is:
0.7721774193548387
The metric of RandomForestClassifier with {'min_samples_split': 90} is:
0.8602739726027397 0.9022988505747126 0.8807854137447404
The accuracy of RandomForestClassifier is:
0.8286290322580645


In [24]:
clf[2.25] = test_threshold(X, y, 2.25)

There are 72.88135593220339% of movies in the dataset with ROI < 2.25
The metric of SVC with {'kernel': 'linear'} is:
0.7825112107623319 0.9614325068870524 0.8627935723114958
The accuracy of SVC is:
0.7762096774193549
The metric of LogisticRegression with  is:
0.7878103837471784 0.9614325068870524 0.8660049627791564
The accuracy of LogisticRegression is:
0.782258064516129


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


The metric of RandomForestClassifier with {'min_samples_split': 80} is:
0.8608247422680413 0.9201101928374655 0.88948069241012
The accuracy of RandomForestClassifier is:
0.8326612903225806


In [32]:
pickle.dump([movie_data.columns[2:], clf[2]], open("../../Sample_Data/For_Modeling/clf_model", "wb"))

In [31]:
genres = movie_data.columns[2:][2:]
with open("../../Sample_Data/For_Modeling/genres_list_html.tsv", 'w') as f:
    for genre in genres:
        f.write('{}\n'.format(genre))

## Regression

In [26]:
X = movie_data.loc[:, ['director_mean_roi', 'writer_mean_roi']].values.tolist()
y = movie_data['ROI_domestic'].values.tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
reg = LinearRegression().fit(X_train, y_train)
print('{}\'s score is {}'.format(reg.__class__.__name__, reg.score(X_train, y_train)))
print('{}\'s coefficient is {}'.format(reg.__class__.__name__, reg.coef_))

LinearRegression's score is 0.4924121449577028
LinearRegression's coefficient is [0.26686152 1.35772245]


In [28]:
cv_parameter = {'C':[1, 5, 10, 50], 'kernel': ['linear', 'poly', 'rbf']}
reg = SVR()
grid = GridSearchCV(reg, cv_parameter)
grid.fit(X_train, y_train)
print(grid.best_params_)
model = grid.best_estimator_
print('{}\'s score is {}'.format(model.__class__.__name__, model.score(X_train, y_train)))
print('{}\'s coefficient is {}'.format(model.__class__.__name__, model.coef_))

{'C': 1, 'kernel': 'linear'}
SVR's score is 0.46314905479747037
SVR's coefficient is [[-0.01921997  1.22721272]]


In [29]:
cv_parameter = {"min_samples_split":[i*10 for i in range(7,10)]}
reg = RandomForestRegressor()
grid = GridSearchCV(reg, cv_parameter)
grid.fit(X_train, y_train)
print(grid.best_params_)
model = grid.best_estimator_
print('{}\'s score is {}'.format(model.__class__.__name__, model.score(X_train, y_train)))
try:
    print('{}\'s coefficient is {}'.format(model.__class__.__name__, model.coef_))
except:
    print('{} does not have coefficients'.format(model.__class__.__name__))

{'min_samples_split': 70}
RandomForestRegressor's score is 0.4928558242440321
RandomForestRegressor does not have coefficients
