In [13]:
#Jia Yi (Susan) Wang
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import re
#import seaborn
#seaborn.set()

from sklearn.model_selection import train_test_split
#from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
#from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer

""""from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC"""

"""from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier"""

#from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
""""from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor"""
from sklearn.linear_model import LinearRegression

#from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import TruncatedSVD #PCA for sparse values
from sklearn.ensemble import GradientBoostingClassifier

# Getting the data we need

In [14]:
genres = pd.read_json('../data/genres.json.gz', orient='record', lines=True, encoding='utf-8')
omdb_data = pd.read_json('../data/omdb-data.json.gz', orient='record', lines=True, encoding='utf-8')
rt_data = pd.read_json('../data/rotten-tomatoes.json.gz', orient='record', lines=True, encoding='utf-8')
wd_data = pd.read_json('../data/wikidata-movies.json.gz', orient='record', lines=True, encoding='utf-8')

In [15]:
omdb_data=omdb_data.explode('omdb_genres')
omdb_data=omdb_data[omdb_data.omdb_awards.notnull()]
omdb_data=omdb_data[omdb_data.omdb_awards!='N\A']

nominations_re = re.compile(r'Nominated for (\d+)')
nominations_re2 = re.compile(r'(\d+) nomination(s?)')
wins_re = re.compile(r'(\d+) win(s?)')
wins_re2 = re.compile(r'Won (\d+)')

def get_wins(txt):
    wins = 0
    wins1 = wins_re.search(txt)
    wins2 = wins_re2.search(txt)
    if wins1:
        wins = wins + int(wins1.group(1))
    if wins2:
        wins = wins + int(wins2.group(1))
    return wins

def get_nominations(txt):
    noms = 0
    nominations1 = nominations_re.search(txt)
    nominations2 = nominations_re2.search(txt)
    if nominations1:
        noms = noms + int(nominations1.group(1))
    if nominations2:
        noms = noms + int(nominations2.group(1))
    return noms

omdb_data['nominations'] = omdb_data['omdb_awards'].apply(get_nominations)
omdb_data['wins'] = omdb_data['omdb_awards'].apply(get_wins)
    

In [16]:
wd_data = wd_data.explode('cast_member')
movies_played=wd_data[['cast_member','wikidata_id']].groupby('cast_member').count().reset_index().rename(columns={'wikidata_id':'movies_in'})
wd_data_stars = movies_played[movies_played.movies_in>=10]
wd_data_stars = wd_data_stars.set_index('cast_member').join(wd_data.set_index('cast_member'),on='cast_member').reset_index()

In [17]:
joined = omdb_data.set_index('imdb_id').join(rt_data.set_index('imdb_id'),on='imdb_id')
joined = joined.join(wd_data_stars.set_index('imdb_id'), lsuffix='_joined', rsuffix='_wd', on='imdb_id')

# Defining our Features and Label and filtering data

In [18]:
info = joined.reset_index()
info = info[['audience_average', 'omdb_genres', 'cast_member', 'wins', 'nominations', 'country_of_origin']]
info = info[info.audience_average.notnull()]
info = info[info.cast_member.notnull()]
info = info[info.omdb_genres.notnull()]
info = info[info.omdb_genres != 'N/A']
info = info[info.country_of_origin.notnull()]
info = info[info.country_of_origin != 'N/A']

# Encoding categorical values

In [19]:
#One hot encoder example: https://medium.com/@contactsunny/label-encoder-vs-one-hot-encoder-in-machine-learning-3fc273365621
#Column Transforer source: https://datascience.stackexchange.com/questions/41113/deprecationwarning-the-categorical-features-keyword-is-deprecated-in-version

#Encoding the genres to binary

ct = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), [0,1,4])
    ],
    remainder='passthrough' 
) #randomforestregressor - try this

#X = ct.fit_transform(X)

# GradientBoostingRegessor model for predicting audience average

In [20]:
model = Pipeline(
        #steps=[('col_trans',ct), ('lr', LinearRegression(fit_intercept=False))]
        steps=[
            ('col_trans',ct), 
            ('pca', TruncatedSVD(2)), 
            ('gfr', GradientBoostingRegressor(max_depth=5, n_estimators=100, min_samples_leaf=10))
        ]
    )

X=info.drop(columns=['audience_average'],axis=1)
y=info['audience_average']

X_train, X_valid, y_train, y_valid = train_test_split(X,y)

model.fit(X_train, y_train)

print(model.score(X_train,y_train))
print(model.score(X_valid,y_valid)) 

#For actors that starred in >1 movie:
#m_d=5: train=0.3398654, valid=0.33893729 15.3 s
#m_d=6: train=0.37, valid=0.36 20.3s
#m_d=7: train=0.40, valid=0.39 23.3s
#m_d=8: train=0.42, valid=0.40 30.7s
#m_d=9: train=0.45, valid=0.43 34.1s
#m_d=10 : train=0.46, valid=0.44 44.4s
#m_d=15 : train=0.53, valid=0.47 1min37s
#m_d=20 : train=0.55, valid=0.47 2min55s
#m_d=30 : train=0.57, valid=0.46 4min48s

0.3592728220649418
0.3517942806093531


# Same model for predicting critic average

In [21]:
info = joined.reset_index()
info = info[['critic_average', 'omdb_genres', 'cast_member', 'wins', 'nominations', 'country_of_origin']]
info = info[info.critic_average.notnull()]
info = info[info.cast_member.notnull()]
info = info[info.omdb_genres.notnull()]
info = info[info.omdb_genres != 'N/A']
info = info[info.country_of_origin.notnull()]
info = info[info.country_of_origin != 'N/A']

ct = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), [0,1,4])
    ],
    remainder='passthrough' 
)

model = Pipeline(
        #steps=[('col_trans',ct), ('lr', LinearRegression(fit_intercept=False))]
        steps=[
            ('col_trans',ct), 
            ('pca', TruncatedSVD(2)), 
            ('gfr', GradientBoostingRegressor(max_depth=5, n_estimators=100, min_samples_leaf=10))
        ]
    )

X=info.drop(columns=['critic_average'],axis=1)
y=info['critic_average']

X_train, X_valid, y_train, y_valid = train_test_split(X,y)

model.fit(X_train, y_train)

print(model.score(X_train,y_train))
print(model.score(X_valid,y_valid)) 



0.41721040718106905
0.4139857008989343


# Same model for predicting audience percent

In [22]:
info = joined.reset_index()
info = info[['audience_percent', 'omdb_genres', 'cast_member', 'wins', 'nominations', 'country_of_origin']]
info = info[info.audience_percent.notnull()]
info = info[info.cast_member.notnull()]
info = info[info.omdb_genres.notnull()]
info = info[info.omdb_genres != 'N/A']
info = info[info.country_of_origin.notnull()]
info = info[info.country_of_origin != 'N/A']

ct = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), [0,1,4])
    ],
    remainder='passthrough' 
)

model = Pipeline(
        #steps=[('col_trans',ct), ('lr', LinearRegression(fit_intercept=False))]
        steps=[
            ('col_trans',ct), 
            ('pca', TruncatedSVD(2)), 
            ('gfr', GradientBoostingRegressor(max_depth=5, n_estimators=100, min_samples_leaf=10))
        ]
    )

X=info.drop(columns=['audience_percent'],axis=1)
y=info['audience_percent']

X_train, X_valid, y_train, y_valid = train_test_split(X,y)

model.fit(X_train, y_train)

print(model.score(X_train,y_train))
print(model.score(X_valid,y_valid)) 

0.3876440710817455
0.38018352115317644


# Same model for predicting critic percent

In [23]:
info = joined.reset_index()
info = info[['critic_percent', 'omdb_genres', 'cast_member', 'wins', 'nominations', 'country_of_origin']]
info = info[info.critic_percent.notnull()]
info = info[info.cast_member.notnull()]
info = info[info.omdb_genres.notnull()]
info = info[info.omdb_genres != 'N/A']
info = info[info.country_of_origin.notnull()]
info = info[info.country_of_origin != 'N/A']

ct = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), [0,1,4])
    ],
    remainder='passthrough' 
)

model = Pipeline(
        #steps=[('col_trans',ct), ('lr', LinearRegression(fit_intercept=False))]
        steps=[
            ('col_trans',ct), 
            ('pca', TruncatedSVD(2)), 
            ('gfr', GradientBoostingRegressor(max_depth=5, n_estimators=100, min_samples_leaf=10))
        ]
    )

X=info.drop(columns=['critic_percent'],axis=1)
y=info['critic_percent']

X_train, X_valid, y_train, y_valid = train_test_split(X,y)

model.fit(X_train, y_train)

print(model.score(X_train,y_train))
print(model.score(X_valid,y_valid)) 

0.3523301274375591
0.3415297391033797


# Same model for predicting if a movie profits

In [24]:
info = joined.reset_index()
info = info[['made_profit', 'omdb_genres', 'cast_member', 'wins', 'nominations', 'country_of_origin']]
info = info[info.made_profit.notnull()]
info = info[info.cast_member.notnull()]
info = info[info.omdb_genres.notnull()]
info = info[info.omdb_genres != 'N/A']
info = info[info.country_of_origin.notnull()]
info = info[info.country_of_origin != 'N/A']

ct = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), [0,1,4])
    ],
    remainder='passthrough' 
)

model = Pipeline(
        #steps=[('col_trans',ct), ('lr', LinearRegression(fit_intercept=False))]
        steps=[
            ('col_trans',ct), 
            ('pca', TruncatedSVD(2)), 
            ('gfr', GradientBoostingClassifier(max_depth=5, n_estimators=100, min_samples_leaf=10))
        ]
    )

X=info.drop(columns=['made_profit'],axis=1)
y=info['made_profit']

X_train, X_valid, y_train, y_valid = train_test_split(X,y)

model.fit(X_train, y_train)

print(model.score(X_train,y_train))
print(model.score(X_valid,y_valid)) 

0.946959214601041
0.9422810832180273
