In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, r2_score
from xgboost import XGBRegressor

# Seed NumPy's legacy global RNG so that NumPy-based randomness is repeatable
# between runs. The XGBoost estimator is seeded separately via random_state.
np.random.seed(42)

In [2]:
# Load the movie dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('df_movies.csv')

In [3]:
# ---------------------------------------------------------------------------
# Feature engineering on the raw movie frame `df`.
# Produces log-scaled engagement features, calendar features, one-hot genre
# flags (g_<Genre>), and a combined `view_score`, then removes outliers,
# the raw source columns, and every row that still contains a missing value.
# ---------------------------------------------------------------------------

# Log-transformed engagement counts. Zeros are masked to NaN first because
# log(0) is -inf; the affected rows are removed by the final dropna().
for col in ('views', 'likes', 'dislikes'):
    df['log_' + col] = np.log(df[col].where(df[col] != 0))

# Calendar features derived from the release date.
date = pd.to_datetime(df["release_date"])
df["weekday"] = date.dt.weekday
df["month"] = date.dt.month
df['log_budget'] = np.log10(df['budget'])

# One-hot encoding of genres.
# The raw column holds a stringified list such as "['Drama', 'Crime']";
# strip the brackets, quotes and spaces and split it into a real list.
df["genres"] = df["genres"].apply(
    lambda raw: raw.strip("[").strip("]").replace("'", "").replace(" ", "").split(",")
)

# Count how many movies carry each genre; the flag columns are created in
# order of decreasing popularity.
all_genres = pd.Series(
    [g for genre_list in df["genres"] for g in genre_list], dtype=object
)
genre_counts = all_genres.value_counts()
pop_gen = pd.DataFrame({'genre': genre_counts.index, 'movies': genre_counts.values})

# Build every flag column in a single assignment per genre. The previous
# element-wise `df[col][idx] = 1` was chained indexing: it raised
# SettingWithCopyWarning and writes nothing under pandas copy-on-write.
for genre in pop_gen["genre"]:
    df["g_" + genre] = (
        df["genres"].apply(lambda genre_list, g=genre: g in genre_list).astype('int64')
    )

# Apply constraints to get rid of outliers. Take an explicit copy so the
# column assignments below operate on an independent frame.
keep = (df['success'] <= 3) & (df['budget'] >= 10000000) & (df['runtime'] > 80)
df = df.loc[keep].copy()

# Combined engagement score: views weighted by the net like ratio and the
# search volume, on a log scale. A non-positive raw score gives NaN or -inf;
# infinities are mapped to NaN so that dropna() removes those rows instead of
# letting a non-finite value reach the scaler/model.
raw_score = (df['views'] * (df['likes'] - df['dislikes'])
             / (df['likes'] + df['dislikes']) * df['search_volume'])
df["view_score"] = np.log(raw_score).replace([np.inf, -np.inf], np.nan)

# Drop identifiers and the raw columns that now exist in transformed form.
df = df.drop(['title', 'genres', 'production_companies', 'production_countries',
              'release_date', 'views', 'likes', 'dislikes', 'budget'], axis=1)
df = df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [4]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,runtime,num_peaks,search_volume,dt_main,dt_trailers,success,log_views,log_likes,log_dislikes,weekday,...,g_Family,g_Mystery,g_Fantasy,g_Animation,g_Music,g_History,g_War,g_Western,g_Documentary,view_score
0,94,1,335,75,0,0.014714,15.67499,9.605587,5.876334,4,...,0,0,0,0,0,0,0,0,0,21.44109
1,109,2,666,124,-70,0.443592,11.823134,6.34388,2.944439,4,...,0,0,0,0,0,0,0,0,0,18.257615
3,109,1,483,134,0,0.864348,15.149918,8.81433,5.762051,0,...,0,0,0,0,0,0,0,0,0,21.235362
4,83,2,544,116,0,0.476076,12.361314,6.399427,3.238678,3,...,0,0,0,0,0,0,0,0,0,18.575425
5,128,2,1058,89,112,0.687179,12.189728,6.4531,3.232121,4,...,0,0,0,0,0,0,0,0,0,19.073989


In [8]:
# Column dtypes and non-null counts of the feature matrix.
# NOTE(review): in the visible notebook X is only assigned in a cell further
# down, so this cell raises NameError on a fresh Restart & Run All unless it
# is moved below that cell.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 396 entries, 0 to 553
Data columns (total 30 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   runtime           396 non-null    int64  
 1   num_peaks         396 non-null    int64  
 2   search_volume     396 non-null    int64  
 3   dt_main           396 non-null    int64  
 4   dt_trailers       396 non-null    int64  
 5   log_views         396 non-null    float64
 6   log_likes         396 non-null    float64
 7   log_dislikes      396 non-null    float64
 8   weekday           396 non-null    int64  
 9   month             396 non-null    int64  
 10  log_budget        396 non-null    float64
 11  g_Drama           396 non-null    int64  
 12  g_Comedy          396 non-null    int64  
 13  g_Thriller        396 non-null    int64  
 14  g_Action          396 non-null    int64  
 15  g_Romance         396 non-null    int64  
 16  g_Adventure       396 non-null    int64  
 1

In [5]:
# Split the frame into the feature matrix X and the regression target y.
# NOTE(review): no hold-out test set is created here; the whole frame is
# passed to the cross-validated grid search below.
X, y = df.drop('success', axis=1), df['success'].copy()

# Grid search to find the best hyper-parameters.
# Keys use the "<step>__<param>" form so they are routed to the "xgb" step
# of the pipeline.
params = {
     "xgb__n_estimators": [100, 300],
     "xgb__reg_alpha": range(5, 20)[::2],
     "xgb__reg_lambda": range(50, 201)[::10]
}

# Scale the features, then fit the gradient-boosted regressor.
steps = [("scale", RobustScaler()),
         ("xgb", XGBRegressor(random_state=42, objective='reg:squarederror'))]
pipeline = Pipeline(steps)

# Candidates are ranked by R^2 on the cross-validation folds.
scorer = make_scorer(r2_score)

clf = GridSearchCV(pipeline, params, scoring=scorer, refit=True)
clf.fit(X, y)

# With refit=True the search has already re-trained the best configuration on
# all of X, y, so best_estimator_ is ready to use and no second fit is needed.
model = clf.best_estimator_
# with open('model.pkl', 'wb') as f:
#     pickle.dump(model, f)

# Show the selected pipeline and its hyper-parameters.
model

Pipeline(steps=[('scale', RobustScaler()),
                ('xgb',
                 XGBRegressor(n_estimators=300, objective='reg:squarederror',
                              random_state=42, reg_alpha=5, reg_lambda=60))])

In [11]:
# Display the feature matrix (pandas truncates long frames in the rendered output).
X

Unnamed: 0,runtime,num_peaks,search_volume,dt_main,dt_trailers,log_views,log_likes,log_dislikes,weekday,month,...,g_Family,g_Mystery,g_Fantasy,g_Animation,g_Music,g_History,g_War,g_Western,g_Documentary,view_score
0,94,1,335,75,0,15.674990,9.605587,5.876334,4,11,...,0,0,0,0,0,0,0,0,0,21.441090
1,109,2,666,124,-70,11.823134,6.343880,2.944439,4,8,...,0,0,0,0,0,0,0,0,0,18.257615
3,109,1,483,134,0,15.149918,8.814330,5.762051,0,3,...,0,0,0,0,0,0,0,0,0,21.235362
4,83,2,544,116,0,12.361314,6.399427,3.238678,3,8,...,0,0,0,0,0,0,0,0,0,18.575425
5,128,2,1058,89,112,12.189728,6.453100,3.232121,4,4,...,0,0,0,0,0,0,0,0,0,19.073989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549,105,2,713,145,0,16.155683,9.841931,6.369901,4,9,...,0,0,0,0,0,0,0,0,0,22.663037
550,94,2,624,75,0,15.554389,8.270013,6.584791,4,12,...,0,0,0,0,0,0,0,0,0,21.615394
551,102,1,445,145,0,14.814711,8.311275,5.416100,4,4,...,0,0,0,0,0,0,0,0,0,20.802094
552,102,3,449,52,0,14.867938,7.861727,7.151485,2,7,...,1,0,0,0,0,0,0,0,0,19.898821


In [10]:
# Feature names, in column order.
X.columns

Index(['runtime', 'num_peaks', 'search_volume', 'dt_main', 'dt_trailers',
       'log_views', 'log_likes', 'log_dislikes', 'weekday', 'month',
       'log_budget', 'g_Drama', 'g_Comedy', 'g_Thriller', 'g_Action',
       'g_Romance', 'g_Adventure', 'g_Crime', 'g_ScienceFiction', 'g_Horror',
       'g_Family', 'g_Mystery', 'g_Fantasy', 'g_Animation', 'g_Music',
       'g_History', 'g_War', 'g_Western', 'g_Documentary', 'view_score'],
      dtype='object')

In [17]:
# Debug print of the first feature row; the Series is wrapped in a list,
# which is why the printed output is enclosed in brackets.
print([X.iloc[0]])

[runtime              94.000000
num_peaks             1.000000
search_volume       335.000000
dt_main              75.000000
dt_trailers           0.000000
log_views            15.674990
log_likes             9.605587
log_dislikes          5.876334
weekday               4.000000
month                11.000000
log_budget            7.255273
g_Drama               1.000000
g_Comedy              0.000000
g_Thriller            1.000000
g_Action              0.000000
g_Romance             0.000000
g_Adventure           1.000000
g_Crime               0.000000
g_ScienceFiction      0.000000
g_Horror              0.000000
g_Family              0.000000
g_Mystery             0.000000
g_Fantasy             0.000000
g_Animation           0.000000
g_Music               0.000000
g_History             0.000000
g_War                 0.000000
g_Western             0.000000
g_Documentary         0.000000
view_score           21.441090
Name: 0, dtype: float64]


In [18]:
# Predict `success` for the first movie. Selecting with a list of positions
# (iloc[[0]]) keeps a one-row DataFrame, so the pipeline receives the same
# column names it was fitted with instead of an unnamed list of values.
model.predict(X.iloc[[0]])[0]

0.4165088