PROJECT DESCRIPTION

Title: Movie Rating Prediction

To develop a model that accurately estimates the rating given to a movie by users or critics


In [127]:
#importing libraries
import pandas as pd
import numpy as np 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.metrics import accuracy_score


In [128]:
def wrangle(filepath,encoding):
    data = pd.read_csv(filepath, encoding= 'latin1')
    data.drop(columns='Name', inplace=True)
    data.drop(columns=['Year','Duration'], inplace=True)

    #Remove duplicate rows from the dataset
    data.drop_duplicates()

    #remove missing rows 
    data.dropna(subset='Rating', inplace= True)
    #remove the votes column
    data.drop(columns= 'Votes', inplace=True)
    
    return data

In [129]:
data = wrangle('IMDb_Movies_India.csv', encoding= 'latin1')
data.head()


Unnamed: 0,Genre,Rating,Director,Actor 1,Actor 2,Actor 3
1,Drama,7.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,"Comedy, Romance",4.4,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,"Comedy, Drama, Musical",4.7,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,"Drama, Romance, War",7.4,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,"Horror, Mystery, Thriller",5.6,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia


In [130]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7919 entries, 1 to 15508
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Genre     7817 non-null   object 
 1   Rating    7919 non-null   float64
 2   Director  7914 non-null   object 
 3   Actor 1   7794 non-null   object 
 4   Actor 2   7719 non-null   object 
 5   Actor 3   7627 non-null   object 
dtypes: float64(1), object(5)
memory usage: 433.1+ KB


In [131]:
#standardize the target variable (Rating)
from sklearn.preprocessing import StandardScaler
scaler =StandardScaler()
numerical_features = ['Rating']
data[numerical_features]= scaler.fit_transform(data[numerical_features])

In [132]:
data['Rating'].unique()

array([ 0.83837812, -1.04337551, -0.82625009,  1.12787868, -0.17487384,
       -1.33287607,  0.259377  ,  0.04225158,  0.47650242, -0.1024987 ,
        0.33175214,  0.9831284 ,  0.54887756,  1.05550354,  0.91075326,
        0.76600298, -1.69475177, -0.60912468, -0.97100037,  0.40412728,
       -1.26050093, -0.75387495,  1.63450465, -0.24724898,  0.69362784,
        0.18700186,  1.34500409, -0.53674954,  1.27262895, -1.98425233,
       -1.83950205,  1.41737923,  1.85163007, -0.4643744 , -1.11575065,
       -0.03012356, -0.89862523,  1.20025382,  0.6212527 , -1.62237663,
       -1.40525121, -0.31962412, -1.18812579, -0.39199926, -1.76712691,
       -2.05662747,  1.56212951,  0.11462672, -1.47762635,  1.48975437,
       -2.27375289, -0.68149982, -2.49087831, -1.55000149, -1.91187719,
       -2.41850317, -2.20137775, -2.34612803, -2.12900261,  1.70687979,
        2.06875549,  1.77925493,  2.50300633,  2.14113063, -2.70800373,
       -2.56325345,  1.92400521,  1.99638035,  2.28588091,  2.72

In [133]:
data.nunique()

Genre        432
Rating        84
Director    3139
Actor 1     2551
Actor 2     2873
Actor 3     3064
dtype: int64

In [134]:
data.isna().sum()

Genre       102
Rating        0
Director      5
Actor 1     125
Actor 2     200
Actor 3     292
dtype: int64

In [135]:
data.dtypes

Genre        object
Rating      float64
Director     object
Actor 1      object
Actor 2      object
Actor 3      object
dtype: object

In [136]:
#creating a new dataset from the previous one 
data_new = data.copy()

In [137]:
#transposing the rows and columns of the created dataset
data_new.head().T

Unnamed: 0,1,3,5,6,8
Genre,Drama,"Comedy, Romance","Comedy, Drama, Musical","Drama, Romance, War","Horror, Mystery, Thriller"
Rating,0.838378,-1.043376,-0.82625,1.127879,-0.174874
Director,Gaurav Bakshi,Ovais Khan,Rahul Rawail,Shoojit Sircar,Allyson Patel
Actor 1,Rasika Dugal,Prateik,Bobby Deol,Jimmy Sheirgill,Yash Dave
Actor 2,Vivek Ghamande,Ishita Raj,Aishwarya Rai Bachchan,Minissha Lamba,Muntazir Ahmad
Actor 3,Arvind Jangid,Siddhant Kapoor,Shammi Kapoor,Yashpal Sharma,Kiran Bhatia


In [138]:
#check if the row 'Actor 1' contains string data 
pd.api.types.is_string_dtype(data_new['Actor 1'])

False

In [139]:
data_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7919 entries, 1 to 15508
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Genre     7817 non-null   object 
 1   Rating    7919 non-null   float64
 2   Director  7914 non-null   object 
 3   Actor 1   7794 non-null   object 
 4   Actor 2   7719 non-null   object 
 5   Actor 3   7627 non-null   object 
dtypes: float64(1), object(5)
memory usage: 433.1+ KB


In [140]:
pd.api.types.is_object_dtype(data_new['Actor 1'])

True

In [141]:
for label, content in data_new.items():
    if pd.api.types.is_object_dtype(content):
        print(label)

Genre
Director
Actor 1
Actor 2
Actor 3


In [142]:
for label, content in data_new.items():
    if pd.api.types.is_object_dtype(content):
        data_new[label] = content.astype('category').cat.as_ordered()

In [143]:
data_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7919 entries, 1 to 15508
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Genre     7817 non-null   category
 1   Rating    7919 non-null   float64 
 2   Director  7914 non-null   category
 3   Actor 1   7794 non-null   category
 4   Actor 2   7719 non-null   category
 5   Actor 3   7627 non-null   category
dtypes: category(5), float64(1)
memory usage: 569.6 KB


In [144]:
for label, content in data_new.items():
    if not pd.api.types.is_numeric_dtype(content):
        # data_new[label + "_is_missing"] = pd.isnull(content)
        data_new[label] = pd.Categorical(content).codes

In [145]:
data_new.head()

Unnamed: 0,Genre,Rating,Director,Actor 1,Actor 2,Actor 3
1,268,0.838378,811,1782,2814,377
3,207,-1.043376,1749,1589,890,2572
5,177,-0.82625,2005,508,85,2449
6,331,1.127879,2643,931,1388,3029
8,367,-0.174874,174,2520,1461,1205


In [146]:
X = data_new.drop('Rating', axis = 1)
y = data_new['Rating']

In [147]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [159]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators = 400)

model.fit(X_train, y_train)

In [160]:
model.score(X_test, y_test)

from sklearn.metrics import *
print(mean_absolute_error(y_test, model.predict(X_test)))
print(r2_score(y_test, model.predict(X_test)))

0.7404256331473902
0.1181453523339


In [150]:
#initiate catboost classifier 
model = CatBoostClassifier(
    #number of boosting rounds 
    iterations= 150,
    learning_rate= 0.1,
    depth = 6,
    loss_function= 'MultiClass',
    verbose = 10
)
model.fit(X_train,y_train)
 


0:	learn: 4.3741455	total: 424ms	remaining: 1m 3s
10:	learn: 4.0281762	total: 4.96s	remaining: 1m 2s
20:	learn: 3.8532976	total: 9.29s	remaining: 57s
30:	learn: 3.7385733	total: 13.7s	remaining: 52.6s
40:	learn: 3.6449382	total: 18.6s	remaining: 49.3s
50:	learn: 3.5650299	total: 23.7s	remaining: 46.1s
60:	learn: 3.4900064	total: 29.1s	remaining: 42.5s
70:	learn: 3.4221903	total: 33.1s	remaining: 36.9s
80:	learn: 3.3643794	total: 36.6s	remaining: 31.2s
90:	learn: 3.3004177	total: 40s	remaining: 25.9s
100:	learn: 3.2411271	total: 43.7s	remaining: 21.2s
110:	learn: 3.1965536	total: 47.2s	remaining: 16.6s
120:	learn: 3.1512651	total: 51s	remaining: 12.2s
130:	learn: 3.1009930	total: 54.2s	remaining: 7.86s
140:	learn: 3.0667131	total: 58.2s	remaining: 3.72s
149:	learn: 3.0300342	total: 1m 1s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1a21473af50>

In [151]:
#make predictions 
y_pred = model.predict(X_test)

In [164]:
from sklearn.metrics import *
mae = mean_absolute_error(y_pred, y_test)
r2 = r2_score(y_pred, y_test)
mae, r2

(0.8169618030817962, -1.5949949977402818)