# MOVIE RATING PREDICTION WITH PYTHON

In [108]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV,KFold 
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error

# pd.set_option("display.max_rows",None)
import warnings
warnings.filterwarnings("ignore")

#### Loading the Dataset

In [109]:
df = pd.read_csv(r"C:\Users\ADMIN\Downloads\codsoft\IMDb Movies India.csv",encoding="latin-1")
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [110]:
df.shape

(15509, 10)

#### Understanding and Pre-Processing the Data

In [111]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


#### Data Cleaning

In [112]:
df.isnull().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [113]:
year = df["Year"][df["Year"].isnull()==False]

In [114]:
year = year.apply(lambda x: int(x.strip("()")))

In [115]:
df.loc[year.index,"Year"] = year

In [116]:
duration = df["Duration"][df["Duration"].isnull()==False]

In [117]:
duration = duration.apply(lambda x: int(x.split(" ")[0]))

In [118]:
df.loc[duration.index,"Duration"] = duration

In [119]:
votes = df["Votes"][df["Votes"].isnull()==False]

In [120]:
def vote(x):
    return int(re.sub("[^0-9]", "",x))

In [121]:
votes = votes.apply(vote)

In [122]:
df.loc[votes.index,"Votes"] = votes

In [123]:
df["Name"] = df["Name"].apply(lambda x: np.nan if x.isspace()==True else x )

In [124]:
def sort(text):
    a = list(text.replace(" ","").split(","))
    a.sort(reverse=True)
    return ",".join(a)

In [125]:
genre = df["Genre"][df["Genre"].isnull()==False]

In [126]:
genre.nunique()

485

In [127]:
genre.apply(sort).nunique()

401

In [128]:
genre = genre.apply(sort)

In [129]:
df.loc[genre.index,"Genre"] = genre

In [130]:
df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,2021,90,"Musical,Drama",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,2019,110,"Romance,Comedy",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,2010,105,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,1988,,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,1999,129,"Drama,Action",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,2005,,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,1988,,Action,,,,,,


In [131]:
df[["Name","Director","Actor 1","Actor 2","Actor 3"]] = df[["Name","Director","Actor 1","Actor 2","Actor 3"]].fillna("Unknown")

In [132]:
mode1 = df["Genre"].mode()[0]

In [133]:
df["Genre"] = df["Genre"].fillna(mode1)

In [134]:
df["Duration"] = df["Duration"].fillna(df["Duration"].median()).astype(int)

In [135]:
df["Year"] = df["Year"].fillna(df["Year"].median().astype(int))

In [136]:
df["Votes"] = df["Votes"].fillna(df["Votes"].median().astype(int))

In [137]:
df.isnull().sum()

Name           0
Year           0
Duration       0
Genre          0
Rating      7590
Votes          0
Director       0
Actor 1        0
Actor 2        0
Actor 3        0
dtype: int64

In [138]:
for i in df.select_dtypes(exclude="O").drop("Rating",axis=1).columns:
    sc = StandardScaler()
    df[i] = sc.fit_transform(df[[i]])

In [139]:
for i in df.select_dtypes(include="O").columns:
    le = LabelEncoder()
    df[i] = le.fit_transform(df[i])

In [140]:
df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,13040,0.154143,0.067731,28,,-0.115272,1926,2250,800,3108
1,0,1.274595,-1.043094,28,7.0,-0.120905,1548,3280,4791,527
2,1,1.354627,-2.002443,158,,-0.115272,5123,3713,2866,3450
3,2,1.274595,-0.992602,227,4.4,-0.117669,3319,2917,1504,4020
4,6,0.914449,-1.245062,28,,-0.115272,385,3112,3462,405
...,...,...,...,...,...,...,...,...,...,...
15504,13832,0.034095,0.067731,0,4.6,-0.120545,2690,2586,4299,4262
15505,13834,0.474272,-0.033253,29,4.5,-0.043359,2499,227,4532,519
15506,13835,0.714369,0.067731,0,,-0.115272,2424,3609,4558,4481
15507,13836,0.034095,0.067731,0,,-0.115272,5549,4388,4558,4481


In [141]:
pred_df = df[df["Rating"].isnull()==True].drop("Rating",axis=1).reset_index(drop=True)
pred_df.head()

Unnamed: 0,Name,Year,Duration,Genre,Votes,Director,Actor 1,Actor 2,Actor 3
0,13040,0.154143,0.067731,28,-0.115272,1926,2250,800,3108
1,1,1.354627,-2.002443,158,-0.115272,5123,3713,2866,3450
2,6,0.914449,-1.245062,28,-0.115272,385,3112,3462,405
3,9,0.834417,-3.567696,23,-0.115272,475,4388,4558,4481
4,12,0.114127,0.067731,316,-0.115272,1741,3659,1100,4481


In [142]:
new_df = df[df["Rating"].isnull()==False].reset_index(drop=True)
new_df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,0,1.274595,-1.043094,28,7.0,-0.120905,1548,3280,4791,527
1,2,1.274595,-0.992602,227,4.4,-0.117669,3319,2917,1504,4020
2,7,0.394240,0.875603,163,4.7,-0.022744,3800,895,123,3829
3,8,0.714369,0.623143,394,7.4,0.008298,4993,1698,2359,4763
4,132,0.994482,-2.406379,362,5.6,-0.082791,305,4655,2482,1851
...,...,...,...,...,...,...,...,...,...,...
7914,13829,0.194159,0.067731,43,5.3,-0.105683,945,1139,2440,1291
7915,13831,0.074111,-0.235221,43,5.8,-0.116590,4416,990,1596,3135
7916,13832,0.034095,0.067731,0,4.6,-0.120545,2690,2586,4299,4262
7917,13834,0.474272,-0.033253,29,4.5,-0.043359,2499,227,4532,519


### Train And Test

In [143]:
x = new_df.drop("Rating",axis=1)
y = new_df["Rating"]

In [144]:
xtrain,xtest,ytrain,ytest= train_test_split(x,y,random_state=10,test_size=0.20)
xtrain.shape,xtest.shape,ytrain.shape,ytest.shape

((6335, 9), (1584, 9), (6335,), (1584,))

In [145]:
col = pd.MultiIndex.from_product([["Train","Test"],["RMSE","MAPE"]])
modelscore = pd.DataFrame(columns=col)
modelscore.index.name = "Model Name"

In [146]:
modelscore

Unnamed: 0_level_0,Train,Train,Test,Test
Unnamed: 0_level_1,RMSE,MAPE,RMSE,MAPE
Model Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2


In [147]:
def score_cal(model_df,model_name,model,x,y):
    xtrain,xtest,ytrain,ytest= train_test_split(x,y,random_state=10,test_size=0.20)
    xtrain,xtest,ytrain,ytest
    model.fit(xtrain,ytrain)
    ypred_train = model.predict(xtrain)
    ypred_test = model.predict(xtest)
    train_rmse = mean_squared_error(ytrain,ypred_train,squared=False)
    train_mape = mean_absolute_percentage_error(ytrain,ypred_train)
    test_rmse = mean_squared_error(ytest,ypred_test,squared=False)
    test_mape = mean_absolute_percentage_error(ytest,ypred_test)
    model_df.loc[model_name,:] = [train_rmse,train_mape,test_rmse,test_mape]
    return model_df

### Linear Regression

In [148]:
lr = LinearRegression()
score_cal(modelscore,"Linear Regression",lr,x,y)

Unnamed: 0_level_0,Train,Train,Test,Test
Unnamed: 0_level_1,RMSE,MAPE,RMSE,MAPE
Model Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Linear Regression,1.333321,0.215261,1.337866,0.223709


### Decision Tree

In [149]:
dt = DecisionTreeRegressor(random_state=10)
score_cal(modelscore,"Decision Tree",dt,x,y)

Unnamed: 0_level_0,Train,Train,Test,Test
Unnamed: 0_level_1,RMSE,MAPE,RMSE,MAPE
Model Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Linear Regression,1.333321,0.215261,1.337866,0.223709
Decision Tree,0.0,0.0,1.632199,0.243349


### Decision Tree-Tuned

In [150]:
params = {"max_depth":range(21),"min_samples_split":range(10)}
dt = DecisionTreeRegressor(random_state=10)
gd = GridSearchCV(dt,params,cv=5)
gd.fit(xtrain,ytrain)

In [151]:
gd.best_params_

{'max_depth': 6, 'min_samples_split': 7}

In [152]:
dt = DecisionTreeRegressor(max_depth=6,min_samples_split=4,random_state=10)
score_cal(modelscore,"Decision Tree-Tuned",dt,x,y)

Unnamed: 0_level_0,Train,Train,Test,Test
Unnamed: 0_level_1,RMSE,MAPE,RMSE,MAPE
Model Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Linear Regression,1.333321,0.215261,1.337866,0.223709
Decision Tree,0.0,0.0,1.632199,0.243349
Decision Tree-Tuned,1.15782,0.179757,1.224525,0.190642


### Random Forest

In [153]:
rf = RandomForestRegressor(random_state=10)
score_cal(modelscore,"Random Forest",rf,x,y)

Unnamed: 0_level_0,Train,Train,Test,Test
Unnamed: 0_level_1,RMSE,MAPE,RMSE,MAPE
Model Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Linear Regression,1.333321,0.215261,1.337866,0.223709
Decision Tree,0.0,0.0,1.632199,0.243349
Decision Tree-Tuned,1.15782,0.179757,1.224525,0.190642
Random Forest,0.425877,0.064823,1.129839,0.178675


### Random Forest-Tuned

In [154]:
params = {"max_depth":range(10),"min_samples_split":range(10)}
rf = RandomForestRegressor(random_state=10)
gd = GridSearchCV(rf,params,cv=5)
gd.fit(xtrain,ytrain)

In [155]:
gd.best_params_

{'max_depth': 9, 'min_samples_split': 9}

In [156]:
rf = RandomForestRegressor(max_depth=9,min_samples_split=9,random_state=10)
score_cal(modelscore,"Random Forest-Tuned",rf,x,y)

Unnamed: 0_level_0,Train,Train,Test,Test
Unnamed: 0_level_1,RMSE,MAPE,RMSE,MAPE
Model Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Linear Regression,1.333321,0.215261,1.337866,0.223709
Decision Tree,0.0,0.0,1.632199,0.243349
Decision Tree-Tuned,1.15782,0.179757,1.224525,0.190642
Random Forest,0.425877,0.064823,1.129839,0.178675
Random Forest-Tuned,0.985633,0.154083,1.138151,0.181507


### AdaBoost

In [157]:
adb = AdaBoostRegressor(random_state=10)
score_cal(modelscore,"AdaBoost",adb,x,y)

Unnamed: 0_level_0,Train,Train,Test,Test
Unnamed: 0_level_1,RMSE,MAPE,RMSE,MAPE
Model Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Linear Regression,1.333321,0.215261,1.337866,0.223709
Decision Tree,0.0,0.0,1.632199,0.243349
Decision Tree-Tuned,1.15782,0.179757,1.224525,0.190642
Random Forest,0.425877,0.064823,1.129839,0.178675
Random Forest-Tuned,0.985633,0.154083,1.138151,0.181507
AdaBoost,1.260206,0.194232,1.265955,0.200293


### Gradient Boosting

In [158]:
gdb = GradientBoostingRegressor(random_state=10)
score_cal(modelscore,"Gradient Boosting",gdb,x,y)

Unnamed: 0_level_0,Train,Train,Test,Test
Unnamed: 0_level_1,RMSE,MAPE,RMSE,MAPE
Model Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Linear Regression,1.333321,0.215261,1.337866,0.223709
Decision Tree,0.0,0.0,1.632199,0.243349
Decision Tree-Tuned,1.15782,0.179757,1.224525,0.190642
Random Forest,0.425877,0.064823,1.129839,0.178675
Random Forest-Tuned,0.985633,0.154083,1.138151,0.181507
AdaBoost,1.260206,0.194232,1.265955,0.200293
Gradient Boosting,1.054568,0.165228,1.118335,0.179109


In [159]:
modelscore.iloc[:,2]-modelscore.iloc[:,0]

Model Name
Linear Regression      0.004545
Decision Tree          1.632199
Decision Tree-Tuned    0.066705
Random Forest          0.703962
Random Forest-Tuned    0.152518
AdaBoost               0.005749
Gradient Boosting      0.063767
dtype: object

In [160]:
modelscore.iloc[:,3]-modelscore.iloc[:,1]

Model Name
Linear Regression      0.008447
Decision Tree          0.243349
Decision Tree-Tuned    0.010884
Random Forest          0.113852
Random Forest-Tuned    0.027425
AdaBoost               0.006061
Gradient Boosting      0.013881
dtype: object

#### From All the  above models Gradient Boosting has given a better result. As we shaw the minimum error in all 3 cases train,test and overall. So we conclude with using Gradient Boosting for future predictions.

In [161]:
pred_df

Unnamed: 0,Name,Year,Duration,Genre,Votes,Director,Actor 1,Actor 2,Actor 3
0,13040,0.154143,0.067731,28,-0.115272,1926,2250,800,3108
1,1,1.354627,-2.002443,158,-0.115272,5123,3713,2866,3450
2,6,0.914449,-1.245062,28,-0.115272,385,3112,3462,405
3,9,0.834417,-3.567696,23,-0.115272,475,4388,4558,4481
4,12,0.114127,0.067731,316,-0.115272,1741,3659,1100,4481
...,...,...,...,...,...,...,...,...,...
7585,13827,0.314208,0.067731,0,-0.115272,5362,1366,3886,1855
7586,13828,0.234175,0.067731,0,-0.115272,849,2563,4706,4481
7587,13830,-0.326051,0.067731,0,-0.115272,1398,3227,1547,4536
7588,13835,0.714369,0.067731,0,-0.115272,2424,3609,4558,4481


In [162]:
rating = gdb.predict(pred_df)

In [163]:
pred_df["Rating"] = rating

In [164]:
pred_df

Unnamed: 0,Name,Year,Duration,Genre,Votes,Director,Actor 1,Actor 2,Actor 3,Rating
0,13040,0.154143,0.067731,28,-0.115272,1926,2250,800,3108,5.829801
1,1,1.354627,-2.002443,158,-0.115272,5123,3713,2866,3450,6.742248
2,6,0.914449,-1.245062,28,-0.115272,385,3112,3462,405,6.193584
3,9,0.834417,-3.567696,23,-0.115272,475,4388,4558,4481,7.408925
4,12,0.114127,0.067731,316,-0.115272,1741,3659,1100,4481,5.225139
...,...,...,...,...,...,...,...,...,...,...
7585,13827,0.314208,0.067731,0,-0.115272,5362,1366,3886,1855,4.914657
7586,13828,0.234175,0.067731,0,-0.115272,849,2563,4706,4481,5.533177
7587,13830,-0.326051,0.067731,0,-0.115272,1398,3227,1547,4536,6.462607
7588,13835,0.714369,0.067731,0,-0.115272,2424,3609,4558,4481,5.599735


#### This is the dataset with the use of  predicted Ratings