IMPORTING THE LIBRARIES

In [165]:
import pandas as pd  
from sklearn.model_selection import train_test_split, cross_val_score  # train_test_split splits the data into training and testing sets. cross_val_score performs cross-validation.
from sklearn.preprocessing import OneHotEncoder  # OneHotEncoder is used to convert categorical variables into a format that can be provided to ML algorithms to do a better job in prediction.
from sklearn.compose import ColumnTransformer  # ColumnTransformer allows for the preprocessing of different types of data (e.g., numerical and categorical) separately.
from sklearn.pipeline import Pipeline  # Pipeline helps in chaining multiple steps into one, making the workflow cleaner and easier to manage.
from sklearn.impute import SimpleImputer  # SimpleImputer is used to handle missing values, providing strategies for imputing them.
from sklearn.linear_model import LinearRegression  # LinearRegression is a linear model for regression tasks.
from sklearn.metrics import mean_squared_error, r2_score  # mean_squared_error measures the average squared difference between observed and predicted values. r2_score provides the coefficient of determination, indicating how well predictions match actual data.


LOAD THE DATASET

In [166]:
# Load the dataset
df = pd.read_csv('C:/Users/Admin/Downloads/ENCRYPTIX/MOVIE RATING PREDICTION USING PYTHON/IMDb Movies India.csv',encoding='latin1')

In [167]:
# Display the DataFrame
df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,(1988),,Action,,,,,,


In [168]:
# Display the first few rows of the dataset 
display(df.head())

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [181]:
# Display the last few rows of the dataset
# This can help ensure that the dataset was loaded correctly and provide a glimpse of the data
display(df.tail())

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,1997
15501,Zulm Ki Hukumat,(1992),Votes,"Action, Crime, Drama",5.3,135,Bharat Rangachary,Dharmendra,Moushumi Chatterjee,Govinda,
15503,Zulm Ki Zanjeer,(1989),125 min,"Action, Crime, Drama",5.8,44,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth,
15504,Zulm Ko Jala Doonga,(1988),Votes,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand,
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani,
15508,Zulm-O-Sitam,(1998),130 min,"Action, Drama",6.2,20,K.C. Bokadia,Dharmendra,Jaya Prada,Arjun Sarja,


In [170]:
# Display the shape of the dataset
# This helps us understand the dimensions of the dataset, i.e., number of rows and columns
display(df.shape)

(15509, 10)

In [171]:
# Display basic statistics of the dataset
display(df.describe())

Unnamed: 0,Rating
count,7919.0
mean,5.841621
std,1.381777
min,1.1
25%,4.9
50%,6.0
75%,6.8
max,10.0


In [172]:
# Display information about the dataset
display(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


None

HANDLE MISSING VALUES

In [173]:
# Drop rows where the target variable is missing
df = df.dropna(subset=['Rating'])

# Fill other missing values (in this case, using a placeholder)
df = df.fillna('Votes')

# Verify the changes
display(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 7919 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      7919 non-null   object 
 1   Year      7919 non-null   object 
 2   Duration  7919 non-null   object 
 3   Genre     7919 non-null   object 
 4   Rating    7919 non-null   float64
 5   Votes     7919 non-null   object 
 6   Director  7919 non-null   object 
 7   Actor 1   7919 non-null   object 
 8   Actor 2   7919 non-null   object 
 9   Actor 3   7919 non-null   object 
dtypes: float64(1), object(9)
memory usage: 680.5+ KB


None

FEATURE ENGINEERING

In [182]:
# Extract the year from the 'Name' column using a regular expression
df['1997'] = df['Name'].str.extract(r'\((\d{4})\)', expand=False)

# Convert the extracted year to numeric format
# If there are any errors during conversion, coerce them to NaN
df['1997'] = pd.to_numeric(df['1997'], errors='coerce')

# Display the DataFrame with the new '1997' column
df


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,1997
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,
5,...Aur Pyaar Ho Gaya,(1997),147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,
6,...Yahaan,(2005),142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma,
8,?: A Question Mark,(2012),82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia,
...,...,...,...,...,...,...,...,...,...,...,...
15501,Zulm Ki Hukumat,(1992),Votes,"Action, Crime, Drama",5.3,135,Bharat Rangachary,Dharmendra,Moushumi Chatterjee,Govinda,
15503,Zulm Ki Zanjeer,(1989),125 min,"Action, Crime, Drama",5.8,44,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth,
15504,Zulm Ko Jala Doonga,(1988),Votes,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand,
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani,


In [176]:
# Select relevant features for the model
X = df[['Genre', 'Director' , 'Votes']]
y = df['Rating']

In [177]:
# Define the features (independent variables) for the model
X

Unnamed: 0,Genre,Director,Votes
1,Drama,Gaurav Bakshi,8
3,"Comedy, Romance",Ovais Khan,35
5,"Comedy, Drama, Musical",Rahul Rawail,827
6,"Drama, Romance, War",Shoojit Sircar,1086
8,"Horror, Mystery, Thriller",Allyson Patel,326
...,...,...,...
15501,"Action, Crime, Drama",Bharat Rangachary,135
15503,"Action, Crime, Drama",S.P. Muthuraman,44
15504,Action,Mahendra Shah,11
15505,"Action, Drama",Kuku Kohli,655


In [183]:
# Define the target variable (dependent variable) for the model
y

1        7.0
3        4.4
5        4.7
6        7.4
8        5.6
        ... 
15501    5.3
15503    5.8
15504    4.6
15505    4.5
15508    6.2
Name: Rating, Length: 7919, dtype: float64

SPLIT DATA INTO TRAINING AND TESTING SETS

In [180]:
#Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the split
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (6335, 3)
X_test shape: (1584, 3)
y_train shape: (6335,)
y_test shape: (1584,)


PREPROCESS THE DATA

In [113]:
# Preprocess the data (OneHotEncoding for categorical features)
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical,', OneHotEncoder(handle_unknown='ignore'), ['Genre', 'Director', 'Votes'])
    ]
)

In [114]:
# Verify the preprocessor
preprocessor

CREATE A PIPELINE WITH PREPROCESSING AND REGRESSION

In [99]:
#Create a pipeline with preprocessing and regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [115]:
# Verify the preprocessor
pipeline

TRAIN THE MODEL

In [101]:
# Train the model
pipeline.fit(X_train, y_train)

PREDICT RATINGS ON THE TEST SET

In [102]:
# Predict the ratings on the test set
y_pred = pipeline.predict(X_test)

In [116]:
# Display the predictions
y_pred

array([5.42573703, 6.90675976, 4.68718943, ..., 5.05611094, 5.75663606,
       6.0248373 ])

EVALUATE THE MODEL

In [104]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [105]:
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 2.190774555006216
R^2 Score: -0.17837736662618253


In [164]:
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
cv_rmse_scores = (cv_scores * -1) ** 0.5
print("Cross-Validation RMSE Scores:", cv_rmse_scores)

Cross-Validation RMSE Scores: [1.44105571 1.5172473  1.54388358 1.49108037 1.52556415]
