In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [12]:
# Read the movies CSV (decoded as Latin-1) into `df` and preview the first rows
csv_path = "IMDb Movies India.csv"
df = pd.read_csv(csv_path, encoding="latin1")
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [13]:
# Inspect the schema (dtypes and non-null counts), then count missing values per column
df.info()
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [14]:
# Keep only the rows that have a value in every essential column
required_columns = ['Name', 'Year', 'Genre', 'Director', 'Actor 1', 'Rating', 'Votes']
df = df.dropna(subset=required_columns)

In [15]:
# Convert Votes and Year from text to integers and make sure Rating is float.
# (Duration is not modified in this cell.)

# Votes: strip commas (presumably thousands separators) before casting.
# regex=False makes ',' a literal match regardless of the pandas version's default.
df['Votes'] = df['Votes'].astype(str).str.replace(',', '', regex=False).astype(int)

# Year: raw values look like "(2019)"; pull out the 4-digit year.
# expand=False yields a Series instead of a one-column DataFrame.
df['Year'] = df['Year'].astype(str).str.extract(r'(\d{4})', expand=False).astype(int)

df['Rating'] = df['Rating'].astype(float)

In [16]:
# Extract primary genre: the first entry of the comma-separated Genre string
# (e.g. "Drama, Musical" -> "Drama"). The vectorised .str accessor replaces the
# per-row lambda; missing values stay missing.
df['Primary_Genre'] = df['Genre'].str.split(',').str[0]

In [17]:
# Encode categorical columns as integer codes (the columns are overwritten in place).
# Each column gets its own LabelEncoder, stored in `encoders`, so the
# label <-> code mapping of every column stays available afterwards
# (e.g. encoders['Director'].inverse_transform(...)). Reusing a single encoder
# would keep only the mapping of the last column it was fitted on.
# Run this once per freshly loaded `df`: re-running would refit the encoders on
# the already-encoded integers and lose the original labels.
encoders = {}
for column in ['Primary_Genre', 'Director', 'Actor 1']:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Backward compatibility: `le` used to end up fitted on 'Actor 1'.
le = encoders['Actor 1']

In [18]:
# Build the feature matrix X and the regression target y from the prepared frame
feature_columns = ['Primary_Genre', 'Director', 'Actor 1', 'Year', 'Votes']
target_column = 'Rating'
X = df[feature_columns]
y = df[target_column]

In [19]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Fit a random forest regressor (100 trees, fixed seed) on the training split
forest_params = dict(n_estimators=100, random_state=42)
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

In [21]:
# Score the fitted model on the held-out split and report MSE and R²
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R² Score:", r2)

Mean Squared Error: 1.37772833463035
R² Score: 0.28610308997448175


In [23]:
# Predict the rating for one hand-built example row.
# Primary_Genre, Director and Actor 1 are label-encoded integer codes here, not names.
sample_row = {'Primary_Genre': 1, 'Director': 2, 'Actor 1': 3, 'Year': 2022, 'Votes': 5000}
sample_input = pd.DataFrame([sample_row])
prediction = model.predict(sample_input)
print("Predicted Rating:", prediction[0])

Predicted Rating: 5.176999999999997
