# MOVIE RATING PREDICTION WITH PYTHON

In [1]:
# Step 1: Data Preparation

In [2]:
pip install pandas numpy scikit-learn matplotlib seaborn

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd

# Read the movie listing into a DataFrame. The file encoding is passed
# explicitly as ISO-8859-1 instead of relying on the default.
data = pd.read_csv(
    'IMDb Movies India.csv',
    encoding='ISO-8859-1',
)

# Bare expression: the notebook renders a preview of the loaded frame.
data

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,(1988),,Action,,,,,,


In [4]:
# Display the first few rows of the dataset
print(data.head())

# Display the summary of the dataset.
# DataFrame.info() writes its report directly to stdout and returns None,
# so wrapping it in print() would emit an extra "None" line.
data.info()

# Display statistical summary of the dataset
print(data.describe())

                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    

In [5]:
# Step 2: Data Cleaning and Feature Engineering

In [6]:
# Handle missing values: numeric column -> median, categorical columns -> mode.
#
# The results are assigned back to the frame instead of calling
# `data[col].fillna(..., inplace=True)`. That form operates on a temporary
# Series selected from the frame (chained assignment): it raises a
# FutureWarning on pandas 2.x and leaves `data` unchanged once
# Copy-on-Write is enabled.
#
# NOTE(review): 'Rating' is the regression target in a later cell (`y = data['Rating']`).
# Filling its gaps with the median gives those rows an invented label that
# ends up in both the training and the test split; consider dropping the
# rows with a missing rating instead — confirm the intent.
data['Rating'] = data['Rating'].fillna(data['Rating'].median())

for col in ['Genre', 'Director', 'Actor 1']:
    # mode() returns a Series of the most frequent value(s); take the first one.
    data[col] = data[col].fillna(data[col].mode()[0])

In [7]:
# No columns are dropped at this stage.

# One-hot encode the categorical columns that were imputed in the previous cell.
# drop_first=True omits the first level of each column from the dummies.
categorical_cols = ['Genre', 'Director', 'Actor 1']
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Preview the widened frame.
print(data.head())

                                 Name    Year Duration  Rating Votes  \
0                                         NaN      NaN     6.0   NaN   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min     7.0     8   
2                         #Homecoming  (2021)   90 min     6.0   NaN   
3                             #Yaaram  (2019)  110 min     4.4    35   
4                   ...And Once Again  (2010)  105 min     6.0   NaN   

              Actor 2          Actor 3  Genre_Action, Adventure  \
0              Birbal  Rajendra Bhatia                    False   
1      Vivek Ghamande    Arvind Jangid                    False   
2   Plabita Borthakur       Roy Angana                    False   
3          Ishita Raj  Siddhant Kapoor                    False   
4  Rituparna Sengupta      Antara Mali                    False   

   Genre_Action, Adventure, Biography  Genre_Action, Adventure, Comedy  ...  \
0                               False                            False  ...   
1     

In [8]:
# Step 3: Model Building

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [10]:
# Check for non-numeric columns
print("Non-numeric columns before encoding:", data.select_dtypes(include=['object']).columns)

# 'Name' is the free-text movie title. One-hot encoding it would add roughly
# one column per row, which carries no signal that generalises to unseen
# movies, so the column is dropped instead of encoded.
data = data.drop(columns=['Name'], errors='ignore')

# 'Year' (e.g. "(2019)"), 'Duration' (e.g. "109 min") and 'Votes' (e.g. "35")
# hold numbers stored as text. Parse them into real numeric features rather
# than one-hot encoding every distinct string, which would discard their
# ordering and add a dummy column per distinct value.
year_text = data['Year'].astype(str).str.extract(r'(\d{4})', expand=False)
duration_text = data['Duration'].astype(str).str.extract(r'(\d+)', expand=False)
# Commas are stripped in case vote counts use thousands separators
# (assumption — not visible in the preview rows).
votes_text = data['Votes'].astype(str).str.replace(',', '', regex=False)

for col, text in [('Year', year_text), ('Duration', duration_text), ('Votes', votes_text)]:
    # Anything that does not parse as a number becomes NaN and is then filled
    # with the column median, because LinearRegression cannot accept NaN.
    parsed = pd.to_numeric(text, errors='coerce')
    data[col] = parsed.fillna(parsed.median())

# Encode the remaining categorical variables
data = pd.get_dummies(data, columns=['Actor 2', 'Actor 3'], drop_first=True)

# Check for non-numeric columns after encoding
print("Non-numeric columns after encoding:", data.select_dtypes(include=['object']).columns)

Non-numeric columns before encoding: Index(['Name', 'Year', 'Duration', 'Votes', 'Actor 2', 'Actor 3'], dtype='object')
Non-numeric columns after encoding: Index([], dtype='object')


In [11]:
# 'Rating' is the value to predict; every other column serves as a feature.
y = data['Rating']
X = data.drop(columns=['Rating'])

In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a linear regression on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Bare expression: the fitted estimator remains the cell's displayed output.
model

In [None]:
# Predict ratings for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions against the true ratings of the test split.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

In [None]:
# Step 4: Feature Importance

In [None]:
import numpy as np

# Pair every training column with the coefficient the fitted model assigned to it.
coefficients = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Coefficient': model.coef_,
    }
)

# Show the table ordered from the largest coefficient to the smallest.
print(coefficients.sort_values('Coefficient', ascending=False))

In [None]:
# Plot feature importances
# `plt` is used below but matplotlib is not imported in any other cell,
# so it is imported here to avoid a NameError.
import matplotlib.pyplot as plt

# The one-hot encoded frame has one coefficient per dummy column. Drawing a
# bar for every one of them is unreadable, so only the coefficients with the
# largest absolute value are shown.
TOP_N = 20
strongest_idx = coefficients['Coefficient'].abs().nlargest(TOP_N).index
strongest = coefficients.loc[strongest_idx]

strongest.sort_values(by='Coefficient', ascending=False).plot(kind='bar', x='Feature', y='Coefficient')
plt.title('Feature Importances')
plt.ylabel('Coefficient')
plt.tight_layout()
plt.show()