## Problem Statement and Dataset

### *Objective:* Predict the rating of a movie based on different features.
### *Data:* IMDb India Movie ratings dataset with the features Name, Year, Duration, Genre, Rating, Votes, Director, Actor 1, Actor 2, Actor 3.

## Import necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
#from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

## Load the Dataset

In [2]:
data = pd.read_csv("D:/Yuktha/Codsoft/Task2_Movie_rating/IMDb_Movies_India.csv", encoding='latin1')
# Since there are a few errors while reading file with default utf-8 encoding, I have used latin1 encoding

## Exploratory Data Analysis (EDA)

In [3]:
#View the data
data

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,(1988),,Action,,,,,,


In [4]:
# Display summary statistics for numerical columns
summary_stats = data.describe()
print("Summary Statistics:")
print(summary_stats)

Summary Statistics:
            Rating
count  7919.000000
mean      5.841621
std       1.381777
min       1.100000
25%       4.900000
50%       6.000000
75%       6.800000
max      10.000000


In [5]:
data.isnull().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

## Data Preprocessing

In [6]:
# Ensure 'Year' column is treated as string before using .str accessor
data['Year'] = data['Year'].astype(str)
data['Year'] = data['Year'].str.extract(r'(\d+)', expand=False).astype(float)

# Ensure 'Duration' column is treated as string before using .str accessor
data['Duration'] = data['Duration'].astype(str)
data['Duration'] = data['Duration'].str.extract(r'(\d+)', expand=False).astype(float)

# Convert 'Votes' to numeric, handling commas
data['Votes'] = data['Votes'].replace({',': '', '\$':'',"M":''}, regex=True).astype(float)

# Handle missing values in 'Rating' using mean imputation
data['Rating'] = pd.to_numeric(data['Rating'], errors='coerce')
data['Rating'].fillna(data['Rating'].mean(), inplace=True)

# Handle missing values in 'Votes' using mean imputation
data['Votes'].fillna(data['Votes'].mean(), inplace=True)

# Handle missing values in 'Year' using mode imputation
data['Year'].fillna(data['Year'].mode()[0], inplace=True)

# Handle missing values in 'Duration' using mean imputation
data['Duration'].fillna(data['Duration'].mean(), inplace=True)

In [7]:
# Display the cleaned DataFrame
data

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,2019.0,128.126519,Drama,5.841621,1938.276283,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),2019.0,109.000000,Drama,7.000000,8.000000,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,2021.0,90.000000,"Drama, Musical",5.841621,1938.276283,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,2019.0,110.000000,"Comedy, Romance",4.400000,35.000000,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,2010.0,105.000000,Drama,5.841621,1938.276283,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,1988.0,128.126519,Action,4.600000,11.000000,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,1999.0,129.000000,"Action, Drama",4.500000,655.000000,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,2005.0,128.126519,Action,5.841621,1938.276283,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,1988.0,128.126519,Action,5.841621,1938.276283,,,,


## Model Building



In [10]:
# Define features and target variable
X = data[['Year', 'Duration', 'Votes']]  # Features excluding 'Name', 'Rating', 'Genre', 'Director', 'Actor1', 'Actor2', 'Actor3'
y = data['Rating']  # Target variable 'Rating'

In [11]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Linear Regression model
model = LinearRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Predict ratings for the test data
y_pred = model.predict(X_test)

## Model Evaluation

In [12]:
# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")

Mean Squared Error (MSE): 0.9279798152003759


In [13]:
# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R-squared (R²)
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R²): {r2}")

# Calculate Adjusted R-squared
n = len(y_test)
p = X_train.shape[1]
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))
print(f"Adjusted R-squared: {adjusted_r2}")

# Calculate Mean Absolute Percentage Error (MAPE)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print(f"Mean Absolute Percentage Error (MAPE): {mape}%")

# Calculate Median Absolute Error
medae = median_absolute_error(y_test, y_pred)
print(f"Median Absolute Error (MedAE): {medae}")

Mean Absolute Error (MAE): 0.5980292440366146
Root Mean Squared Error (RMSE): 0.9633170896440983
R-squared (R²): 0.03584246154755255
Adjusted R-squared: 0.03490880350515191
Mean Absolute Percentage Error (MAPE): 12.338266341411174%
Median Absolute Error (MedAE): 0.19608672318435172


>> ## The model performance is quite low when we fit a linear regression Model
>> ## R^2 is also low, which means the independent variables do not well explain the dependent varible in this model

## Improvising the model and evaluating 

In [14]:
### Feature Engineering and Selection
### First, let's try creating polynomial features and selecting the best ones.

from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline

# Create polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Select the best features
selector = SelectKBest(score_func=f_regression, k='all')
X_poly_selected = selector.fit_transform(X_poly, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_poly_selected, y, test_size=0.2, random_state=42)

# Initialize and fit the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict ratings for the test data
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
adjusted_r2 = 1 - (1 - r2) * ((len(y_test) - 1) / (len(y_test) - X_train.shape[1] - 1))
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
medae = median_absolute_error(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")
print(f"Adjusted R-squared: {adjusted_r2}")
print(f"Mean Absolute Percentage Error (MAPE): {mape}%")
print(f"Median Absolute Error (MedAE): {medae}")

Mean Squared Error (MSE): 0.8927053235416608
Mean Absolute Error (MAE): 0.5933525408963858
Root Mean Squared Error (RMSE): 0.9448308438771782
R-squared (R²): 0.07249214561474793
Adjusted R-squared: 0.06979241382643375
Mean Absolute Percentage Error (MAPE): 12.297118636260427%
Median Absolute Error (MedAE): 0.20626454067953315


>> ## Here we can see an increase in the R^2 value, suggesting that this is comparitively a better model

#### We also try fitting a Random Forest Regressor and tune its hyperparameters.

In [15]:
# Initialize the Random Forest model
rf_model = RandomForestRegressor(random_state=42)

# Define the hyperparameters grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# Initialize Grid Search
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best model
best_rf_model = grid_search.best_estimator_

# Predict ratings for the test data
y_pred = best_rf_model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
adjusted_r2 = 1 - (1 - r2) * ((len(y_test) - 1) / (len(y_test) - X_train.shape[1] - 1))
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
medae = median_absolute_error(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")
print(f"Adjusted R-squared: {adjusted_r2}")
print(f"Mean Absolute Percentage Error (MAPE): {mape}%")
print(f"Median Absolute Error (MedAE): {medae}")

Mean Absolute Error (MAE): 0.44694791631645475
Root Mean Squared Error (RMSE): 0.8386316257684516
R-squared (R²): 0.26927840265057634
Adjusted R-squared: 0.26715146397782574
Mean Absolute Percentage Error (MAPE): 9.436986600397873%
Median Absolute Error (MedAE): 0.021227241085970405


>> ## However, since the evaluation metrics did not show improvement, I decided to revert to using the previous linear regression model with selected features.

## A few data records for movie rating prediction

In [19]:
# Define new data points
new_data = {
    'Year': [2023, 2023, 2024],
    'Duration': [120, 130, 140],
    'Votes': [10000, 15000, 12000]
}

# Convert to DataFrame
new_df = pd.DataFrame(new_data)

# Transform the new data points using PolynomialFeatures and SelectKBest
new_data_poly = poly.transform(new_df)
new_data_selected = selector.transform(new_data_poly)

# Predict the ratings using the trained model
predicted_ratings = model.predict(new_data_selected)

# Print the new data points and their predicted ratings
print("\nNew Data Points:")
print(new_df)
for i, rating in enumerate(predicted_ratings):
    print(f"Data Point {i + 1}: Predicted Rating = {rating:.2f}")


New Data Points:
   Year  Duration  Votes
0  2023       120  10000
1  2023       130  15000
2  2024       140  12000
Data Point 1: Predicted Rating = 6.02
Data Point 2: Predicted Rating = 6.12
Data Point 3: Predicted Rating = 5.90


In [20]:
import pandas as pd
import chardet

def detect_encoding(file_path):
    with open(file_path, 'rb') as file:
        raw_data = file.read(10000)
    result = chardet.detect(raw_data)
    return result['encoding']

def find_encoding_errors(file_path, encoding):
    error_details = []

    with open(file_path, 'rb') as file:
        for i, line in enumerate(file):
            try:
                line.decode(encoding)
            except UnicodeDecodeError as e:
                error_details.append({
                    'row': i + 1,
                    'byte_position': e.start,
                    'problematic_bytes': line[e.start:e.end]
                })
    
    return error_details

def locate_error_in_dataframe(file_path, encoding, errors):
    with open(file_path, 'r', encoding=encoding, errors='replace') as file:
        for error in errors:
            row = error['row']
            byte_position = error['byte_position']

            for i in range(row):
                line = file.readline()
                if i + 1 == row:
                    columns = line.split(',')
                    break

            cumulative_length = 0
            for col_num, col_value in enumerate(columns):
                cumulative_length += len(col_value) + 1
                if cumulative_length > byte_position:
                    column_name = columns[col_num]
                    break
            
            print(f"Row: {row}, Column: {column_name}, Byte Position: {byte_position}, Problematic Bytes: {error['problematic_bytes']}")

file_path = "D:/Yuktha/Codsoft/Task2_Movie_rating/IMDb_Movies_India.csv"
detected_encoding = detect_encoding(file_path)
print(f"Detected encoding: {detected_encoding}")

errors = find_encoding_errors(file_path, detected_encoding)

if errors:
    locate_error_in_dataframe(file_path, detected_encoding, errors)
else:
    print("No encoding errors found.")

Detected encoding: ascii
Row: 836, Column: Aitana S�nchez-Gij�n, Byte Position: 67, Problematic Bytes: b'\xe1'
Row: 1247, Column: Rajni Bala, Byte Position: 72, Problematic Bytes: b'\xe1'
Row: 1661, Column: Rajni Bala, Byte Position: 83, Problematic Bytes: b'\xf6'
Row: 1956, Column: Mohan Dayaram Bhavnani, Byte Position: 31, Problematic Bytes: b'\xec'
Row: 2394, Column: Mohan Dayaram Bhavnani, Byte Position: 121, Problematic Bytes: b'\xf6'
Row: 2549, Column: Krishna Singh Bisht, Byte Position: 62, Problematic Bytes: b'\xe9'
Row: 3250, Column: 7.9, Byte Position: 56, Problematic Bytes: b'\xe9'
Row: 3457, Column: 7.9, Byte Position: 90, Problematic Bytes: b'\xf3'
Row: 4162, Column: 7.9, Byte Position: 54, Problematic Bytes: b'\xe7'
Row: 4176, Column: 7.9, Byte Position: 67, Problematic Bytes: b'\xef'
Row: 5329, Column: 7.9, Byte Position: 45, Problematic Bytes: b'\xe1'
Row: 5518, Column: 7.9, Byte Position: 3, Problematic Bytes: b'\xe9'
Row: 6061, Column: 7.9, Byte Position: 64, Problema