## **Import Libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib


## **Load and Explore the Data**

In [None]:
# Load the raw movie table.
# ISO-8859-1 (Latin-1) assigns a character to every possible byte value, so
# decoding with it cannot raise UnicodeDecodeError; a try/except fallback to
# another codec would never run and is therefore omitted.
data = pd.read_csv('Movies.csv', encoding='ISO-8859-1')

# Preview the first rows, then the column dtypes / non-null counts.
print(data.head())
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
data.info()


                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    

## **Preprocess the Data**

In [21]:
# 'Year' arrives as text like '(2019)' and 'Duration' as text like '109 min'.
# Pull the first run of digits out of each cell and convert it to a number.
# The pattern is a raw string: '\d' inside a normal string literal is an
# invalid escape sequence and triggers a Deprecation/SyntaxWarning.
FIRST_DIGIT_RUN = r'(\d+)'

# astype(str) guarantees the .str accessor works whatever dtype was loaded;
# cells without any digits (including missing values) come out of extract()
# as NaN, and errors='coerce' keeps them NaN instead of raising.
data['Year'] = pd.to_numeric(
    data['Year'].astype(str).str.extract(FIRST_DIGIT_RUN, expand=False),
    errors='coerce',
)
data['Duration'] = pd.to_numeric(
    data['Duration'].astype(str).str.extract(FIRST_DIGIT_RUN, expand=False),
    errors='coerce',
)


## **Define Features and Target Variable**

In [22]:
# Define features and target variable.
# 'Rating' is missing for some movies (see the preview of the raw data), and
# scikit-learn estimators and metrics reject NaN targets, so only rows with a
# known rating are kept. Missing *feature* values are left in place here; the
# imputers in the preprocessing pipelines handle them.
rated_movies = data.dropna(subset=['Rating'])

X = rated_movies[['Year', 'Duration', 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']]
y = rated_movies['Rating']


## **Set Up Preprocessing Pipelines**

In [23]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric branch: fill missing values with the column median, then
# standardize the result.
numeric_features = ['Year', 'Duration']

numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)


### **Categorical Data Preprocessing**

In [24]:
from sklearn.preprocessing import OneHotEncoder

# Categorical branch: fill missing values with each column's most frequent
# value, then one-hot encode. handle_unknown='ignore' lets transform() accept
# categories that were not present when the encoder was fitted.
categorical_features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)


### **Combine Preprocessing Steps**

In [25]:
from sklearn.compose import ColumnTransformer

# Send each group of columns through its own transformer pipeline:
# (label, transformer, columns it applies to).
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)


## **Define and Train the Model**

In [26]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Full model: preprocessing followed by a 100-tree random forest (seeded so
# repeated runs build the same forest).
forest = RandomForestRegressor(n_estimators=100, random_state=0)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

# Fit preprocessing and regressor together on the training portion only.
model.fit(X_train, y_train)


## **Make Predictions**

In [27]:
# Predict ratings for the held-out test rows; the fitted pipeline applies its
# preprocessing step to X_test before the regressor sees it.
y_pred = model.predict(X_test)


## **Evaluate the Model**

In [28]:
from sklearn.metrics import mean_squared_error, r2_score

# Compare the held-out predictions with the true ratings.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 1.5589793943386994
R^2 Score: 0.22780583142501654
