In [38]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [39]:
# Load the raw movie table from disk, decoding the file as ISO-8859-1.
movies_df = pd.read_csv(
    filepath_or_buffer='movies.csv',
    encoding='ISO-8859-1',
)

In [40]:
print("Initial Data Summary:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after the summary.
movies_df.info()


Initial Data Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB
None


In [41]:
# Year
# Pull the first four-digit run out of the raw text and convert it to a number;
# rows without one become NaN and are then filled with the column median.
# The pattern is a raw string so that `\d` reaches the regex engine untouched
# instead of being parsed as an (invalid) Python string escape.
movies_df['Year'] = pd.to_numeric(movies_df['Year'].str.extract(r'(\d{4})')[0], errors='coerce')
num_imputer = SimpleImputer(strategy='median')
movies_df['Year'] = num_imputer.fit_transform(movies_df[['Year']])

# Duration
# Keep the first run of digits from the raw text, then median-impute the gaps.
# The same imputer object is reused; fit_transform refits it on this column.
movies_df['Duration'] = pd.to_numeric(movies_df['Duration'].str.extract(r'(\d+)')[0], errors='coerce')
movies_df['Duration'] = num_imputer.fit_transform(movies_df[['Duration']])

# Votes
# Strip thousands separators before parsing. regex=False makes the literal
# match explicit, independent of the pandas version's default for `regex`.
# Anything that still fails to parse becomes NaN and is counted as 0 votes.
movies_df['Votes'] = pd.to_numeric(movies_df['Votes'].str.replace(',', '', regex=False), errors='coerce')
movies_df['Votes'] = movies_df['Votes'].fillna(0)

# Rating
# Rows without the target value cannot be used for training or evaluation.
# The explicit .copy() gives an independent frame, so the column assignments
# below no longer trigger pandas' SettingWithCopyWarning.
movies_df = movies_df.dropna(subset=['Rating']).copy()

# Categorical values
# Missing text fields get the placeholder 'Unknown'.
cat_columns = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
for column in cat_columns:
    movies_df[column] = movies_df[column].fillna('Unknown')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df[column] = movies_df[column].fillna('Unknown')


In [42]:
# Cleaning
for column in cat_columns:
    movies_df[column] = movies_df[column].str.lower().str.strip()

In [43]:
# Feature Engineering
genres = movies_df['Genre'].str.get_dummies(sep=', ')
movies_df = pd.concat([movies_df, genres], axis=1)
movies_df = movies_df.drop(columns=['Genre'])

le = LabelEncoder()
for column in ['Director', 'Actor 1', 'Actor 2', 'Actor 3']:
    movies_df[column] = le.fit_transform(movies_df[column])

In [44]:
# For outliers
z_scores = np.abs((movies_df[['Year', 'Duration', 'Votes']] - movies_df[['Year', 'Duration', 'Votes']].mean()) / movies_df[['Year', 'Duration', 'Votes']].std())
outliers = (z_scores > 3).any(axis=1)

print(f"Number of outliers detected: {outliers.sum()}")

movies_df = movies_df[~outliers]

Number of outliers detected: 178


In [45]:
# Training and Testing data
X = movies_df.drop(columns=['Name', 'Rating'])
y = movies_df['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [46]:
# Build and Train the Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Feature Importance
feature_importance = pd.Series(model.feature_importances_, index=X.columns)
print("Feature Importances:")
print(feature_importance.sort_values(ascending=False))

Mean Squared Error: 1.246311021949645
Feature Importances:
Year           2.102679e-01
Votes          2.027666e-01
Actor 1        9.000465e-02
Actor 2        8.887933e-02
Director       8.459191e-02
Actor 3        8.228378e-02
Duration       8.164346e-02
documentary    2.540136e-02
action         2.189830e-02
drama          1.893548e-02
romance        1.576932e-02
horror         1.297374e-02
comedy         1.258265e-02
thriller       8.628093e-03
crime          7.404293e-03
adventure      5.960586e-03
musical        4.997657e-03
family         4.872242e-03
mystery        3.369883e-03
fantasy        2.982104e-03
unknown        2.604779e-03
history        2.454321e-03
biography      2.194515e-03
war            1.912726e-03
music          1.718327e-03
animation      1.470350e-03
sci-fi         7.710110e-04
sport          6.412377e-04
western        1.938439e-05
news           6.628239e-09
dtype: float64
