## Random_Forest Model to Predict Platform

#### -- Genre, Age_rating, Duration are selected as training features
#### -- Predicts the platform for the given inputs

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from joblib import dump
import os


# Read the cleaned movie table from GitHub (the URL points at a specific commit hash).
DATA_URL = r'https://raw.githubusercontent.com/ulquyorra-11/Cinemalytics/5da1bd9f3c477cf9c5337f0881c5eeefb3e4115b/data/clean/updated_clean_combined_movies.csv'
df = pd.read_csv(DATA_URL)

# One LabelEncoder per categorical column. Each encoder is kept under its own
# name because later code reuses it to encode user input and to decode the
# predicted platform.
label_encoder_genre = LabelEncoder()
label_encoder_age_rating = LabelEncoder()
label_encoder_platform = LabelEncoder()
for source_column, encoder in (
    ('genre', label_encoder_genre),
    ('age_rating', label_encoder_age_rating),
    ('platform', label_encoder_platform),
):
    # Adds '<column>_encoded' holding the integer code of every row's category.
    df[source_column + '_encoded'] = encoder.fit_transform(df[source_column])

# Features: encoded genre, duration in minutes, encoded age rating.
# Target: encoded platform.
feature_columns = ['genre_encoded', 'duration_min', 'age_rating_encoded']
X = df[feature_columns]
y = df['platform_encoded']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and fit the Random Forest classifier in one step (fit() returns the estimator).
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Create the directory that holds the saved model and encoders.
# exist_ok=True keeps this cell re-runnable: without it os.makedirs raises
# FileExistsError as soon as the directory already exists.
MODEL_DIR = 'trained_models'
os.makedirs(MODEL_DIR, exist_ok=True)

# Save the trained model.
# Paths are built with os.path.join so the files land inside MODEL_DIR on every
# OS; a hard-coded backslash is only treated as a separator on Windows.
dump(clf, os.path.join(MODEL_DIR, 'random_forest_model.joblib'))

# Save the label encoders so that inputs can be encoded (and predictions
# decoded) with exactly the same category-to-integer mapping used for training.
dump(label_encoder_genre, os.path.join(MODEL_DIR, 'genre_encoder.joblib'))
dump(label_encoder_age_rating, os.path.join(MODEL_DIR, 'age_rating_encoder.joblib'))
dump(label_encoder_platform, os.path.join(MODEL_DIR, 'platform_encoder.joblib'))

# Predict on the held-out test set and measure the classifier's accuracy.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Gather user input.
genre_input = input("Enter the movie's genre: ")
duration_text = input("Enter the movie's duration in minutes: ")
try:
    # Validate the duration here so a non-numeric entry produces a readable
    # message instead of an unhandled ValueError traceback.
    duration_input = int(duration_text)
except ValueError:
    print(f"Error: duration must be a whole number of minutes, got {duration_text!r}.")
    exit()
age_rating_input = input("Enter the movie's age rating: ")

# Transform inputs using the fitted LabelEncoders.
# LabelEncoder.transform raises ValueError for a label it was not fitted on.
try:
    genre_encoded = label_encoder_genre.transform([genre_input])[0]
    age_rating_encoded = label_encoder_age_rating.transform([age_rating_input])[0]
except ValueError as e:
    print(f"Error: {e}. Please ensure your inputs match the dataset's categories.")
    exit()

# Build a one-row DataFrame whose column names and order match the training features.
new_movie = pd.DataFrame({'genre_encoded': [genre_encoded], 'duration_min': [duration_input], 'age_rating_encoded': [age_rating_encoded]})

# Use the trained classifier to predict the best platform.
# predicted_platform stays a one-element array of platform names; later code
# indexes it with [0].
predicted_platform_encoded = clf.predict(new_movie)
predicted_platform = label_encoder_platform.inverse_transform(predicted_platform_encoded)

print(f'Accuracy on the test set: {accuracy:.4f}')
print(f'Best Platform for the new movie: {predicted_platform[0]}')


Accuracy on the test set: 0.7487
Best Platform for the new movie: Prime Video


## Random_forest regressor model to predict Revenue

#### -- The platform predicted by the classifier above is used as the platform input for the new movie
#### -- Training features are Genre, Age_rating, Duration, Platform 
#### -- Predicts the approximate revenue for the given inputs

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import joblib

# Read the same cleaned movie table used by the platform classifier.
DATA_URL = r'https://raw.githubusercontent.com/ulquyorra-11/Cinemalytics/5da1bd9f3c477cf9c5337f0881c5eeefb3e4115b/data/clean/updated_clean_combined_movies.csv'
df = pd.read_csv(DATA_URL)

# Fit one LabelEncoder per categorical column; the platform encoder is reused
# further down to encode the platform of the new movie.
label_encoder_genre = LabelEncoder()
label_encoder_age_rating = LabelEncoder()
label_encoder_platform = LabelEncoder()
for source_column, encoder in (
    ('genre', label_encoder_genre),
    ('age_rating', label_encoder_age_rating),
    ('platform', label_encoder_platform),
):
    # Adds '<column>_encoded' holding the integer code of every row's category.
    df[source_column + '_encoded'] = encoder.fit_transform(df[source_column])

# Features: encoded genre, duration, encoded age rating and encoded platform.
# 'revenue' is the regression target, not a feature.
feature_columns = ['genre_encoded', 'duration_min', 'age_rating_encoded', 'platform_encoded']
X = df[feature_columns]
y = df['revenue']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and fit the Random Forest regressor in one step (fit() returns the estimator).
regressor = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

import os  # imported here because this cell's own import list does not include os

# Save the trained model.
# The directory is created if missing (exist_ok=True makes this a no-op when it
# already exists), and the path is built with os.path.join so the file lands
# inside the directory on every OS rather than only where '\' is a separator.
MODEL_DIR = 'trained_models'
os.makedirs(MODEL_DIR, exist_ok=True)
joblib.dump(regressor, os.path.join(MODEL_DIR, 'random_forest_regressor_with_revenue.joblib'))

# The label encoders are not saved again here: they are fitted on the same
# columns of the same CSV as in the platform-classifier cell, which already
# writes them to disk.


# Predict on the held-out test set and evaluate the regressor.
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)  # Calculate mean squared error

print(f'Mean Squared Error on the test set: {mse:.4f}')

# Predict the revenue for the movie described in the platform-classifier cell.
# genre_encoded, duration_input, age_rating_encoded and predicted_platform are
# produced by that cell, so it must be run first.

# predicted_platform is a one-element array, so take the platform name itself.
# Wrapping the whole array in a list hands LabelEncoder.transform a nested
# (column-vector) input, which triggers the column_or_1d warning seen in this
# cell's recorded output.
platform_input = predicted_platform[0]

# Transform inputs using the fitted LabelEncoders
platform_encoded = label_encoder_platform.transform([platform_input])[0]

# Build a one-row DataFrame whose column names and order match the training features.
new_movie = pd.DataFrame({'genre_encoded': [genre_encoded], 'duration_min': [duration_input], 'age_rating_encoded': [age_rating_encoded], 'platform_encoded': [platform_encoded]})

# Use the trained regressor to predict the revenue
predicted_revenue = regressor.predict(new_movie)

print(f'Predicted Revenue for the new movie: {predicted_revenue[0]}')


Mean Squared Error on the test set: 103883861231611712.0000
Predicted Revenue for the new movie: 515733482.2258794


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
