# Random_Forest_Model

#### Model without User Input and without corresponding testing

In [8]:
import pandas as pd
from joblib import dump
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Locations of the input dataset and of the directory that receives the saved
# model and encoders. They are named once here so they are easy to find and
# change; the resulting file paths are the same as before.
DATA_PATH = r'C:\Users\data science\Downloads\GitHub_Repositories\OTT_SUGGESTER\data\clean\updated_clean_combined_movies.csv'
MODEL_DIR = r'C:\Users\data science\Desktop\OTT_project\trained_models'

# Load the dataset
df = pd.read_csv(DATA_PATH)

# Encode 'genre', 'age_rating' and 'platform' as integer codes.
# A separate LabelEncoder is kept per column so that new inputs can be encoded
# the same way later and predictions can be mapped back to platform names.
label_encoder_genre = LabelEncoder()
df['genre_encoded'] = label_encoder_genre.fit_transform(df['genre'])

label_encoder_age_rating = LabelEncoder()
df['age_rating_encoded'] = label_encoder_age_rating.fit_transform(df['age_rating'])

label_encoder_platform = LabelEncoder()
df['platform_encoded'] = label_encoder_platform.fit_transform(df['platform'])

# Select features and target variable for the model
X = df[['genre_encoded', 'duration_min', 'age_rating_encoded']]
y = df['platform_encoded']

# Split data into training and testing sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Save the trained model
dump(clf, MODEL_DIR + r'\random_forest_model.joblib')

# Save the label encoders next to the model; they are needed to encode inputs
# and decode predictions wherever the saved model is loaded.
dump(label_encoder_genre, MODEL_DIR + r'\genre_encoder.joblib')
dump(label_encoder_age_rating, MODEL_DIR + r'\age_rating_encoder.joblib')
dump(label_encoder_platform, MODEL_DIR + r'\platform_encoder.joblib')

# Predict on the test set and evaluate the model's accuracy
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Example movie to classify: 'Comedy' genre, 120 min duration, 'R' age rating.
# These are placeholders standing in for user input.
genre_input = 'Comedy'
duration_input = 120
age_rating_input = 'R'

# Transform inputs using the fitted LabelEncoders.
# transform() raises ValueError if a value was not present in the fitted column.
genre_encoded = label_encoder_genre.transform([genre_input])[0]
age_rating_encoded = label_encoder_age_rating.transform([age_rating_input])[0]

# Create a one-row DataFrame for the new movie, using the same column names
# as the training features.
new_movie = pd.DataFrame({
    'genre_encoded': [genre_encoded],
    'duration_min': [duration_input],
    'age_rating_encoded': [age_rating_encoded],
})

# Use the trained classifier to predict the platform, then map the predicted
# code back to the original platform label.
predicted_platform_encoded = clf.predict(new_movie)
predicted_platform = label_encoder_platform.inverse_transform(predicted_platform_encoded)

print(f'Accuracy on the test set: {accuracy:.4f}')
print(f'Best Platform for the new movie: {predicted_platform[0]}')


Accuracy on the test set: 0.8612
Best Platform for the new movie: Prime Video


In [2]:
# Show the column labels of `df` to check which encoded columns are present.
# NOTE(review): the saved output for this cell does not list 'age_rating_encoded',
# although the training cell adds that column, and its execution count (In [2]) is
# lower than the training cell's (In [8]) — the output looks stale; re-run to refresh.
df.columns


Index(['title', 'age_rating', 'duration_min', 'genre', 'budget', 'revenue',
       'platform', 'genre_encoded', 'platform_encoded'],
      dtype='object')

#### Model with user input and corresponding testing

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from joblib import dump


# Locations of the input dataset and of the directory that receives the saved
# model and encoders. They are named once here so they are easy to find and
# change; the resulting file paths are the same as before.
DATA_PATH = r'C:\Users\data science\Downloads\GitHub_Repositories\OTT_SUGGESTER\data\clean\updated_clean_combined_movies.csv'
MODEL_DIR = r'C:\Users\data science\Desktop\OTT_project\trained_models'

# Load the dataset
df = pd.read_csv(DATA_PATH)

# Encode 'genre', 'age_rating' and 'platform' as integer codes.
# A separate LabelEncoder is kept per column so that user input can be encoded
# the same way below and the prediction can be mapped back to a platform name.
label_encoder_genre = LabelEncoder()
df['genre_encoded'] = label_encoder_genre.fit_transform(df['genre'])

label_encoder_age_rating = LabelEncoder()
df['age_rating_encoded'] = label_encoder_age_rating.fit_transform(df['age_rating'])

label_encoder_platform = LabelEncoder()
df['platform_encoded'] = label_encoder_platform.fit_transform(df['platform'])

# Select features and target variable for the model
X = df[['genre_encoded', 'duration_min', 'age_rating_encoded']]
y = df['platform_encoded']

# Split data into training and testing sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Save the trained model
dump(clf, MODEL_DIR + r'\random_forest_model.joblib')

# Save the label encoders next to the model; they are needed to encode inputs
# and decode predictions wherever the saved model is loaded.
dump(label_encoder_genre, MODEL_DIR + r'\genre_encoder.joblib')
dump(label_encoder_age_rating, MODEL_DIR + r'\age_rating_encoder.joblib')
dump(label_encoder_platform, MODEL_DIR + r'\platform_encoder.joblib')

# Predict on the test set and evaluate the model's accuracy
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Gather user input. Surrounding whitespace is stripped so that an accidental
# leading/trailing space does not make a valid category look unknown.
genre_input = input("Enter the movie's genre: ").strip()
duration_text = input("Enter the movie's duration in minutes: ").strip()
age_rating_input = input("Enter the movie's age rating: ").strip()

# Validate and encode the inputs. `new_movie` stays None unless every input is
# usable, so the prediction step below never runs on missing values or on
# variables left over from an earlier cell.
# exit() is deliberately not used on failure: inside a notebook it asks the
# kernel to shut down rather than just ending this cell.
new_movie = None
try:
    duration_input = int(duration_text)
except ValueError:
    print(f"Error: {duration_text!r} is not a whole number of minutes.")
else:
    try:
        # transform() raises ValueError for a value not seen when the encoder was fitted.
        genre_encoded = label_encoder_genre.transform([genre_input])[0]
        age_rating_encoded = label_encoder_age_rating.transform([age_rating_input])[0]
    except ValueError as e:
        print(f"Error: {e}. Please ensure your inputs match the dataset's categories.")
    else:
        # One-row DataFrame with the same column names as the training features.
        new_movie = pd.DataFrame({
            'genre_encoded': [genre_encoded],
            'duration_min': [duration_input],
            'age_rating_encoded': [age_rating_encoded],
        })

print(f'Accuracy on the test set: {accuracy:.4f}')

if new_movie is not None:
    # Use the trained classifier to predict the platform, then map the predicted
    # code back to the original platform label.
    predicted_platform_encoded = clf.predict(new_movie)
    predicted_platform = label_encoder_platform.inverse_transform(predicted_platform_encoded)
    print(f'Best Platform for the new movie: {predicted_platform[0]}')


Accuracy on the test set: 0.8612
Best Platform for the new movie: Prime Video
