In [None]:
# 📌 1. Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


# 📌 3. Load and clean the dataset
df = pd.read_csv("/content/horror_movies.csv")

# Drop rows with missing vote_average (target)
df = df.dropna(subset=['vote_average'])

# Select useful features. They are grouped by type so each group can get a
# missing-value strategy that keeps its dtype intact; the concatenated order
# is the column order the scaler and model are trained on.
numeric_features = ['popularity', 'vote_count', 'budget', 'revenue', 'runtime']
categorical_features = ['genre_names', 'original_language', 'status']
features = numeric_features + categorical_features
target = 'vote_average'

df = df[features + [target]].copy()

# Numeric columns: coerce to numbers and fill gaps with the column median.
# Filling them with the string "Unknown" would turn the column into object
# dtype and make StandardScaler fail on the first missing value.
for col in numeric_features:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    col_median = df[col].median()
    # An all-missing column has no median; fall back to 0 so no NaN survives.
    df[col] = df[col].fillna(0.0 if pd.isna(col_median) else col_median)

# Categorical columns: a placeholder label is a legitimate category here.
df[categorical_features] = df[categorical_features].fillna("Unknown")

# Encode categorical features; the fitted encoders are kept so the same
# mapping can be applied to user input later.
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# 📌 4. Prepare train/test sets
X = df[features]
y = df[target]

# Split BEFORE scaling so that the held-out rows do not contribute to the
# scaler's mean/variance (fitting on the full data leaks test information).
# The same test_size/random_state are used, so the row assignment is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics from training rows only
X_test = scaler.transform(X_test_raw)

# Kept for any later code that expects the fully scaled feature matrix.
X_scaled = scaler.transform(X)

# 📌 5. Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 📌 6. Take user input
print("\n🎬 Enter movie details to predict IMDb rating:")


def _prompt(message, convert=None):
    """Read one line from stdin and optionally pass it through *convert*."""
    raw = input(message)
    return raw if convert is None else convert(raw)


# Numeric answers are converted immediately; a non-numeric entry raises
# ValueError from the converter.
user_popularity = _prompt("Popularity: ", float)
user_vote_count = _prompt("Vote count: ", int)
user_budget = _prompt("Budget ($): ", float)
user_revenue = _prompt("Revenue ($): ", float)
user_runtime = _prompt("Runtime (minutes): ", float)

# Categorical answers stay as raw strings; they are encoded further below.
user_genre = _prompt("Genre (e.g. Horror, Thriller): ")
user_lang = _prompt("Original language (e.g. en, es): ")
user_status = _prompt("Status (e.g. Released, Post Production): ")

# Encode input safely
def safe_encode(val, encoder):
    """Return the integer code for ``val`` using a fitted label encoder.

    Lookup order:

    1. Exact match against ``encoder.classes_``.
    2. Case- and surrounding-whitespace-insensitive match, so that e.g.
       ``"thriller"`` resolves to a trained ``"Thriller"`` class.
    3. Otherwise ``len(encoder.classes_)`` is returned as a single
       out-of-vocabulary code.

    The encoder is never modified: appending to ``classes_`` would break the
    sorted order the encoder established at fit time and would make its
    state depend on what users happened to type.

    Args:
        val: Raw category value to encode.
        encoder: Fitted object exposing ``classes_`` and ``transform``.

    Returns:
        int: The code of the matching class, or ``len(encoder.classes_)``
        when no class matches.
    """
    known = list(encoder.classes_)
    if val not in known:
        wanted = str(val).strip().casefold()
        match = next(
            (c for c in known if str(c).strip().casefold() == wanted),
            None,
        )
        if match is None:
            # Unseen category: one shared code just past the trained range.
            return len(known)
        val = match
    return int(encoder.transform([val])[0])

# Encode the three categorical answers with the encoder fitted on the
# matching training column (evaluated in genre, language, status order).
encoded_genre, encoded_lang, encoded_status = (
    safe_encode(answer, label_encoders[column])
    for answer, column in (
        (user_genre, 'genre_names'),
        (user_lang, 'original_language'),
        (user_status, 'status'),
    )
)

# Prepare input: a single row whose column order matches `features`.
feature_row = [
    user_popularity,
    user_vote_count,
    user_budget,
    user_revenue,
    user_runtime,
    encoded_genre,
    encoded_lang,
    encoded_status,
]
user_input = [feature_row]

# Apply the same scaling that was applied to the training data.
user_input_scaled = scaler.transform(user_input)

# 📌 7. Predict
predictions = model.predict(user_input_scaled)
predicted_rating = predictions[0]
print(f"\n⭐ Predicted IMDb rating: {round(predicted_rating, 2)}")



🎬 Enter movie details to predict IMDb rating:
Popularity: 2000
Vote count: 500
Budget ($): 0
Revenue ($): 959848
Runtime (minutes): 120
Genre (e.g. Horror, Thriller): thriller
Original language (e.g. en, es): en
Status (e.g. Released, Post Production): released

⭐ Predicted IMDb rating: 6.46




In [None]:
from sklearn.ensemble import RandomForestRegressor
import joblib

# Sample model training
# The seed matches the earlier training cell so that re-running this cell
# produces the same forest (and therefore the same saved artifact) each time.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save model to a .pkl file
# NOTE(review): only the estimator is persisted. Predictions in a fresh
# session also need the fitted `scaler` and `label_encoders` from the earlier
# cell — persist those alongside the model if it will be loaded elsewhere.
joblib.dump(model, 'my_model.pkl')



['my_model.pkl']