In [1]:
# === Install required packages ===
# pip install pandas scikit-learn openpyxl xgboost imbalanced-learn matplotlib

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# ===============================
# 1. Load metadata only
# ===============================
meta_df = pd.read_excel("Final Dataset.xlsx")

# ===============================
# 2. Reduce categorical levels
# ===============================
categorical_vars = ["MPA", "1st Genre"]

meta_df["MPA"] = meta_df["MPA"].apply(lambda x: x if x in ["PG-13", "R"] else "Other")
main_genres = ["Action", "Drama", "Comedy", "Biography"]
meta_df["1st Genre"] = meta_df["1st Genre"].apply(lambda x: x if x in main_genres else "Other")

# Combine categorical features
meta_df["MPA_Genre"] = meta_df["MPA"] + "_" + meta_df["1st Genre"]
categorical_vars.append("MPA_Genre")

# ===============================
# 3. One-hot encode categorical variables
# ===============================
import sklearn
if sklearn.__version__ >= "1.2":
    encoder = OneHotEncoder(drop="first", sparse_output=False)
else:
    encoder = OneHotEncoder(drop="first", sparse=False)

encoded_cats = encoder.fit_transform(meta_df[categorical_vars])
encoded_cat_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(categorical_vars))

# ===============================
# 4. Prepare numerical variables + interaction features
# ===============================
numerical_vars = ["budget", "Duration_Minutes", "First Actor Avg", "Second Actor Avg", "Average IMDb Rating"]
numerical_df = meta_df[numerical_vars].reset_index(drop=True)

# Interaction & log features
numerical_df["budget_x_duration"] = numerical_df["budget"] * numerical_df["Duration_Minutes"]
numerical_df["budget_div_duration"] = numerical_df["budget"] / (numerical_df["Duration_Minutes"] + 1)
numerical_df["budget_x_rating"] = numerical_df["budget"] * numerical_df["Average IMDb Rating"]
numerical_df["duration_x_rating"] = numerical_df["Duration_Minutes"] * numerical_df["Average IMDb Rating"]
numerical_df["actor_avg_diff"] = numerical_df["First Actor Avg"] - numerical_df["Second Actor Avg"]
numerical_df["budget_log"] = np.log1p(numerical_df["budget"])
numerical_df["duration_log"] = np.log1p(numerical_df["Duration_Minutes"])
numerical_df["rating_squared"] = numerical_df["Average IMDb Rating"] ** 2
numerical_df["budget_squared"] = numerical_df["budget"] ** 2

# ===============================
# 5. Combine metadata-based features only
# ===============================
X = pd.concat([numerical_df, encoded_cat_df.reset_index(drop=True)], axis=1)

# ===============================
# 6. Target variable
# ===============================
y_str = meta_df["Rating"].apply(lambda r: "Success" if r >= 6.5 else "Unsuccess")
le = LabelEncoder()
y = le.fit_transform(y_str)  # Success=1, Unsuccess=0

# ===============================
# 7. Train-test split
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Original class distribution:", dict(pd.Series(y_train).value_counts()))

# ===============================
# 8. Resample using SMOTE
# ===============================
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print("After SMOTE class distribution:", dict(pd.Series(y_train_res).value_counts()))

# ===============================
# 9. Define class-balanced models
# ===============================
y_train_series = pd.Series(y_train)
scale_pos_weight = y_train_series.value_counts()[0] / y_train_series.value_counts()[1]

xgb_model = XGBClassifier(
    n_estimators=500,
    max_depth=7,
    learning_rate=0.08,
    subsample=0.9,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=1.0,
    use_label_encoder=False,
    eval_metric="logloss",
    scale_pos_weight=scale_pos_weight,
    random_state=42
)

rf_model = RandomForestClassifier(
    n_estimators=500,
    max_depth=8,
    class_weight="balanced",
    random_state=42
)

lr_model = LogisticRegression(
    max_iter=2000,
    class_weight="balanced"
)

# ===============================
# 10. Weighted Voting Ensemble
# ===============================
voting = VotingClassifier(
    estimators=[("xgb", xgb_model), ("rf", rf_model), ("lr", lr_model)],
    voting="soft",
    weights=[3, 2, 1],  # give more weight to XGB
    n_jobs=-1
)

voting.fit(X_train_res, y_train_res)

# ===============================
# 11. Threshold tuning
# ===============================
y_prob = voting.predict_proba(X_test)[:, 1]
thresholds = np.linspace(0.3, 0.5, 21)
f1_scores = [f1_score(y_test, (y_prob >= t).astype(int)) for t in thresholds]
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]

y_pred = (y_prob >= best_threshold).astype(int)

# ===============================
# 12. Final evaluation
# ===============================
print(f"\nOptimal Threshold: {best_threshold:.2f}")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Original class distribution: {1: 571, 0: 549}
After SMOTE class distribution: {1: 571, 0: 571}

Optimal Threshold: 0.33
Accuracy: 0.7321428571428571
              precision    recall  f1-score   support

           0       0.79      0.61      0.69       137
           1       0.70      0.85      0.76       143

    accuracy                           0.73       280
   macro avg       0.74      0.73      0.73       280
weighted avg       0.74      0.73      0.73       280

