## Preparing Data

In [61]:
import pandas as pd
# Read the tab-separated preprocessed dataset and discard the ROI column in one step.
# NOTE(review): ROI is presumably dropped because it is derived from revenue
# (the target source) — confirm.
df = pd.read_csv("../preprocessed.tsv", sep="\t").drop(columns=["ROI"])

In [None]:
# Fill missing values with 0, remove every column whose name starts with
# "Topic", then drop the writer/director aggregates and two other features
# that are excluded from this experiment.
df = (
    df.fillna(0)
    .loc[:, lambda frame: ~frame.columns.str.startswith("Topic")]
    .drop(
        columns=[
            "writer_max_total_profit",
            "writer_max_total_gross",
            "writer_max_nb_movies",
            "director_max_total_profit",
            "director_max_total_gross",
            "director_max_nb_movies",
            "network_aa_heterogeneity",
            "weighted_genre_profitability",
        ]
    )
)

In [62]:
## Preparing the data
import numpy as np
# Percentile of revenue that separates the two classes.
success_threshold = 70
# Revenue value sitting at that percentile over all rows of df.
threshold = np.percentile(df['revenue'], success_threshold)
# Label is 1 when revenue is strictly above the cut-off, otherwise 0.
y = df['revenue'].gt(threshold).astype(int)
# Features are every remaining column except the one the label is derived from.
X = df.drop(columns='revenue')

In [63]:
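## Scaling: standardize every feature with StandardScaler.
## NOTE: the scaler is fit on all rows before cross-validation, so statistics
## from the validation folds leak into training; wrap the scaler and the model
## in a sklearn Pipeline if leak-free CV scores are required.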
from sklearn.preprocessing import StandardScaler
# Learn the per-feature scaling statistics first, then apply them; `scaler`
# stays available afterwards as the fitted transformer.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

## Model training

In [64]:
# Model and hyperparameters
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


# Shared experiment settings: number of CV folds and size of the tree ensembles.
folds = 5
n_estimators = 200

# Candidate classifiers keyed by a short alias. Insertion order is preserved,
# and the ensemble cell iterates over models.values().
models = dict(
    rf=RandomForestClassifier(n_estimators=n_estimators, random_state=42),
    xgb=xgb.XGBClassifier(
        objective='binary:logistic',
        n_estimators=n_estimators,
        learning_rate=0.05,
        max_depth=2,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    ),
    # AdaBoost keeps its library-default number of estimators.
    ada=AdaBoostClassifier(algorithm='SAMME', random_state=42),
    logreg=LogisticRegression(max_iter=500),
    svm=SVC(kernel='linear', C=0.05),
    gbm=GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=0.1,
        max_depth=2,
        random_state=42,
    ),
    knn=KNeighborsClassifier(n_neighbors=150),
    nb=GaussianNB(),
)


In [65]:
from sklearn.metrics import accuracy_score, make_scorer, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate

class BinaryClassifier:
    """Cross-validated evaluation of a single binary classifier.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible classifier.
    X : array-like
        Feature matrix passed unchanged to the cross-validation routine.
    y : array-like
        Binary target aligned with the rows of ``X``.
    folds : int, default 10
        Number of stratified, shuffled folds.

    Attributes
    ----------
    scores_ : dict
        Set by :meth:`evaluate`; maps ``'accuracy'``, ``'precision'`` and
        ``'recall'`` to their mean over the folds.
    """

    def __init__(self, model, X, y, folds = 10):
        self.model = model
        self.X = X
        self.y = y
        self.folds = folds

    def evaluate(self):
        """Run stratified K-fold CV once and print mean accuracy/precision/recall.

        All three metrics are computed from the same fitted model in each fold
        through a single ``cross_validate`` call, instead of re-running the
        whole cross-validation once per metric. The fold split is seeded
        (``random_state=42``) so repeated calls use identical folds.

        Returns
        -------
        None
            Results are printed and stored on ``self.scores_``.
        """
        # Define scoring metrics
        scoring = {
            'accuracy': make_scorer(accuracy_score),
            'precision': make_scorer(precision_score),
            'recall': make_scorer(recall_score)
        }

        # Perform K-fold cross-validation: one fit per fold, scored on every metric.
        cv = StratifiedKFold(n_splits=self.folds, shuffle=True, random_state=42)
        results = cross_validate(self.model, self.X, self.y, cv=cv, scoring=scoring)

        # cross_validate reports per-fold test scores under "test_<metric name>".
        scores = {metric: float(results[f"test_{metric}"].mean()) for metric in scoring}
        self.scores_ = scores

        # Print the results
        print(f"Accuracy: {scores['accuracy']:.4f}")
        print(f"Precision: {scores['precision']:.4f}")
        print(f"Recall: {scores['recall']:.4f}")

In [66]:
# Cross-validate one model on its own: the Gaussian naive Bayes entry of `models`,
# using the shared fold count.
binary_classifier = BinaryClassifier(
    model=models["nb"],
    X=X,
    y=y,
    folds=folds,
)
binary_classifier.evaluate()

Accuracy: 0.7522
Precision: 0.5695
Recall: 0.7839


### Ensemble Methods

In [67]:
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import make_scorer, recall_score, precision_score, accuracy_score
import numpy as np
from scipy.stats import mode

class EnsembleBinaryClassifier:
    """Hard-voting ensemble scored on out-of-fold predictions.

    Parameters
    ----------
    models : iterable of estimators
        Classifiers whose predictions are combined by majority vote.
    X, y : array-like
        Features and target handed to ``cross_val_predict`` for every model.
    folds : int, default 5
        Number of stratified, shuffled folds (split seeded with 42).
    """

    def __init__(self, models, X, y, folds=5):
        self.models = models
        self.X = X
        self.y = y
        self.folds = folds

    def evaluate(self):
        """Print accuracy, precision and recall of the majority vote.

        Each model contributes one out-of-fold prediction per sample; the
        per-sample mode across models is the ensemble prediction. Metrics are
        computed once over all pooled out-of-fold predictions.

        NOTE(review): scipy's ``mode`` returns a single value when counts tie
        (documented as the smallest one), so with an even number of models a
        split vote would resolve to the lower label — confirm this is intended.
        """
        splitter = StratifiedKFold(n_splits=self.folds, shuffle=True, random_state=42)

        # One row of out-of-fold predictions per model.
        per_model = [
            cross_val_predict(estimator, self.X, self.y, cv=splitter)
            for estimator in self.models
        ]
        stacked = np.array(per_model)

        # Column-wise mode = majority vote per sample.
        majority_vote = mode(stacked, axis=0).mode.ravel()

        metric_fns = {
            'accuracy': accuracy_score,
            'precision': precision_score,
            'recall': recall_score,
        }
        # Compute every metric before printing anything.
        scores = {name: fn(self.y, majority_vote) for name, fn in metric_fns.items()}

        for name in scores:
            print(f"{name.capitalize()}: {scores[name]:.4f}")


In [68]:
# Create and evaluate ensemble model
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
# Pass the shared `folds` setting (as the single-model run does) instead of a
# hard-coded 5, so changing the setting cannot leave the two evaluations on
# different numbers of CV splits.
binary_classifier = EnsembleBinaryClassifier(models=models.values(), X=X, y=y, folds=folds)
binary_classifier.evaluate()

Accuracy: 0.8616
Precision: 0.8365
Recall: 0.6695


### Feature importance

In [None]:
import pandas as pd
import xgboost as xgb

# Train model on all rows; this fit is only used to rank features.
model = xgb.XGBClassifier()
model.fit(X, y)

# Extract feature importance.
# X loses its column labels once it has been through StandardScaler (a NumPy
# array is returned by default), which previously made this table list bare
# positions. Recover the names from the frame X was built from (df without
# 'revenue'); positional labels are used only if the widths disagree.
if isinstance(X, pd.DataFrame):
    feature_names = X.columns
else:
    source_columns = df.columns.drop('revenue')
    feature_names = source_columns if len(source_columns) == X.shape[1] else range(X.shape[1])
importance = model.feature_importances_

# Create DataFrame, most important feature first (stable sort keeps the
# original column order among equal importances).
df_importance = (
    pd.DataFrame({"Feature": list(feature_names), "Importance": importance})
    .sort_values("Importance", ascending=False, kind="stable")
    .reset_index(drop=True)
)
# Lift the row display limit so the 80-row preview below is not truncated.
pd.set_option('display.max_rows', None)

df_importance.head(80)
