## Libs

In [3]:
## lib: defined functions and classes
import numpy as np
from typing import Union, List, Optional

def create_ordinal_labels(values: np.ndarray, 
                         n_classes: int = 3, 
                         thresholds: Optional[List[float]] = None) -> np.ndarray:
    """Bin continuous values into ordinal class labels ``0 .. n_classes - 1``.

    A value's label is the number of cut points it is strictly greater than,
    so a value exactly equal to a cut point falls into the lower class.

    Parameters
    ----------
    values : array-like
        Numeric values to bin (NumPy array, pandas Series, list, ...).
    n_classes : int, default 3
        Number of ordinal classes; must be at least 2.
    thresholds : list of float, optional
        Exactly ``n_classes - 1`` cut points. They may be given in any order;
        they are sorted before use. When omitted, evenly spaced percentiles of
        ``values`` are used (e.g. the 33.3rd and 66.7th for three classes).

    Returns
    -------
    np.ndarray
        Integer labels with the same shape as ``values``.

    Raises
    ------
    ValueError
        If ``n_classes < 2`` or the number of thresholds is not
        ``n_classes - 1``.
    """
    if n_classes < 2:
        raise ValueError("n_classes must be at least 2")

    # Coerce once so lists and Series behave like arrays in the comparisons below.
    data = np.asarray(values)

    if thresholds is not None:
        if len(thresholds) != n_classes - 1:
            raise ValueError(f"Expected {n_classes-1} thresholds for {n_classes} classes, "
                           f"but got {len(thresholds)}")
        # Ascending order is required: each pass below overwrites the label of
        # everything above its cut point, so a smaller cut point processed
        # later would wrongly promote values to a higher class.
        cut_points = np.sort(np.asarray(thresholds, dtype=float))
    else:
        # Interior percentiles only: the 0th and 100th would create empty classes.
        percentiles = np.linspace(0, 100, n_classes + 1)[1:-1]
        cut_points = np.percentile(data, percentiles)

    labels = np.zeros(data.shape, dtype=int)
    for label, cut_point in enumerate(cut_points, 1):
        labels[data > cut_point] = label

    return labels

def weighted_accuracy_score(y_true, y_pred):
    """Distance-aware accuracy for ordinal labels.

    Computes ``1 - sum(|y_true - y_pred|) / (span * n)`` where ``span`` is
    ``max(y_true) - min(y_true)`` and ``n`` is the number of samples, so a
    prediction that is off by the full label span counts as a complete miss
    and closer predictions are penalised proportionally.

    Parameters
    ----------
    y_true, y_pred : array-like of numeric labels
        Compared position by position.

    Returns
    -------
    float
        1.0 for perfect predictions. The value can drop below 0 if
        predictions fall outside the span of ``y_true``.

    Raises
    ------
    ValueError
        If ``y_true`` is empty.
    """
    truth = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    if truth.size == 0:
        raise ValueError("y_true must contain at least one label")

    differences = np.abs(truth - predicted)
    label_span = np.max(truth) - np.min(truth)

    if label_span == 0:
        # Only one class is present in y_true, so there is no distance scale
        # to normalise by (the plain formula would divide by zero). Count
        # every wrong prediction as a full miss instead.
        error = np.mean(differences != 0)
    else:
        error = np.sum(differences) / (label_span * len(truth))

    return float(1 - error)

import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, confusion_matrix, mean_absolute_error, accuracy_score
import pandas as pd


class OrdinalClassifier:
    """Cross-validated evaluation of a classifier on ordinal class labels.

    Parameters
    ----------
    model : estimator
        Handed unchanged to scikit-learn's cross-validation helper.
    X : array-like
        Feature matrix.
    y : array-like
        Ordinal class labels; also used to stratify the folds.
    folds : int, default 5
        Number of stratified cross-validation splits.
    """

    def __init__(self, model, X, y, folds=5):
        self.model = model
        self.X = X
        self.y = y
        self.folds = folds
        
    def evaluate(self):
        """Run stratified k-fold cross-validation and average each metric.

        Returns
        -------
        dict
            Mean score across folds under the keys ``'MAE'``, ``'accuracy'``
            and ``'weighted_acc'``.
        """
        # Imported here because the file's import cell only brings in
        # cross_val_score from sklearn.model_selection.
        from sklearn.model_selection import cross_validate

        cv = StratifiedKFold(n_splits=self.folds, shuffle=True, random_state=42)
        scoring = {
            'MAE': make_scorer(mean_absolute_error),
            'accuracy': make_scorer(accuracy_score),
            'weighted_acc': make_scorer(weighted_accuracy_score)
        }

        # One cross_validate call scores every metric on the same fitted
        # model per fold, instead of refitting the model once per metric.
        results = cross_validate(self.model, self.X, self.y, cv=cv, scoring=scoring)

        # cross_validate reports each scorer under the key "test_<name>".
        scores = {metric: results[f"test_{metric}"].mean() for metric in scoring}

        return scores

## Preparing the data

In [1]:
import pandas as pd
import numpy as np

# Read the tab-separated, preprocessed dataset and discard the ROI column
# right away (presumably because it is derived from revenue -- confirm).
df = pd.read_csv("../preprocessed.tsv", sep="\t").drop(columns=["ROI"])

In [2]:
# Clean-up in one pass: missing values become 0, every column whose name
# starts with "Topic" is removed, then the listed writer/director aggregates
# and the two extra features are dropped.
df = (
    df.fillna(0)
    .loc[:, lambda frame: ~frame.columns.str.startswith("Topic")]
    .drop(
        columns=[
            "writer_max_total_profit",
            "writer_max_total_gross",
            "writer_max_nb_movies",
            "director_max_total_profit",
            "director_max_total_gross",
            "director_max_nb_movies",
            "network_aa_heterogeneity",
            "weighted_genre_profitability",
        ]
    )
)

In [6]:
# Features: every column except the target.
X = df.drop(columns=['revenue'])

# Target: revenue cut at its 25th and 75th percentiles, giving three ordinal
# classes (0 = up to the 25th percentile, 1 = middle, 2 = above the 75th).
y = create_ordinal_labels(
    df['revenue'],
    n_classes=3,
    thresholds=np.percentile(df['revenue'], [25, 75]).tolist(),
)

In [7]:
## Scaling (if necessary)
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on the full feature matrix before the
# cross-validation that runs later in this notebook, so every fold is scaled
# with statistics that include its own held-out rows. Wrapping the scaler and
# the model in a Pipeline would avoid that -- confirm whether it matters here.
scaler = StandardScaler()
X = scaler.fit(X).transform(X)

## Model Training

In [5]:
# Model and hyperparameters
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


# Shared settings for the model zoo below.
folds = 5
n_estimators = 300


# Candidate classifiers, keyed by a short name.
models = {
    'rf': RandomForestClassifier(random_state=42, n_estimators = n_estimators),
    # No `objective` override: 'reg:squarederror' is a regression loss and
    # does not belong on a classifier, so XGBClassifier's own default
    # classification objective is used instead.
    'xgb': xgb.XGBClassifier(
        max_depth=6,
        learning_rate=0.1,
        n_estimators=200,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    ),
    'ada': AdaBoostClassifier(algorithm='SAMME', random_state=42),
    'logreg': LogisticRegression(max_iter=500),
    'svm': SVC(kernel='linear', C=0.05),
    'gbm': GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=0.1, max_depth=2, random_state=42),

    'knn': KNeighborsClassifier(n_neighbors=50)
}


In [9]:
# Evaluate the k-nearest-neighbours model; the fold count comes from the
# shared `folds` setting defined with the models rather than a repeated literal.
classifier = OrdinalClassifier(model=models['knn'], X=X, y=y, folds=folds)

# Cross-validate and display the mean of each metric
classifier.evaluate()

{'MAE': 0.33430440289304403,
 'accuracy': 0.6689819351398193,
 'weighted_acc': 0.832847798553478}