## Preparing the data

In [25]:
import pandas as pd
import numpy as np

df = pd.read_csv("../preprocessed.tsv", sep = "\t")
df = df.apply(lambda x: x.astype(int) if x.dtype == 'bool' else x)

df = df.drop(columns=["revenue"])
df = df.fillna(0)

In [16]:
 df = df.drop(columns=["Transmedia",'startYear',"max_genre_profitability","runtime"])

In [75]:
df = df.fillna(0)
df = df.drop(columns = ["director_max_total_profit", "director_max_avg_profit","director_max_avg_gross"])
df = df.drop(columns = ["writer_max_total_profit", "writer_max_avg_profit", "writer_max_avg_gross"])
df = df.drop(columns = ["actors_avg_total_profit","actors_avg_total_gross","actors_avg_nb_movies"])

In [26]:
df["ROI"] = np.log(df["ROI"]+ 100)

In [27]:
X = df.drop(columns=['ROI'])
XP = df.drop(columns=['ROI'])
y = df["ROI"]

In [28]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

lower_percentile = np.percentile(y, 0)
upper_percentile = np.percentile(y, 100)

y_filtered = y[(y >= lower_percentile) & (y <= upper_percentile)]

plt.figure(figsize=(10, 6))
sns.histplot(y_filtered, kde=False, color='blue', label='y', bins=100, stat='density')
plt.legend()
plt.xlabel('ROI')
plt.ylabel('Density')
plt.show()

## Using LASSO

In [11]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.metrics import r2_score

# Lasso model
lasso = Lasso(alpha=0.025)

# Perform 10-fold cross-validation
mse_scores = cross_val_score(lasso, X, y, cv=10, scoring='neg_mean_squared_error')

# Convert MSE to RMSE (note that scores are negative because of 'neg_mean_squared_error')
rmse_scores = np.sqrt(-mse_scores)

# Print RMSE for each fold and the average RMSE
print("RMSE for each fold:", rmse_scores)
print("Average RMSE:", rmse_scores.mean())

# Perform cross-validation for R² score
r2_scores = cross_val_score(lasso, X, y, cv=10, scoring='r2')

# Print R² for each fold and the average R²
print("R² for each fold:", r2_scores)
print("Average R²:", r2_scores.mean())


RMSE for each fold: [1.76572468 1.84051446 2.08437429 2.30937699 2.03521787 2.37093547
 2.10505059 2.01345    2.10433347 2.24371535]
Average RMSE: 2.087269318108322
R² for each fold: [0.14466612 0.13199132 0.14395745 0.15733319 0.17766902 0.13878945
 0.19307037 0.2290923  0.25655184 0.13960514]
Average R²: 0.17127262022036152


In [None]:
from sklearn.linear_model import Lasso

import pandas as pd
import numpy as np

# Fit the Lasso model
lasso = Lasso(alpha=0.025)
lasso.fit(X, y)

# Get feature importances (absolute value of coefficients)
feature_importance = lasso.coef_

# Create a DataFrame to display features and their importance
importance_df = pd.DataFrame({
    'Feature': XP.columns,
    'Importance': feature_importance
})


pd.set_option('display.max_rows', None)

# Sort by importance in descending order
importance_df = importance_df.sort_values(by='Importance')

selected_features = importance_df[importance_df['Importance'] != 0]['Feature'].tolist()

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
X_selected = XP[selected_features]


# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = X_selected.columns
vif_data["VIF"] = [variance_inflation_factor(X_selected.values, i) for i in range(X_selected.shape[1])]

# Display the VIF for each feature
print(vif_data)



In [None]:
importance_df = importance_df.reindex(importance_df["Importance"].abs().sort_values(ascending=False).index)
importance_df

## Using XGB

In [12]:
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error


# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.07, max_depth=3)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

y_pred_original = np.expm1(y_pred)
y_test_original = np.expm1(y_test)

# Evaluate
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse:.4f}")

RMSE: 1.7388


In [13]:
from sklearn.metrics import r2_score

# Compute R² on original scale
r2_original = r2_score(y_test, y_pred)
print(f"R² (original scale): {r2_original:.4f}")

R² (original scale): 0.3833


## Using SVR

In [14]:
from sklearn.svm import SVR
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error


# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost model
model = SVR(kernel='rbf', C=0.1, epsilon=0.01)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

y_pred_original = np.expm1(y_pred)
y_test_original = np.expm1(y_test)

# Evaluate
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse:.4f}")

from sklearn.metrics import r2_score

# Compute R² on original scale
r2_original = r2_score(y_test, y_pred)
print(f"R² (original scale): {r2_original:.4f}")

RMSE: 2.0327
R² (original scale): 0.1572
