## Preparing the data

In [33]:
import pandas as pd
import numpy as np

df = pd.read_csv("../preprocessed.tsv", sep = "\t")
df = df.apply(lambda x: x.astype(int) if x.dtype == 'bool' else x)
df = df.drop(columns=["ROI"])
df = df.fillna(0)

In [34]:
df = df.fillna(0)
df = df.drop(columns=["Transmedia",'startYear',"max_genre_profitability","runtime"])
df = df.drop(columns = ["director_max_total_profit", "director_max_avg_profit","director_max_avg_gross"])
df = df.drop(columns = ["writer_max_total_profit", "writer_max_avg_profit", "writer_max_avg_gross"])
df = df.drop(columns = ["actors_avg_total_profit","actors_avg_total_gross","actors_avg_nb_movies"])

In [35]:
df["revenue"] = np.log(df["revenue"]+ 1)

In [36]:
X = df.drop(columns=['revenue'])
XP = df.drop(columns=['revenue'])
y = df["revenue"]

In [37]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 6))
sns.histplot(y, kde=False, color='blue', label='y', bins=100, stat='density')
plt.legend()
plt.xlabel('revenue')
plt.ylabel('Density')
plt.show()


## Using Lasso

In [38]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.metrics import r2_score

# Lasso model
lasso = Lasso(alpha=0.025)

# Perform 10-fold cross-validation
mse_scores = cross_val_score(lasso, X, y, cv=10, scoring='neg_mean_squared_error')

# Convert MSE to RMSE (note that scores are negative because of 'neg_mean_squared_error')
rmse_scores = np.sqrt(-mse_scores)

# Print RMSE for each fold and the average RMSE
print("RMSE for each fold:", rmse_scores)
print("Average RMSE:", rmse_scores.mean())

# Perform cross-validation for R² score
r2_scores = cross_val_score(lasso, X, y, cv=10, scoring='r2')

# Print R² for each fold and the average R²
print("R² for each fold:", r2_scores)
print("Average R²:", r2_scores.mean())


RMSE for each fold: [2.1126736  2.18979022 2.19112814 2.54085005 2.26031787 2.33508586
 2.35680768 2.08856658 2.24689165 2.36000387]
Average RMSE: 2.268211551469097
R² for each fold: [0.37920716 0.45287384 0.47834684 0.44750961 0.49649989 0.54886831
 0.45726379 0.56331369 0.55883812 0.54034999]
Average R²: 0.4923071245554199


In [39]:
from sklearn.linear_model import Lasso

import pandas as pd
import numpy as np

# Fit the Lasso model
lasso = Lasso(alpha=0.025)
lasso.fit(X, y)

# Get feature importances (absolute value of coefficients)
feature_importance = lasso.coef_

# Create a DataFrame to display features and their importance
importance_df = pd.DataFrame({
    'Feature': XP.columns,
    'Importance': feature_importance
})


pd.set_option('display.max_rows', None)

# Sort by importance in descending order
importance_df = importance_df.sort_values(by='Importance')

selected_features = importance_df[importance_df['Importance'] != 0]['Feature'].tolist()

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
X_selected = XP[selected_features]


# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = X_selected.columns
vif_data["VIF"] = [variance_inflation_factor(X_selected.values, i) for i in range(X_selected.shape[1])]



In [None]:
importance_df = importance_df.reindex(importance_df["Importance"].abs().sort_values(ascending=False).index)
importance_df

## Using XGB

In [29]:
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error


# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.07, max_depth=3)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

y_pred_original = np.expm1(y_pred)
y_test_original = np.expm1(y_test)

# Evaluate
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse:.4f}")

RMSE: 1.6897


In [30]:
from sklearn.metrics import r2_score

# Compute R² on original scale
r2_original = r2_score(y_test, y_pred)
print(f"R² (original scale): {r2_original:.4f}")

R² (original scale): 0.6975


## Using SVR

In [32]:
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = SVR(kernel='rbf', C=0.1, epsilon=0.01)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse:.4f}")
r2 = r2_score(y_test, y_pred)
print(f"R² (original scale): {r2:.4f}")

RMSE: 2.2788
R² (original scale): 0.4498
