In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the preprocessed Spotify dataset (path is relative to the working directory).
df = pd.read_csv('preprocessed_popular_spotify_songs.csv')

# Predictor columns fed to the model.
feature_columns = ['in_spotify_playlists', 'in_apple_charts', 'in_shazam_charts', 'danceability_%', 'bpm_normalized', 'valence_%', 'energy_%']
features_unscaled = df[feature_columns]

# Log-transform the target variable; metrics computed on it are on the log1p scale.
target = np.log1p(df['streams'])

# Split BEFORE scaling: fitting the scaler on all rows would let the validation
# rows influence the mean/std used to standardize the training data (leakage).
# NOTE(review): an unregularized linear fit is expected to be nearly unaffected
# by this, but any scale-sensitive model reusing these splits is.
X_train_unscaled, X_val_unscaled, y_train, y_val = train_test_split(
    features_unscaled, target, test_size=0.2, random_state=42
)

# Normalize the features using statistics learned from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_unscaled)
X_val = scaler.transform(X_val_unscaled)

# Keep `features` as a standardized array of every row, as before, in case other
# cells read it; it is now scaled with the training-split statistics.
features = scaler.transform(features_unscaled)

# Fit an ordinary least-squares model on the training split
# (fit() returns the estimator, so construction and fitting can be chained).
linear_reg = LinearRegression().fit(X_train, y_train)

# Predict for the training split first, then the validation split.
y_train_pred, y_val_pred = (linear_reg.predict(split) for split in (X_train, X_val))

# Score each split with the same three metrics, in the order MSE, MAE, R².
regression_metrics = (mean_squared_error, mean_absolute_error, r2_score)
train_mse, train_mae, train_r2 = [score(y_train, y_train_pred) for score in regression_metrics]
val_mse, val_mae, val_r2 = [score(y_val, y_val_pred) for score in regression_metrics]

# Report: one print call per split; the trailing "\n" on the last item leaves
# a blank line after each section.
print(
    "Linear Regression Performance on Training Set",
    f"Mean Squared Error: {train_mse:.4f}",
    f"Mean Absolute Error: {train_mae:.4f}",
    f"R² Score: {train_r2:.4f}\n",
    sep="\n",
)

print(
    "Linear Regression Performance on Validation Set",
    f"Mean Squared Error: {val_mse:.4f}",
    f"Mean Absolute Error: {val_mae:.4f}",
    f"R² Score: {val_r2:.4f}\n",
    sep="\n",
)

# Apply expm1 (the inverse of log1p) to the targets and predictions so the
# errors below are on the exponentiated scale rather than the log scale.
y_train_exp, y_train_pred_exp = np.expm1(y_train), np.expm1(y_train_pred)
y_val_exp, y_val_pred_exp = np.expm1(y_val), np.expm1(y_val_pred)

# Error metrics on the back-transformed values, training split then validation split.
train_mse_exp, train_mae_exp = (
    mean_squared_error(y_train_exp, y_train_pred_exp),
    mean_absolute_error(y_train_exp, y_train_pred_exp),
)
val_mse_exp, val_mae_exp = (
    mean_squared_error(y_val_exp, y_val_pred_exp),
    mean_absolute_error(y_val_exp, y_val_pred_exp),
)

# Report: one print call per split; the trailing "\n" on the last item leaves
# a blank line after each section.
print(
    "Linear Regression Performance on Training Set (Original Scale)",
    f"Mean Squared Error: {train_mse_exp:.4f}",
    f"Mean Absolute Error: {train_mae_exp:.4f}\n",
    sep="\n",
)

print(
    "Linear Regression Performance on Validation Set (Original Scale)",
    f"Mean Squared Error: {val_mse_exp:.4f}",
    f"Mean Absolute Error: {val_mae_exp:.4f}\n",
    sep="\n",
)

Linear Regression Performance on Training Set
Mean Squared Error: 0.8128
Mean Absolute Error: 0.6372
R² Score: 0.4139

Linear Regression Performance on Validation Set
Mean Squared Error: 0.5673
Mean Absolute Error: 0.6206
R² Score: 0.4380

Linear Regression Performance on Training Set (Original Scale)
Mean Squared Error: 1785550082843348992.0000
Mean Absolute Error: 389964316.8549

Linear Regression Performance on Validation Set (Original Scale)
Mean Squared Error: 375565324874784576.0000
Mean Absolute Error: 291540023.1752

