In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# The dataset link: https://www.kaggle.com/datasets/globalmediadata/socialmediausage-sleepdata-sg

In [None]:
# Load the survey data. The path is relative, so the CSV is resolved against
# the kernel's current working directory.
df = pd.read_csv(
    'SocialMediaUsage_SleepLatencyAnalysis_Singapore.csv',
    encoding='utf-8',
)

# Getting to Know the Data

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
print(df.shape)

In [None]:
# Preview the first rows (head() shows 5 by default).
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the frame.
df.describe()

In [None]:
# info() writes its report (dtypes, non-null counts, memory usage) to stdout
# itself and returns None, so it is called directly: wrapping it in print()
# would append a stray "None" line to the cell output.
df.info()

In [None]:
# List the column labels; the feature-selection cell refers to them by exact name.
df.columns

In [None]:
# Number of missing values in each column (isna is the canonical spelling of isnull).
print(df.isna().sum())

In [None]:
# Draw one interactive Plotly box plot per numeric column.
# Iterating a DataFrame yields its column labels.
for column in df.select_dtypes(include='number'):
    fig = px.box(df, x=column, title=f'Box plot for {column}')
    fig.show()


In [None]:
# Pairwise correlations between the numeric columns, shown as an annotated heatmap.
df_numeric = df.select_dtypes(include='number')
corr_matrix = df_numeric.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()


# Feature selection & pre‑processing

In [None]:
# Column being predicted.
target_col = 'Stress Level Rating'

# Numeric predictors; these are the columns standardised later on.
num_cols = [
    'Average Daily Social Media Use Time (minutes)',
    'Frequency of Social Media Checking (number of times per day)',
    'Pre-Sleep Social Media Use Duration (minutes)',
]

# Full predictor set: the numeric columns followed by the categorical ones.
selected_cols = num_cols + [
    'Dominant Social Media Platform',
    'Type of Social Media Content Consumed',
]

y = df[target_col]

# One-hot encode the predictors, dropping the first level of each encoded
# column, then cast the numeric columns to float so that scaled values can be
# written back into them safely.
X = pd.get_dummies(df[selected_cols], drop_first=True).astype(
    {col: float for col in num_cols}
)


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# The frames returned by train_test_split are derived from X; on pandas
# versions without copy-on-write, writing into them with .loc can raise
# SettingWithCopyWarning. Take explicit copies so the in-place assignment
# below is unambiguous.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardise the numeric predictors. The scaler is fitted on the training
# rows only and then applied to the test rows, so no test-set statistics leak
# into the fit.
scaler = StandardScaler()
X_train.loc[:, num_cols] = scaler.fit_transform(X_train[num_cols])
X_test.loc[:, num_cols] = scaler.transform(X_test[num_cols])

# Build models

In [None]:
# Plain linear regression.
lin = LinearRegression()

# Ridge regression with built-in cross-validation: searches for the best α on
# a log-spaced grid (25 values from 1e-3 to 1e3) and picks the one that
# minimises the 5-fold cross-validated error.
alpha_grid = np.logspace(-3, 3, 25)
ridge = RidgeCV(alphas=alpha_grid, cv=5)

# Random forest of 300 trees, seeded for reproducibility.
rf = RandomForestRegressor(n_estimators=300, random_state=42)


In [None]:
# Fit all three models on the same training split.
lin.fit(X_train, y_train)
ridge.fit(X_train, y_train)
# Kept as the last expression of the cell, so its return value is what the
# notebook displays.
rf.fit(X_train, y_train)


In [None]:
# Predict on the held-out rows with each fitted model, in the order
# linear -> ridge -> random forest.
y_pred_lin, y_pred_ridge, y_pred_rf = (
    fitted_model.predict(X_test) for fitted_model in (lin, ridge, rf)
)


In [None]:
def evaluate(y_true, y_pred, name='Model'):
    """Print and return MAE, RMSE and R² for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.
    name : str, default 'Model'
        Label printed at the start of the report line.

    Returns
    -------
    dict
        The three scores under the keys ``'MAE'``, ``'RMSE'`` and ``'R2'``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = math.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    # Pad the label to 18 characters: the labels passed in this notebook run up
    # to 17 characters ('Linear Regression'), and a narrower pad lets the longer
    # labels push the metric columns out of alignment.
    print(f'{name:<18s}  MAE: {mae:.3f}   RMSE: {rmse:.3f}   R²: {r2:.3f}')
    return {'MAE': mae, 'RMSE': rmse, 'R2': r2}

# Report the held-out scores of every model, one line per model.
print('\n--- Test‑set performance ---')
for model_label, model_preds in (
    ('Linear Regression', y_pred_lin),
    ('Ridge Regression', y_pred_ridge),
    ('Random Forest', y_pred_rf),
):
    evaluate(y_test, model_preds, model_label)

# Regularisation strength that RidgeCV selected during fitting.
print(f"\nRidge alpha used: {ridge.alpha_:.4f}")

In [None]:
# Pair each forest importance with its feature name. The ascending sort puts
# the most important feature at the top of the horizontal bar chart.
importances = rf.feature_importances_
feat_imp = pd.Series(importances, index=X.columns).sort_values()

importance_fig, importance_ax = plt.subplots(figsize=(8, 5))
importance_ax.barh(feat_imp.index, feat_imp.values)
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Random‑Forest Feature Importances')
importance_fig.tight_layout()
plt.show()
