In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# The dataset link: https://www.kaggle.com/datasets/globalmediadata/socialmediausage-sleepdata-sg

In [None]:
# CSV file name, resolved relative to the notebook's working directory.
DATA_PATH = 'SocialMediaUsage_SleepLatencyAnalysis_Singapore.csv'
df = pd.read_csv(DATA_PATH, encoding='utf-8')

# Getting to Know the Data

In [None]:
print(df.shape)

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
df.info()

In [None]:
df.columns

In [None]:
print(df.isnull().sum())

In [None]:
# Draw one interactive box plot per numeric column to inspect spread and outliers.
numeric_columns = df.select_dtypes(include=['number']).columns
for col_name in numeric_columns:
    px.box(df, x=col_name, title=f'Box plot for {col_name}').show()


In [None]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
df_numeric = df.select_dtypes(include=[np.number])
corr_matrix = df_numeric.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()


# Feature selection & pre-processing

In [None]:
# Numeric predictors; these are the columns that get standardized later on.
num_cols = [
    "Average Daily Social Media Use Time (minutes)",
    "Frequency of Social Media Checking (number of times per day)",
    "Pre-Sleep Social Media Use Duration (minutes)",
]
# Categorical predictors, expanded into indicator columns below.
cat_cols = [
    'Dominant Social Media Platform',
    'Type of Social Media Content Consumed',
]
selected_cols = num_cols + cat_cols
target_col = 'Stress Level Rating'

# One-hot encode the categorical columns; drop_first=True drops one level per
# category so the indicators are not perfectly collinear.
X = pd.get_dummies(df[selected_cols], drop_first=True)
# Ensure float dtype so scaling writes back safely
X = X.astype({col: float for col in num_cols})
y = df[target_col]


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.20)

In [None]:
# Standardize the numeric features. The scaler is fit on the training split
# only and then reused on the test split, so no test-set statistics are used.
scaler = StandardScaler()
# Take explicit copies before writing: the frames returned by train_test_split
# are selections of X, and assigning into a selection can raise
# SettingWithCopyWarning depending on the pandas/sklearn versions in use.
X_train = X_train.copy()
X_test = X_test.copy()
# Only the columns in num_cols are overwritten; all other columns are left as-is.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Helper function for model evaluation

In [None]:
def evaluate(y_true, y_pred, name='Model'):
    """Print and return regression error metrics for one model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Model predictions, aligned element-wise with ``y_true``.
    name : str, default 'Model'
        Label printed at the start of the summary line.

    Returns
    -------
    dict
        ``{'MAE': ..., 'RMSE': ..., 'R2': ...}`` holding the three scores.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = math.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    # Pad the label to 18 characters: the longest label used in this notebook
    # ('Linear Regression') is 17 long, so a width of 15 left the metric
    # columns misaligned between models.
    print(f'{name:<18s}  MAE: {mae:.3f}   RMSE: {rmse:.3f}   R²: {r2:.3f}')
    return {'MAE': mae, 'RMSE': rmse, 'R2': r2}

# Model 1: Linear Regression

In [None]:
# Least-squares baseline: fit on the training split, score on the held-out split.
lin = LinearRegression().fit(X_train, y_train)
y_pred_lin = lin.predict(X_test)
lin_metrics = evaluate(y_test, y_pred_lin, name='Linear Regression')

In [None]:
# Actual vs. predicted scatter for the linear model.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_lin, alpha=0.5)
# Dashed red diagonal marks perfect predictions (predicted == actual).
y_lo, y_hi = y_test.min(), y_test.max()
ax.plot([y_lo, y_hi], [y_lo, y_hi], 'r--')
ax.set_xlabel('Actual Stress Level')
ax.set_ylabel('Predicted Stress Level')
ax.set_title('Linear Regression: Actual vs. Predicted Stress Levels')
ax.grid(True)
fig.tight_layout()
plt.show()

# Fitted coefficients, largest first, labelled by feature name.
coef = pd.Series(lin.coef_, index=X.columns).sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(12, 8))
coef.plot(kind='bar', ax=ax, rot=90)
ax.set_title('Linear Regression Coefficients')
ax.set_xlabel('Features')
ax.set_ylabel('Coefficient Value')
ax.grid(axis='y')
fig.tight_layout()
plt.show()

# Model 2: Ridge Regression

In [None]:
# Ridge with built-in cross-validation: 25 log-spaced candidate alphas
# between 1e-3 and 1e3, selected with 5-fold CV on the training split.
alpha_grid = np.logspace(-3, 3, 25)
ridge = RidgeCV(alphas=alpha_grid, cv=5).fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)
ridge_metrics = evaluate(y_test, y_pred_ridge, name='Ridge Regression')
print(f"Ridge alpha used: {ridge.alpha_:.4f}")

In [None]:
# Actual vs. predicted scatter for the ridge model.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_ridge, alpha=0.5)
# Dashed red diagonal marks perfect predictions (predicted == actual).
y_lo, y_hi = y_test.min(), y_test.max()
ax.plot([y_lo, y_hi], [y_lo, y_hi], 'r--')
ax.set_xlabel('Actual Stress Level')
ax.set_ylabel('Predicted Stress Level')
ax.set_title('Ridge Regression: Actual vs. Predicted Stress Levels')
ax.grid(True)
fig.tight_layout()
plt.show()

# Fitted ridge coefficients, largest first, labelled by feature name.
ridge_coef = pd.Series(ridge.coef_, index=X.columns).sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(12, 8))
ridge_coef.plot(kind='bar', ax=ax, rot=90)
ax.set_title('Ridge Regression Coefficients')
ax.set_xlabel('Features')
ax.set_ylabel('Coefficient Value')
ax.grid(axis='y')
fig.tight_layout()
plt.show()

# Model 3: Random Forest

In [None]:
# Random forest with 300 trees; the fixed seed makes the fitted ensemble reproducible.
rf = RandomForestRegressor(random_state=42, n_estimators=300).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
rf_metrics = evaluate(y_test, y_pred_rf, name='Random Forest')

In [None]:
# Actual vs. predicted scatter for the random forest.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_rf, alpha=0.5)
# Dashed red diagonal marks perfect predictions (predicted == actual).
y_lo, y_hi = y_test.min(), y_test.max()
ax.plot([y_lo, y_hi], [y_lo, y_hi], 'r--')
ax.set_xlabel('Actual Stress Level')
ax.set_ylabel('Predicted Stress Level')
ax.set_title('Random Forest: Actual vs. Predicted Stress Levels')
ax.grid(True)
fig.tight_layout()
plt.show()

# Feature importances reported by the fitted forest, largest first.
importances = rf.feature_importances_
feat_imp = pd.Series(importances, index=X.columns).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(12, 8))
feat_imp.plot(kind='bar', ax=ax, rot=90)
ax.set_title('Random Forest Feature Importances')
ax.set_xlabel('Features')
ax.set_ylabel('Importance')
ax.grid(axis='y')
fig.tight_layout()
plt.show()

# Model Comparison

In [None]:
# Create a dataframe to compare models
models = ['Linear Regression', 'Ridge Regression', 'Random Forest']
metrics = ['MAE', 'RMSE', 'R2']
# Build the table in one step from the metric dicts. Starting from an empty
# DataFrame(index=..., columns=...) and filling rows through .loc leaves every
# column with dtype object, which numeric helpers do not treat as numbers.
# The dicts are listed in the same order as `models`.
comparison = pd.DataFrame(
    [lin_metrics, ridge_metrics, rf_metrics],
    index=models,
    columns=metrics,
)
comparison

In [None]:
# Visualize model comparison: one bar chart per metric, side by side.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

# (column in `comparison`, panel title, y-axis label)
panels = [
    ('MAE', 'Mean Absolute Error', 'MAE'),
    ('RMSE', 'Root Mean Squared Error', 'RMSE'),
    ('R2', 'R² Score', 'R²'),
]
for ax, (metric, title, ylabel) in zip(axs, panels):
    # Cast to float in case the comparison table holds object-dtype values.
    ax.bar(models, comparison[metric].astype(float))
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    # Rotate the existing tick labels instead of calling set_xticklabels(),
    # which swaps in a fixed formatter without fixing the tick positions and
    # makes matplotlib warn about FixedFormatter/FixedLocator.
    ax.tick_params(axis='x', labelrotation=45)
    ax.grid(axis='y')

fig.tight_layout()
plt.show()