In [None]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# The dataset link: https://www.kaggle.com/datasets/globalmediadata/socialmediausage-sleepdata-sg

In [None]:
# Load the dataset from a CSV file expected in the notebook's working directory.
df = pd.read_csv(
    'SocialMediaUsage_SleepLatencyAnalysis_Singapore.csv',
    encoding='utf-8',
)

# Knowing The Data


In [None]:
# Print the dataset dimensions as a (rows, columns) tuple.
print(df.shape)

In [None]:
# Preview the first rows of the data (the bare expression is rendered as the cell output).
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numerical columns.
df.describe()

In [None]:
# Show each column's dtype and non-null count.
# DataFrame.info() prints its report itself and returns None, so it is called
# bare: wrapping it in print() would append a stray "None" line to the output.
df.info()

In [None]:
# List the column names (needed verbatim when selecting features below).
df.columns

In [None]:
# Count the missing values in every column (isna is the alias of isnull).
print(df.isna().sum())

In [None]:
# Draw one interactive box plot per numeric column so outliers can be spotted visually.
numeric_columns = df.select_dtypes(include=['number']).columns
for col_name in numeric_columns:
    px.box(df, x=col_name, title=f'Box plot for {col_name}').show()

In [None]:
# Correlation heatmap restricted to the numeric columns.
df_numeric = df.select_dtypes(include=[np.number])
corr_matrix = df_numeric.corr()

# annot=True writes each correlation coefficient inside its cell.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Response variable the models will try to predict.
target_col = "Stress Level Rating"

# Predictor columns: three usage measures followed by two descriptive
# columns (platform and content type).
selected_cols = [
    "Average Daily Social Media Use Time (minutes)",
    "Frequency of Social Media Checking (number of times per day)",
    "Pre-Sleep Social Media Use Duration (minutes)",
    "Dominant Social Media Platform",
    "Type of Social Media Content Consumed",
]

# Feature matrix and target vector, taken straight from the loaded frame.
X, y = df[selected_cols], df[target_col]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible. No stratification is applied (the default).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Columns that are standardised by StandardScaler before modelling.
numeric_features = [
    "Average Daily Social Media Use Time (minutes)",
    "Frequency of Social Media Checking (number of times per day)",
    "Pre-Sleep Social Media Use Duration (minutes)",
]
# Columns that are expanded into one-hot indicator columns.
categorical_features = [
    "Dominant Social Media Platform",
    "Type of Social Media Content Consumed",
]

# drop="first" removes one indicator per categorical column so the encoded
# columns are not perfectly collinear.
# handle_unknown="ignore" keeps predict() from raising when a category shows
# up that was absent from the training split (possible for rare categories
# after a random split); such rows are encoded as all zeros, i.e. like the
# dropped category, and scikit-learn emits a warning when that happens.
preproc = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        (
            "cat",
            OneHotEncoder(drop="first", handle_unknown="ignore"),
            categorical_features,
        ),
    ]
)

In [None]:
# Every pipeline receives its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place, so handing the very same `preproc`
# object to all three pipelines would make them share one transformer whose
# fitted state is overwritten by whichever pipeline was fitted last.

# Plain least-squares baseline.
lin_pipe = Pipeline(
    steps=[("preproc", clone(preproc)), ("model", LinearRegression())]
)

# Ridge with internal CV over a log-spaced grid of α
ridge_alphas = np.logspace(-3, 3, 25)        # 0.001 → 1000
ridge_pipe = Pipeline(
    steps=[
        ("preproc", clone(preproc)),
        ("model", RidgeCV(alphas=ridge_alphas, cv=5))
    ]
)

# Random forest with a fixed seed for reproducibility; n_jobs=-1 uses all cores.
rf_pipe = Pipeline(
    steps=[
        ("preproc", clone(preproc)),
        ("model", RandomForestRegressor(
            n_estimators=300,
            random_state=42,
            n_jobs=-1,
        )),
    ]
)

In [None]:
# Fit the three pipelines on the training split, in the same order as before.
for pipe in (lin_pipe, ridge_pipe, rf_pipe):
    pipe.fit(X_train, y_train)

# Keep the last fitted pipeline as the cell's displayed output.
rf_pipe

In [None]:
# Score the held-out test split with each fitted pipeline.
y_pred_lin, y_pred_ridge, y_pred_rf = (
    pipe.predict(X_test) for pipe in (lin_pipe, ridge_pipe, rf_pipe)
)

In [None]:
def evaluate(y_true, y_pred, name="Model"):
    """Print and return regression metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    name : str, default "Model"
        Label printed at the start of the report line.

    Returns
    -------
    dict
        Keys ``"MAE"``, ``"RMSE"`` and ``"R2"`` mapped to the metric values.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    mse = mean_squared_error(y_true, y_pred)
    rmse = math.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    print(f"{name:16s}  MAE: {mae:.3f}   RMSE: {rmse:.3f}   R²: {r2:.3f}")
    return dict(MAE=mae, RMSE=rmse, R2=r2)

# Report MAE / RMSE / R² on the test split for each model, one line per model.
print("\n--- Test-set performance ---")
lin_metrics, ridge_metrics, rf_metrics = (
    evaluate(y_test, predictions, label)
    for label, predictions in (
        ("Linear Regression", y_pred_lin),
        ("Ridge Regression", y_pred_ridge),
        ("Random-Forest", y_pred_rf),
    )
)

# Regularisation strength selected by RidgeCV's internal cross-validation.
best_alpha = ridge_pipe.named_steps['model'].alpha_
print(f"\nBest α chosen by RidgeCV: {best_alpha:.4f}")

In [None]:
# Random-Forest feature importances
# Rebuild the post-preprocessing column names: the numeric columns come first,
# followed by the one-hot columns, matching the transformer order in the
# ColumnTransformer.
rf_preproc = rf_pipe.named_steps["preproc"]
ohe = rf_preproc.named_transformers_["cat"]
feature_names = numeric_features + list(ohe.get_feature_names_out(categorical_features))

importances = rf_pipe.named_steps["model"].feature_importances_
# Sort ascending so the most important feature ends up at the top of the chart.
imp_series = pd.Series(importances, index=feature_names).sort_values()

fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(imp_series.index, imp_series.values)
ax.set_xlabel("Importance")
ax.set_title("Random-Forest Feature Importances")
fig.tight_layout()
plt.show()