In [29]:
import pandas as pd

# Load dataset
df = pd.read_csv("final_data.csv")

In [30]:
# Create Risk Feature
# Risk = 1 if Stress_Level, Anxiety_Score, or Depression_Score > 3

df['Risk'] = df.apply(lambda x: 1 if (x['Stress_Level'] > 3 or x['Depression_Score'] > 3 or x['Anxiety_Score'] > 3) else 0,axis=1)

In [31]:
# Check distribution of Risk
df['Risk'].value_counts(normalize=True)

Risk
1    0.608677
0    0.391323
Name: proportion, dtype: float64

In [32]:
from sklearn.model_selection import train_test_split

# Prepare input features (X) and targets (y)

X = df.drop(columns=['Stress_Level', 'Anxiety_Score', 'Depression_Score', 'Risk'])
y_stress = df['Stress_Level']
y_anxiety = df['Anxiety_Score']
y_depression = df['Depression_Score']

# Split dataset into train and test sets

X_train, X_test, y_stress_train, y_stress_test = train_test_split(X, y_stress, test_size=0.3, random_state=42)
_, _, y_anx_train, y_anx_test = train_test_split(X, y_anxiety, test_size=0.3, random_state=42)
_, _, y_dep_train, y_dep_test = train_test_split(X, y_depression, test_size=0.3, random_state=42)


In [33]:
from sklearn.preprocessing import StandardScaler

# Feature Scaling: Standardize numeric columns

cols = ['Age', 'CGPA', 'Semester_Credit_Load']
scaler = StandardScaler()

X_train.loc[:, cols] = scaler.fit_transform(X_train[cols])
X_test.loc[:, cols] = scaler.transform(X_test[cols])

 -7.82834384e-01  2.60093792e+00]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_train.loc[:, cols] = scaler.fit_transform(X_train[cols])
 -0.22338862]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_train.loc[:, cols] = scaler.fit_transform(X_train[cols])
 -1.04312456]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_test.loc[:, cols] = scaler.transform(X_test[cols])
  1.15113807]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_test.loc[:, cols] = scaler.transform(X_test[cols])


In [34]:
from sklearn.linear_model import LinearRegression

# Initialize Linear Regression Models

model_stress = LinearRegression()
model_anxiety = LinearRegression()
model_depression = LinearRegression()

# Fit models on training data

model_anxiety.fit(X_train, y_anx_train)
model_stress.fit(X_train, y_stress_train)
model_depression.fit(X_train, y_dep_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [35]:
# Make copies to avoid SettingWithCopyWarning
# Scale the full dataset for prediction

X_scaled = X.copy()
X_scaled.loc[:, cols] = scaler.transform(X_scaled[cols])

# Predict Stress, Anxiety, Depression on full dataset

df['Pred_Stress'] = model_stress.predict(X_scaled)
df['Pred_Anxiety'] = model_anxiety.predict(X_scaled)
df['Pred_Depression'] = model_depression.predict(X_scaled)

# Calculate predicted Risk based on predicted values

df['Pred_Risk'] = df.apply(
    lambda x: 1 if (x['Pred_Stress'] > 3 or x['Pred_Anxiety'] > 3 or x['Pred_Depression'] > 3) else 0,
    axis=1
)




 -0.26225403]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_scaled.loc[:, cols] = scaler.transform(X_scaled[cols])
 -1.13973974]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_scaled.loc[:, cols] = scaler.transform(X_scaled[cols])


In [36]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate model performance

print("Confusion Matrix:\n", confusion_matrix(df['Risk'], df['Pred_Risk']))
print("\nClassification Report:\n", classification_report(df['Risk'], df['Pred_Risk']))


print(df[['Stress_Level','Pred_Stress',
          'Anxiety_Score','Pred_Anxiety',
          'Depression_Score','Pred_Depression',
          'Risk','Pred_Risk']].head(10))


Confusion Matrix:
 [[1298 1444]
 [1239 3026]]

Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.47      0.49      2742
           1       0.68      0.71      0.69      4265

    accuracy                           0.62      7007
   macro avg       0.59      0.59      0.59      7007
weighted avg       0.61      0.62      0.61      7007

   Stress_Level  Pred_Stress  Anxiety_Score  Pred_Anxiety  Depression_Score  \
0             3     1.957911              2      2.174818                 3   
1             0     2.020343              0      2.095484                 3   
2             4     2.069469              3      2.027443                 0   
3             3     2.142194              4      1.968713                 3   
4             2     3.180889              3      2.184955                 4   
5             2     2.207263              5      3.256580                 2   
6             0     2.125957              3      

In [37]:
from sklearn.feature_selection import RFE

# Feature Selection using Recursive Feature Elimination (RFE)

rfe1 = RFE(model_stress, n_features_to_select=5)  # pick top 5 features for Stress
rfe1.fit(X_train, y_stress_train)
X_train_rfe1 = X_train.loc[:, rfe1.support_]
X_test_rfe1 = X_test.loc[:, rfe1.support_]

model_stress.fit(X_train_rfe1, y_stress_train)

rfe2 = RFE(model_anxiety, n_features_to_select=5)  # pick top 5 features for Anxiety
rfe2.fit(X_train, y_anx_train)
X_train_rfe2 = X_train.loc[:, rfe2.support_]
X_test_rfe2 = X_test.loc[:, rfe2.support_]

model_anxiety.fit(X_train_rfe2, y_anx_train)


rfe3 = RFE(model_depression, n_features_to_select=5)  # pick top 5 features for Depression
rfe3.fit(X_train, y_dep_train)
X_train_rfe3 = X_train.loc[:, rfe3.support_]
X_test_rfe3 = X_test.loc[:, rfe3.support_]

model_depression.fit(X_train_rfe3, y_dep_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [38]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Check multicollinearity using VIF(Variance Inflation Factor) for Stress Model

vif = pd.DataFrame()
vif['Features'] = X_train_rfe1.columns
vif['VIF'] = [variance_inflation_factor(X_train_rfe1.values, i) for i in range(X_train_rfe1.shape[1])]
vif['VIF'] = round(vif['VIF'],2)

vif = vif.sort_values(by = 'VIF', ascending = False)
vif

Unnamed: 0,Features,VIF
1,Residence_Type_On-Campus,1.34
4,Course_Medical,1.19
2,Course_Computer Science,1.09
3,Course_Engineering,1.08
0,Chronic_Illness,1.03


In [39]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Check multicollinearity using VIF(Variance Inflation Factor) for Anxiety Model

vif = pd.DataFrame()
vif['Features'] = X_train_rfe2.columns
vif['VIF'] = [variance_inflation_factor(X_train_rfe2.values, i) for i in range(X_train_rfe2.shape[1])]
vif['VIF'] = round(vif['VIF'],2)

vif = vif.sort_values(by = 'VIF', ascending = False)
vif

Unnamed: 0,Features,VIF
0,Course_Business,1.0
1,Course_Computer Science,1.0
2,Course_Engineering,1.0
3,Course_Law,1.0
4,Course_Others,1.0


In [40]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()

# Check multicollinearity using VIF(Variance Inflation Factor) for Depression Model

vif['Features'] = X_train_rfe3.columns
vif['VIF'] = [variance_inflation_factor(X_train_rfe3.values, i) for i in range(X_train_rfe3.shape[1])]
vif['VIF'] = round(vif['VIF'],2)

vif = vif.sort_values(by = 'VIF', ascending = False)
vif

Unnamed: 0,Features,VIF
0,Chronic_Illness,1.04
1,Course_Computer Science,1.01
2,Course_Engineering,1.01
3,Course_Law,1.01
4,Course_Medical,1.01


In [41]:
# Scale RFE-selected features for full dataset prediction

X_scaled1 = X.copy()
X_scaled1.loc[:, cols] = scaler.transform(X_scaled1[cols])
X_scaled_rfe1 = X_scaled1[X_train_rfe1.columns]

X_scaled2 = X.copy()
X_scaled2.loc[:, cols] = scaler.transform(X_scaled2[cols])
X_scaled_rfe2 = X_scaled2[X_train_rfe2.columns]

X_scaled3 = X.copy()
X_scaled3.loc[:, cols] = scaler.transform(X_scaled3[cols])
X_scaled_rfe3 = X_scaled3[X_train_rfe3.columns]

# Predict with RFE-selected features

df['Pred_Stress'] = model_stress.predict(X_scaled_rfe1)
df['Pred_Anxiety'] = model_anxiety.predict(X_scaled_rfe2)
df['Pred_Depression'] = model_depression.predict(X_scaled_rfe3)


df['Pred_Risk'] = df.apply(
    lambda x: 1 if (x['Pred_Stress'] > 3 or x['Pred_Anxiety'] > 3 or x['Pred_Depression'] > 3) else 0,
    axis=1
)




 -0.26225403]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_scaled1.loc[:, cols] = scaler.transform(X_scaled1[cols])
 -1.13973974]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_scaled1.loc[:, cols] = scaler.transform(X_scaled1[cols])
 -0.26225403]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_scaled2.loc[:, cols] = scaler.transform(X_scaled2[cols])
 -1.13973974]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_scaled2.loc[:, cols] = scaler.transform(X_scaled2[cols])
 -0.26225403]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_scaled3.loc[:, cols] = scaler.transform(X_scaled3[cols])
 -1.13973974]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_scaled3.loc[:, cols] = scaler.transform(X_scaled3[cols])


In [42]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate Linear Regression with RFE

print("Confusion Matrix:\n", confusion_matrix(df['Risk'], df['Pred_Risk']))
print("\nClassification Report:\n", classification_report(df['Risk'], df['Pred_Risk']))


print(df[['Stress_Level','Pred_Stress',
          'Anxiety_Score','Pred_Anxiety',
          'Depression_Score','Pred_Depression',
          'Risk','Pred_Risk']].head(10))


Confusion Matrix:
 [[1280 1462]
 [1220 3045]]

Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.47      0.49      2742
           1       0.68      0.71      0.69      4265

    accuracy                           0.62      7007
   macro avg       0.59      0.59      0.59      7007
weighted avg       0.61      0.62      0.61      7007

   Stress_Level  Pred_Stress  Anxiety_Score  Pred_Anxiety  Depression_Score  \
0             3     2.086013              2      2.113169                 3   
1             0     2.022808              0      2.052561                 3   
2             4     2.086013              3      2.021484                 0   
3             3     2.209339              4      2.021484                 3   
4             2     3.243163              3      2.121751                 4   
5             2     2.185961              5      3.224511                 2   
6             0     2.086013              3      

In [43]:
df['Risk'].value_counts(normalize=True)
df['Pred_Risk'].value_counts(normalize=True)

Pred_Risk
1    0.643214
0    0.356786
Name: proportion, dtype: float64

In [44]:
from sklearn.linear_model import Ridge, Lasso

# Ridge
ridge_stress = Ridge(alpha=1.0)
ridge_anxiety = Ridge(alpha=1.0)
ridge_depression = Ridge(alpha=1.0)

ridge_stress.fit(X_train_rfe1, y_stress_train)
ridge_anxiety.fit(X_train_rfe2, y_anx_train)
ridge_depression.fit(X_train_rfe3, y_dep_train)

# Predictions
df['Ridge_Stress'] = ridge_stress.predict(X_scaled_rfe1)
df['Ridge_Anxiety'] = ridge_anxiety.predict(X_scaled_rfe2)
df['Ridge_Depression'] = ridge_depression.predict(X_scaled_rfe3)

df['Ridge_Risk'] = df.apply(
    lambda x: 1 if (x['Ridge_Stress'] > 3 or x['Ridge_Anxiety'] > 3 or x['Ridge_Depression'] > 3) else 0,
    axis=1
)

print("Ridge Confusion Matrix:\n", confusion_matrix(df['Risk'], df['Ridge_Risk']))
print("\nRidge Classification Report:\n", classification_report(df['Risk'], df['Ridge_Risk']))


Ridge Confusion Matrix:
 [[1280 1462]
 [1220 3045]]

Ridge Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.47      0.49      2742
           1       0.68      0.71      0.69      4265

    accuracy                           0.62      7007
   macro avg       0.59      0.59      0.59      7007
weighted avg       0.61      0.62      0.61      7007



In [45]:
# Lasso
lasso_stress = Lasso(alpha=0.1)
lasso_anxiety = Lasso(alpha=0.1)
lasso_depression = Lasso(alpha=0.1)

lasso_stress.fit(X_train_rfe1, y_stress_train)
lasso_anxiety.fit(X_train_rfe2, y_anx_train)
lasso_depression.fit(X_train_rfe3, y_dep_train)

# Predictions
df['Lasso_Stress'] = lasso_stress.predict(X_scaled_rfe1)
df['Lasso_Anxiety'] = lasso_anxiety.predict(X_scaled_rfe2)
df['Lasso_Depression'] = lasso_depression.predict(X_scaled_rfe3)

df['Lasso_Risk'] = df.apply(
    lambda x: 1 if (x['Lasso_Stress'] > 3 or x['Lasso_Anxiety'] > 3 or x['Lasso_Depression'] > 3) else 0,
    axis=1
)

print("Lasso Confusion Matrix:\n", confusion_matrix(df['Risk'], df['Lasso_Risk']))
print("\nLasso Classification Report:\n", classification_report(df['Risk'], df['Lasso_Risk']))


Lasso Confusion Matrix:
 [[2742    0]
 [4265    0]]

Lasso Classification Report:
               precision    recall  f1-score   support

           0       0.39      1.00      0.56      2742
           1       0.00      0.00      0.00      4265

    accuracy                           0.39      7007
   macro avg       0.20      0.50      0.28      7007
weighted avg       0.15      0.39      0.22      7007



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
