In [None]:
#Importing Libraries
import re
import warnings

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import plotly.express as px
import scipy.stats as stats
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas/seaborn;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Apply seaborn's "whitegrid" theme, then render matplotlib figures inline.
sns.set(style="whitegrid")
%matplotlib inline

# Read the dataset from the Kaggle input directory and preview its first rows.
df = pd.read_csv("/kaggle/input/depression/student_depression.csv")
df.head()


In [None]:
# Describe the dataset: summary statistics from DataFrame.describe(),
# shown as the cell output.
df.describe()

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape

In [None]:
# Data cleaning: normalise column dtypes.
# The target column is stored as plain integers.
df['Depression'] = df['Depression'].astype(int)

# Columns that should be held as pandas categoricals.
cat_cols = [
    'Gender',
    'City',
    'Profession',
    'Degree',
    'Have you ever had suicidal thoughts ?',
    'Family History of Mental Illness',
]
df = df.astype({name: 'category' for name in cat_cols})

# Inspect the distinct values of two columns to decide on a cleaning strategy.
sleep_values = df['Sleep Duration'].unique()
stress_values = df['Financial Stress'].unique()
print("Unique values in 'Sleep Duration':", sleep_values)
print("Unique values in 'Financial Stress':", stress_values)

In [None]:
# Define a function to extract numeric hours from Sleep Duration column
def extract_hours(s):
    """Convert a sleep-duration label into a number of hours.

    Parameters
    ----------
    s : any
        Raw cell value; it is converted with ``str()`` before parsing.

    Returns
    -------
    float
        * the midpoint when the text contains a range such as ``"5-6 hours"``
          or ``"5 to 6"`` (-> 5.5), so that a range is not collapsed onto its
          lower bound;
        * the first number found otherwise (``"Less than 5 hours"`` -> 5.0);
          qualifiers such as "less than" / "more than" are ignored;
        * ``np.nan`` when the text contains no number at all.
    """
    text = str(s)
    number = r"\d+(?:\.\d+)?"

    # A range is two numbers joined by a hyphen, an en dash or the word "to".
    span = re.search(rf"({number})\s*(?:-|–|to)\s*({number})", text)
    if span:
        low, high = float(span.group(1)), float(span.group(2))
        return (low + high) / 2

    single = re.search(number, text)
    return float(single.group(0)) if single else np.nan

# Replace the Sleep Duration values with the hours parsed by extract_hours.
sleep_hours = df['Sleep Duration'].apply(extract_hours)
df['Sleep Duration'] = sleep_hours

# Convert Financial Stress to categorical if it represents levels (e.g., Low, Medium, High)
df = df.astype({'Financial Stress': 'category'})

# Verify changes
cleaned_preview = df[['Sleep Duration', 'Financial Stress']].head()
print(cleaned_preview)

In [None]:
# Checking missing data: number of null entries in each column.
df.isna().sum()

In [None]:
# Fill missing values in the listed numeric columns with the column median.
for col in ['Sleep Duration']:
    if df[col].isnull().any():
        # Assign the filled Series back to the frame. Calling
        # fillna(..., inplace=True) on the df[col] selection is chained
        # in-place mutation: pandas 2.x deprecates it and, with Copy-on-Write
        # enabled, it leaves df unchanged.
        df[col] = df[col].fillna(df[col].median())

In [None]:
# Data visualization: number of rows in each Depression class.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='Depression', data=df, palette="viridis", ax=ax)
ax.set(
    title="Distribution of Depression among Students",
    xlabel="Depression (0 = No, 1 = Yes)",
    ylabel="Count",
)
plt.show()

In [None]:
# Categorical variables: Depression counts split by Gender.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='Gender', hue='Depression', data=df, palette="Set2", ax=ax)
ax.set(
    title="Depression Distribution by Gender",
    xlabel="Gender",
    ylabel="Count",
)
ax.legend(title="Depression")
plt.show()


In [None]:
# Correlation: pairwise correlation between the numeric columns, as a heatmap.
num_cols = [
    'Age',
    'Academic Pressure',
    'Work Pressure',
    'CGPA',
    'Study Satisfaction',
    'Job Satisfaction',
    'Sleep Duration',
    'Work/Study Hours',
]
corr_matrix = df[num_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Analysis")
plt.show()

In [None]:
# Feature engineering: Total Pressure is the sum of academic and work pressure.
df['Total Pressure'] = df['Academic Pressure'].add(df['Work Pressure'])

# Compare the new feature across the two Depression classes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x='Depression', y='Total Pressure', data=df, palette="coolwarm", ax=ax)
ax.set(
    title="Total Pressure by Depression Status",
    xlabel="Depression (0 = No, 1 = Yes)",
    ylabel="Total Pressure",
)
plt.show()

In [None]:
# Encoding categorical variables
cat_features = [
    'Gender',
    'City',
    'Profession',
    'Degree',
    'Have you ever had suicidal thoughts ?',
    'Family History of Mental Illness',
    'Financial Stress',
]

# One-hot encode the listed columns; drop_first=True omits the first level of
# each one. Note: df_encoded is reassigned in the "Machine Learning" cell
# further down, so this frame only feeds the preview below.
df_encoded = pd.get_dummies(data=df, columns=cat_features, drop_first=True)

df_encoded.head()

In [None]:
# Machine learning
# Remove the identifier, the target and the listed categorical columns from a
# copy of the original dataframe.
drop_cols = [
    'id',
    'Depression',
    'Have you ever had suicidal thoughts ?',
    'Family History of Mental Illness',
    'Gender',
    'City',
    'Profession',
    'Degree',
    'Financial Stress',
]
df_clean = df.drop(labels=drop_cols, axis="columns")

# With no `columns` argument, get_dummies encodes whatever object/string/
# category columns remain and leaves the other columns as they are.
df_encoded = pd.get_dummies(data=df_clean, drop_first=True)

In [None]:
# Print columns after encoding for debugging purposes
print("Columns after encoding:", df_encoded.columns.tolist())

# Names of the original categorical columns. Any encoded column whose name
# contains one of these strings is treated as a dummy column and excluded.
cat_keys = ["Have you ever had suicidal thoughts ?", "Family History of Mental Illness",
            "Gender", "City", "Profession", "Degree", "Financial Stress"]
dummy_cols = [col for col in df_encoded.columns if any(key in col for key in cat_keys)]

# Drop 'id' / 'Depression' only if they are still present in df_encoded,
# together with every dummy column found above.
drop_cols = [col for col in ('id', 'Depression') if col in df_encoded.columns] + dummy_cols
X = df_encoded.drop(columns=drop_cols)

# Target: take 'Depression' from df_encoded when it is there, otherwise from
# the original dataframe.
y = df_encoded['Depression'] if 'Depression' in df_encoded.columns else df['Depression']

# Standardize numerical features. StandardScaler comes from the import cell at
# the top of the notebook, so it is not re-imported here.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed in a later cell.
scaler = StandardScaler()
num_feats = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA',
             'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration',
             'Work/Study Hours', 'Total Pressure']
X[num_feats] = scaler.fit_transform(X[num_feats])

print("Feature matrix shape:", X.shape)

In [None]:
# Split data into train and test sets: 20% of the rows are held out, and
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Running Logistic Regression Model
# The feature matrix was standardized on all rows before the train/test split,
# so statistics of the held-out rows leaked into preprocessing. Wrapping the
# classifier in a pipeline refits the scaler on the training rows only (and on
# each training fold when this estimator is passed to cross_val_score).
# Re-standardizing already-scaled columns is harmless: the result equals
# standardizing the raw values with training statistics. Every column of X
# passes through the scaler.
log_model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
log_model.fit(X_train, y_train)

# Predictions and evaluation on the held-out rows
y_pred_log = log_model.predict(X_test)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_log))

# Confusion matrix (rows: actual class, columns: predicted class)
cm_log = confusion_matrix(y_test, y_pred_log)
sns.heatmap(cm_log, annot=True, fmt="d", cmap='Blues')
plt.title("Logistic Regression Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# ROC curve built from the predicted probability of the positive class
y_prob_log = log_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob_log)
roc_auc_log = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'Logistic Regression ROC curve (AUC = {roc_auc_log:.2f})', color='darkorange')
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Logistic Regression')
plt.legend(loc="lower right")
plt.show()


In [None]:
# Using Random Forest Classifier: 100 trees, fixed seed.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Held-out predictions and the per-class report.
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

# Confusion matrix for RF (rows: actual class, columns: predicted class).
cm_rf = confusion_matrix(y_test, y_pred_rf)
cm_ax = sns.heatmap(cm_rf, annot=True, fmt="d", cmap='Greens')
cm_ax.set(
    title="Random Forest Confusion Matrix",
    xlabel="Predicted",
    ylabel="Actual",
)
plt.show()

# ROC curve for RF, built from the predicted probability of the positive class.
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_prob_rf)
roc_auc_rf = auc(fpr_rf, tpr_rf)

fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr_rf, tpr_rf, color='green', label=f'Random Forest ROC curve (AUC = {roc_auc_rf:.2f})')
roc_ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
roc_ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve - Random Forest',
)
roc_ax.legend(loc="lower right")
plt.show()

In [None]:
# Both estimators are scored the same way: 5 folds, accuracy, on the full X / y.
cv_settings = dict(cv=5, scoring='accuracy')

# Cross-validation for Logistic Regression
cv_scores_log = cross_val_score(log_model, X, y, **cv_settings)
print("5-Fold CV Accuracy for Logistic Regression:", cv_scores_log)
print("Mean CV Accuracy:", cv_scores_log.mean())

# Cross-validation for Random Forest
cv_scores_rf = cross_val_score(rf_model, X, y, **cv_settings)
print("5-Fold CV Accuracy for Random Forest:", cv_scores_rf)
print("Mean CV Accuracy:", cv_scores_rf.mean())