In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the raw CSV (Colab upload path); change here to run elsewhere.
DATA_PATH = "/content/Student Depression Dataset.csv"
df = pd.read_csv(DATA_PATH)

## DATA EXPLORATION

In [None]:
df.head(8)

In [None]:
df.tail(8)

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.describe().round(2)

In [None]:
# Heatmap of the rounded summary statistics of the numeric columns.
summary_stats = df.describe().round(2)
fig, ax = plt.subplots(figsize=(12, 5))
sns.heatmap(summary_stats, annot=True, ax=ax)
plt.show()

# DATA PREPROCESSING

In [None]:
df.isnull().sum()

In [None]:
df['Financial Stress'].unique()

In [None]:
# Fill the missing 'Financial Stress' entries with the column mean.
financial_stress_mean = df['Financial Stress'].mean()
df['Financial Stress'] = df['Financial Stress'].fillna(financial_stress_mean)

In [None]:
df.duplicated().sum()

In [None]:
df.isna().sum()

In [None]:
# The row identifier is not used as a feature.
df = df.drop(columns=['id'])

In [None]:
df['Work Pressure'].unique()

In [None]:
df['Work Pressure'].value_counts()

In [None]:
# Drop 'Work Pressure' after inspecting its value counts in the previous cell.
df = df.drop(columns=['Work Pressure'])

In [None]:
df['Job Satisfaction'].value_counts()

In [None]:
# Drop 'Job Satisfaction' after inspecting its value counts in the previous cell.
df = df.drop(columns=['Job Satisfaction'])

In [None]:
df['Profession'].value_counts()

In [None]:
# Drop 'Profession' after inspecting its value counts in the previous cell.
df = df.drop(columns=['Profession'])

In [None]:
# Rename columns to underscore-separated names so they are easier to reference.
# 'Financial Stress' intentionally keeps its original name: the previous mapping
# listed it reversed ('Financial_Stress' -> 'Financial Stress'), which cannot match
# the column read earlier as 'Financial Stress', and the later plotting/scaling
# cells all refer to 'Financial Stress'. The dead entry has been removed.
# Keys that are absent from the frame are ignored by DataFrame.rename.
column_renames = {
    'Depression Score': 'Depression_Score',
    'Academic Pressure': 'Academic_Pressure',
    'Study Satisfaction': 'Study_Satisfaction',
    'Sleep Duration': 'Sleep_Duration',
    'Dietary Habits': 'Dietary_Habits',
    'Have you ever had suicidal thoughts ?': 'Have_you_ever_had_suicidal_thoughts_?',
    'Work/Study Hours': 'Work/Study_Hours',
    'Family History of Mental Illness': 'Family_History_of_Mental_Illness',
}
df = df.rename(columns=column_renames)

In [None]:
df.head(10)

In [None]:
# Box plots of the columns before outlier removal.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(df, ax=ax)
plt.xticks(rotation=45, ha='right')  # slanted labels so long column names stay readable
plt.show()

In [None]:
df[df['CGPA']==0]

In [None]:
def remove_outliers_iqr(df, columns, factor=1.5):
    """Drop rows whose values lie outside the IQR fences of the given columns.

    Columns are processed in the order given, and each column's quartiles are
    computed on the rows that survived the previous columns, so the result can
    depend on the order of ``columns``.

    Args:
        df: Source DataFrame. It is copied and never modified.
        columns: Column name, or iterable of column names, to filter on.
        factor: Multiple of the IQR used to place the fences (default 1.5).

    Returns:
        A new DataFrame containing only the rows where, for every listed
        column, ``Q1 - factor * IQR <= value <= Q3 + factor * IQR``. Rows with
        NaN in a listed column are dropped because the range test is False.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]

    clean_df = df.copy()
    for col in columns:
        q1, q3 = clean_df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr
        # Keep only the values inside the fences (both ends inclusive).
        clean_df = clean_df[clean_df[col].between(lower_bound, upper_bound)]
    return clean_df

In [None]:
filtered_df = remove_outliers_iqr(df, ["Age","CGPA"])

In [None]:
df.shape

In [None]:
# Box plots again, this time on the outlier-filtered frame.
fig, ax = plt.subplots(figsize=(11, 5))
sns.boxplot(filtered_df, ax=ax)
plt.xticks(rotation=45, ha='right')  # slanted labels so long column names stay readable
plt.show()

In [None]:
filtered_df.shape

# EDA

In [None]:
# Distribution of the target column.
# displot is a figure-level function: it creates its own figure, so a preceding
# plt.figure(...) only left an empty figure behind and its figsize was ignored.
# The size is set through height/aspect instead (7 in tall, 15 in wide).
sns.displot(filtered_df['Depression'], kde=True, color='red', height=7, aspect=15 / 7)
plt.show()

In [None]:
# Age distribution with a KDE overlay.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(filtered_df['Age'], kde=True, color='green', ax=ax)

In [None]:
# Depression against age.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(
    data=filtered_df,
    x='Age',
    y='Depression',
    color='red',
    marker='o',
    markersize=5,
    markeredgecolor='black',
    ax=ax,
)

In [None]:
# Row counts per gender, split by the target.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=filtered_df, x='Gender', hue='Depression', ax=ax)

In [None]:
# Depression per gender, split by dietary habits.
# catplot is figure-level and builds its own figure, so the former plt.figure(...)
# call only produced an empty figure; the 10x5 size is passed via height/aspect.
sns.catplot(
    data=filtered_df,
    x='Gender',
    y='Depression',
    kind='bar',
    color='red',
    palette='rocket',
    hue='Dietary_Habits',
    height=5,
    aspect=2,
)

In [None]:
# Depression against CGPA.
fig, ax = plt.subplots(figsize=(12, 7))
sns.lineplot(
    data=filtered_df,
    x='CGPA',
    y='Depression',
    color='red',
    marker='o',
    markeredgecolor='black',
    ax=ax,
)

In [None]:
# Depression against daily work/study hours.
fig, ax = plt.subplots(figsize=(11, 6))
sns.lineplot(
    data=filtered_df,
    x='Work/Study_Hours',
    y='Depression',
    color='red',
    marker='o',
    markeredgecolor='black',
    ax=ax,
)

In [None]:
# Depression against academic pressure.
fig, ax = plt.subplots(figsize=(11, 6))
sns.lineplot(
    data=filtered_df,
    x='Academic_Pressure',
    y='Depression',
    color='red',
    marker='o',
    markeredgecolor='black',
    ax=ax,
)

In [None]:
# Depression against financial stress.
fig, ax = plt.subplots(figsize=(11, 6))
sns.lineplot(
    data=filtered_df,
    x='Financial Stress',
    y='Depression',
    color='red',
    marker='o',
    markeredgecolor='black',
    ax=ax,
)

In [None]:
plt.figure(figsize=(10,6))
sns.histplot(data=filtered_df,x='City',hue='Depression',multiple='stack',palette='rocket',shrink=.8,edgecolor='black',linewidth=1)
plt.xticks(rotation=90)

In [None]:
# Depression by family history of mental illness.
# catplot is figure-level and builds its own figure, so the former plt.figure(...)
# call only produced an empty figure; the 10x6 size is passed via height/aspect.
sns.catplot(
    data=filtered_df,
    x='Family_History_of_Mental_Illness',
    y='Depression',
    kind='bar',
    color='red',
    palette='rocket',
    height=6,
    aspect=10 / 6,
)

In [None]:
# Depression by sleep duration, split by gender.
# catplot is figure-level and builds its own figure, so the former plt.figure(...)
# call only produced an empty figure; the 15x7 size is passed via height/aspect.
sns.catplot(
    data=filtered_df,
    x='Sleep_Duration',
    y='Depression',
    kind='bar',
    color='red',
    palette='rocket',
    hue='Gender',
    height=7,
    aspect=15 / 7,
)
# Applies to the current axes, i.e. the ones catplot just created.
plt.xticks(rotation=90)

# FEATURE ENGINEERING

In [None]:
ln=LabelEncoder()

In [None]:
# 'Degree' and 'City' are not used as model features.
filtered_df = filtered_df.drop(columns=['Degree', 'City'])

In [None]:
filtered_df.head(3)

In [None]:
# Integer-encode the categorical columns.
# One encoder is fitted and kept per column: refitting a single shared encoder
# overwrote its classes each time, so only the last column's mapping survived and
# the others could not be inverted or re-applied to new data.
label_encoded_cols = [
    'Gender',
    'Dietary_Habits',
    'Family_History_of_Mental_Illness',
    'Have_you_ever_had_suicidal_thoughts_?',
]
label_encoders = {}
for col in label_encoded_cols:
    encoder = LabelEncoder()
    filtered_df[col] = encoder.fit_transform(filtered_df[col])
    label_encoders[col] = encoder

# Explicit codes for sleep duration.
sleep_duration_codes = {
    'Less than 5 hours': 0,
    '5-6 hours': 1,
    '7-8 hours': 2,
    'More than 8 hours': 3,
    'Others': 4,
}
# Series.map turns any value missing from the dict into NaN without warning;
# fail loudly instead (this also catches an accidental second run of this cell).
unmapped_sleep_values = set(filtered_df['Sleep_Duration'].dropna().unique()) - set(sleep_duration_codes)
if unmapped_sleep_values:
    raise ValueError(f"Unmapped Sleep_Duration values: {sorted(map(str, unmapped_sleep_values))}")
filtered_df['Sleep_Duration'] = filtered_df['Sleep_Duration'].map(sleep_duration_codes)

In [None]:
# Pairwise correlations of the encoded columns.
correlations = filtered_df.corr()
fig, ax = plt.subplots(figsize=(8, 10))
sns.heatmap(
    correlations,
    annot=True,
    cmap='rocket',
    linewidths=1,
    linecolor='black',
    fmt='.2f',
    ax=ax,
)
plt.show()

In [None]:
# Rank every feature by its f_classif score against the target.
# x: features, y: target
x = filtered_df.drop(columns=['Depression'])
y = filtered_df['Depression']

# k='all' keeps every feature (set e.g. k=5 to keep only the top five)
selector = SelectKBest(score_func=f_classif, k='all')
fit = selector.fit(x, y)

# Scores per feature, highest first
feature_scores = (
    pd.DataFrame({'Feature': x.columns, 'Score': fit.scores_})
    .sort_values(by='Score', ascending=False)
)
print(feature_scores)

In [None]:
filtered_df.head(3)

In [None]:
sc=StandardScaler()

In [None]:
# Standardise the numeric features with ONE fit so that `sc` retains the mean and
# scale of every column. Refitting `sc` column by column left only the last
# column's statistics in it, so it could not be reused to transform new data.
# StandardScaler works per column, so the resulting values are unchanged.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics leak into training; consider fitting on the training rows only.
numeric_cols = [
    'Age',
    'CGPA',
    'Work/Study_Hours',
    'Academic_Pressure',
    'Financial Stress',
    'Study_Satisfaction',
]
filtered_df[numeric_cols] = sc.fit_transform(filtered_df[numeric_cols])

In [None]:
filtered_df.head(3)

# MODELING

In [None]:
x=filtered_df.drop(columns=['Depression'])
y=filtered_df['Depression']

In [None]:
# Candidate classifiers.
lr = LogisticRegression()
svm = SVC(kernel='linear', C=10)
kn = KNeighborsClassifier(n_neighbors=1)  # NOTE(review): never fitted in the cells visible here

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
lr.fit(x_train,y_train)
svm.fit(x_train,y_train)

In [None]:
accuracy_score(y_test,lr.predict(x_test))

In [None]:
accuracy_score(y_test,svm.predict(x_test))

In [None]:
# Second experiment: same split settings, but without three of the features.
excluded_features = ['Family_History_of_Mental_Illness', 'Gender', 'CGPA']
x_new = filtered_df.drop(columns=['Depression'] + excluded_features)
x_train, x_test, y_train, y_test = train_test_split(
    x_new, y, test_size=0.2, random_state=42
)

In [None]:
lr.fit(x_train,y_train)
svm.fit(x_train,y_train)

In [None]:
accuracy_score(y_test,lr.predict(x_test))

In [None]:
accuracy_score(y_test,svm.predict(x_test))

In [None]:
# 5-fold cross-validated accuracy of logistic regression.
# Scored on x_new, the reduced feature set the model was last fitted and evaluated
# on; the previous call used the full matrix `x`, so its scores described a
# different feature set than the fitted model.
# cross_val_score is already imported in the first cell.
scores = cross_val_score(lr, x_new, y, cv=5, scoring='accuracy')
print(scores)

In [None]:
# 5-fold cross-validated accuracy of the linear SVM.
# Scored on x_new, the reduced feature set the model was last fitted and evaluated
# on; the previous call used the full matrix `x`, so its scores described a
# different feature set than the fitted model.
# cross_val_score is already imported in the first cell.
scores = cross_val_score(svm, x_new, y, cv=5, scoring='accuracy')
print(scores)

In [None]:
# Per-class precision/recall/F1 for logistic regression on the held-out split.
# classification_report is already imported in the first cell.
lr_test_predictions = lr.predict(x_test)
print(classification_report(y_test, lr_test_predictions))

In [None]:
# Per-class precision/recall/F1 for the linear SVM on the held-out split.
# classification_report is already imported in the first cell.
svm_test_predictions = svm.predict(x_test)
print(classification_report(y_test, svm_test_predictions))

In [None]:
# Confusion matrix of logistic regression on the held-out split, with cell counts.
# confusion_matrix is already imported in the first cell.
y_pred = lr.predict(x_test)
cm = confusion_matrix(y_test, y_pred)

plt.matshow(cm, cmap='Blues')
plt.title("Confusion Matrix")
plt.colorbar()
plt.xlabel('Predicted')
plt.ylabel('Actual')
# Rows are actual classes (y axis), columns are predicted classes (x axis).
for row in range(cm.shape[0]):
    for col in range(cm.shape[1]):
        plt.text(col, row, cm[row, col], ha='center', va='center', color='black')
plt.show()

In [None]:
# ROC curve of logistic regression on the held-out split.
from sklearn.metrics import roc_curve, auc

# Second column of predict_proba, i.e. the probability of lr.classes_[1].
y_probs = lr.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line (TPR == FPR).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Precision-recall curve of logistic regression on the held-out split.
from sklearn.metrics import precision_recall_curve

# Second column of predict_proba, i.e. the probability of lr.classes_[1].
y_probs = lr.predict_proba(x_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_probs)

plt.plot(recall, precision, color='blue')
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()

In [None]:
# Persist both fitted estimators to the working directory.
# NOTE(review): only the estimators are saved here; the scaler and encoders used
# during preprocessing are not written out by this cell.
import joblib

joblib.dump(value=lr, filename="logreg_model.pkl")
joblib.dump(value=svm, filename="svm_model.pkl")

In [None]:
filtered_df.to_csv('preprocessed_data.csv', index=False)