In [31]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [32]:
# Location of the raw CSV. Set the STUDENT_DEPRESSION_CSV environment variable
# to point at the file on another machine; when it is unset, the original
# hard-coded location is used so existing runs keep working unchanged.
DATA_PATH = os.environ.get(
    "STUDENT_DEPRESSION_CSV",
    r"C:\Users\phunk\Desktop\MyProject\machine learning\Student Depress\student_depression_dataset.csv",
)
df = pd.read_csv(DATA_PATH)

In [33]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,'Less than 5 hours',Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,'7-8 hours',Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,'5-6 hours',Moderate,M.Tech,Yes,1.0,1.0,No,0


In [34]:
# Count missing (NaN/None) entries per column.
df.isna().sum()

id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64

In [35]:
# Print each column's dtype and non-null count to confirm the loaded schema.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 non-null  object 
 11  Di

In [36]:
# Report how many distinct values each column holds. dropna=False keeps NaN in
# the count, which matches taking len() of Series.unique().
for col, n_distinct in df.nunique(dropna=False).items():
    print(f"{col} : {n_distinct}")

id : 27901
Gender : 2
Age : 34
City : 52
Profession : 14
Academic Pressure : 6
Work Pressure : 3
CGPA : 332
Study Satisfaction : 6
Job Satisfaction : 5
Sleep Duration : 5
Dietary Habits : 4
Degree : 28
Have you ever had suicidal thoughts ? : 2
Work/Study Hours : 13
Financial Stress : 6
Family History of Mental Illness : 2
Depression : 2


In [37]:
# List the distinct values of every column to spot malformed categories and
# placeholder tokens before building the preprocessing pipeline.
for col, column_values in df.items():
    print(f"{col} : {column_values.unique()}")

id : [     2      8     26 ... 140689 140690 140699]
Gender : ['Male' 'Female']
Age : [33. 24. 31. 28. 25. 29. 30. 27. 19. 20. 23. 18. 21. 22. 34. 32. 26. 39.
 35. 42. 36. 58. 49. 38. 51. 44. 43. 46. 59. 54. 48. 56. 37. 41.]
City : ['Visakhapatnam' 'Bangalore' 'Srinagar' 'Varanasi' 'Jaipur' 'Pune' 'Thane'
 'Chennai' 'Nagpur' 'Nashik' 'Vadodara' 'Kalyan' 'Rajkot' 'Ahmedabad'
 'Kolkata' 'Mumbai' 'Lucknow' 'Indore' 'Surat' 'Ludhiana' 'Bhopal'
 'Meerut' 'Agra' 'Ghaziabad' 'Hyderabad' 'Vasai-Virar' 'Kanpur' 'Patna'
 'Faridabad' 'Delhi' 'Saanvi' 'M.Tech' 'Bhavna' "'Less Delhi'" 'City'
 '3.0' "'Less than 5 Kalyan'" 'Mira' 'Harsha' 'Vaanya' 'Gaurav' 'Harsh'
 'Reyansh' 'Kibara' 'Rashi' 'ME' 'M.Com' 'Nalyan' 'Mihir' 'Nalini'
 'Nandini' 'Khaziabad']
Profession : ['Student' "'Civil Engineer'" 'Architect' "'UX/UI Designer'"
 "'Digital Marketer'" "'Content Writer'" "'Educational Consultant'"
 'Teacher' 'Manager' 'Chef' 'Doctor' 'Lawyer' 'Entrepreneur' 'Pharmacist']
Academic Pressure : [5. 2. 3. 4. 1

In [38]:
# "?" is used as a missing-value placeholder in Financial Stress. Convert the
# column to a true numeric dtype, turning "?" (and any other non-numeric token)
# into NaN. A plain replace("?", np.nan) does not convert the column to a
# numeric dtype, leaving that conversion implicit in the downstream
# imputer/scaler.
df["Financial Stress"] = pd.to_numeric(df["Financial Stress"], errors="coerce")

In [39]:
# The label is the Depression column; every other column except the row
# identifier is kept as a candidate feature.
y = df["Depression"]
X = df.drop(columns=["Depression", "id"])

In [40]:
# Columns routed to the categorical branch of the preprocessor.
cat_col = [
    "Gender",
    "City",
    "Profession",
    "Sleep Duration",
    "Dietary Habits",
    "Degree",
    "Have you ever had suicidal thoughts ?",
    "Family History of Mental Illness",
]

# Columns routed to the numeric branch of the preprocessor.
num_col = [
    "Age",
    "Academic Pressure",
    "Work Pressure",
    "CGPA",
    "Study Satisfaction",
    "Job Satisfaction",
    "Work/Study Hours",
    "Financial Stress",
]

In [41]:
# Categorical branch: fill missing entries with the most frequent category,
# then one-hot encode into a dense array. drop="first" removes one indicator
# per feature, and categories not seen during fit are ignored at transform time
# instead of raising.
cat_tranform = Pipeline(
    [
        (
            "simple impute",
            SimpleImputer(strategy="most_frequent"),
        ),
        (
            "one hot encode",
            OneHotEncoder(
                handle_unknown="ignore",
                sparse_output=False,
                drop="first",
            ),
        ),
    ]
)

def round_values(x):
    """Return ``x`` with every element rounded to zero decimal places."""
    rounded = np.around(x, decimals=0)
    return rounded

# Numeric branch: impute missing values from the 3 nearest neighbours, round to
# whole numbers, then standardise.
# NOTE(review): the rounding step runs on every numeric column it receives, not
# only on imputed cells, so continuous inputs such as CGPA are also rounded to
# integers — confirm this is intended.
num_tranform = Pipeline(
    [
        ("impute", KNNImputer(n_neighbors=3)),
        ("round", FunctionTransformer(round_values)),
        ("scaler", StandardScaler()),
    ]
)

# Combine both branches: cat_col goes through the categorical pipeline and
# num_col through the numeric pipeline; the outputs are concatenated in that
# order.
prepreocess = ColumnTransformer(
    [
        ("cat", cat_tranform, cat_col),
        ("num", num_tranform, num_col),
    ]
)

In [42]:
# Candidate classifiers keyed by display name. Insertion order is preserved and
# determines both the evaluation order and the estimator order of the ensemble.
models = dict(
    [
        (
            "logistic regression",
            LogisticRegression(max_iter=500),
        ),
        (
            "decision tree",
            DecisionTreeClassifier(
                max_depth=5,
                min_samples_split=10,
                min_samples_leaf=5,
                random_state=42,
            ),
        ),
        (
            "random forest",
            RandomForestClassifier(
                n_estimators=300,
                max_depth=10,
                min_samples_split=10,
                random_state=42,
                n_jobs=-1,
            ),
        ),
        ("SVM", SVC()),
        ("KNeighbors", KNeighborsClassifier(n_neighbors=5)),
    ]
)


In [43]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [47]:
# For each candidate: fit preprocessing + classifier on the training split,
# print the test-split classification report, then print the mean accuracy of
# 5-fold cross-validation on the training split.
for name, model in models.items():
    clf = Pipeline([("preprocess", prepreocess), ("classifier", model)])

    # Pipeline.fit returns the fitted pipeline, so prediction can be chained.
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    print(name)
    print(classification_report(y_test, y_pred))

    scores = cross_val_score(clf, X_train, y_train, cv=5)
    print(f"{name}: CV Accuracy = {scores.mean():.3f}")
    
# Hard-voting (majority label) ensemble over every estimator in `models`,
# wrapped with the same preprocessing and scored on the test split.
voting_clf = VotingClassifier(
    estimators=[(label, estimator) for label, estimator in models.items()],
    voting="hard",
)
vclf = Pipeline([("prepreocess", prepreocess), ("classifier", voting_clf)])

y_pred_voting = vclf.fit(X_train, y_train).predict(X_test)

print("Voting Result(Hard)")
print(classification_report(y_test, y_pred_voting))



logistic regression: CV Accuracy = 0.850




logistic regression
              precision    recall  f1-score   support

           0       0.82      0.79      0.80      2343
           1       0.85      0.88      0.86      3238

    accuracy                           0.84      5581
   macro avg       0.84      0.83      0.83      5581
weighted avg       0.84      0.84      0.84      5581





decision tree: CV Accuracy = 0.827




decision tree
              precision    recall  f1-score   support

           0       0.77      0.80      0.78      2343
           1       0.85      0.82      0.84      3238

    accuracy                           0.81      5581
   macro avg       0.81      0.81      0.81      5581
weighted avg       0.82      0.81      0.81      5581





random forest: CV Accuracy = 0.841




random forest
              precision    recall  f1-score   support

           0       0.83      0.75      0.79      2343
           1       0.83      0.89      0.86      3238

    accuracy                           0.83      5581
   macro avg       0.83      0.82      0.82      5581
weighted avg       0.83      0.83      0.83      5581





SVM: CV Accuracy = 0.848




SVM
              precision    recall  f1-score   support

           0       0.82      0.77      0.80      2343
           1       0.84      0.88      0.86      3238

    accuracy                           0.83      5581
   macro avg       0.83      0.83      0.83      5581
weighted avg       0.83      0.83      0.83      5581





KNeighbors: CV Accuracy = 0.818




KNeighbors
              precision    recall  f1-score   support

           0       0.80      0.72      0.76      2343
           1       0.81      0.87      0.84      3238

    accuracy                           0.81      5581
   macro avg       0.80      0.80      0.80      5581
weighted avg       0.81      0.81      0.81      5581





Voting Result(Hard)
              precision    recall  f1-score   support

           0       0.82      0.78      0.80      2343
           1       0.84      0.88      0.86      3238

    accuracy                           0.84      5581
   macro avg       0.83      0.83      0.83      5581
weighted avg       0.84      0.84      0.84      5581

