# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Load Data

In [2]:
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

# Dataset Overview

In [3]:
train_df.describe()

Unnamed: 0,id,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Work/Study Hours,Financial Stress,Depression
count,140700.0,140700.0,27897.0,112782.0,27898.0,27897.0,112790.0,140700.0,140696.0,140700.0
mean,70349.5,40.388621,3.142273,2.998998,7.658636,2.94494,2.974404,6.252679,2.988983,0.181713
std,40616.735775,12.384099,1.380457,1.405771,1.464466,1.360197,1.416078,3.853615,1.413633,0.385609
min,0.0,18.0,1.0,1.0,5.03,1.0,1.0,0.0,1.0,0.0
25%,35174.75,29.0,2.0,2.0,6.29,2.0,2.0,3.0,2.0,0.0
50%,70349.5,42.0,3.0,3.0,7.77,3.0,3.0,6.0,3.0,0.0
75%,105524.25,51.0,4.0,4.0,8.92,4.0,4.0,10.0,4.0,0.0
max,140699.0,60.0,5.0,5.0,10.0,5.0,5.0,12.0,5.0,1.0


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140700 entries, 0 to 140699
Data columns (total 20 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   id                                     140700 non-null  int64  
 1   Name                                   140700 non-null  object 
 2   Gender                                 140700 non-null  object 
 3   Age                                    140700 non-null  float64
 4   City                                   140700 non-null  object 
 5   Working Professional or Student        140700 non-null  object 
 6   Profession                             104070 non-null  object 
 7   Academic Pressure                      27897 non-null   float64
 8   Work Pressure                          112782 non-null  float64
 9   CGPA                                   27898 non-null   float64
 10  Study Satisfaction                     27897 non-null   

In [5]:
numerical_vars = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction',
                  'Job Satisfaction', 'Work/Study Hours', 'Financial Stress', 'Depression']

categorical_vars = ['Gender', 'Working Professional or Student', 'Sleep Duration', 'Dietary Habits', 
                    'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']

In [6]:
train_df = train_df[numerical_vars+categorical_vars]

# Filter relevant columns

In [7]:
def identify_cat_above30(series):
    counts = series.value_counts()
    return list(counts[counts>=30].index)

In [8]:
levels_to_keep = train_df[categorical_vars].apply(identify_cat_above30, axis=0)
levels_to_keep

Gender                                                                      [Male, Female]
Working Professional or Student                            [Working Professional, Student]
Sleep Duration                           [Less than 5 hours, 7-8 hours, More than 8 hou...
Dietary Habits                                              [Moderate, Unhealthy, Healthy]
Have you ever had suicidal thoughts ?                                            [No, Yes]
Family History of Mental Illness                                                 [No, Yes]
dtype: object

In [9]:
for var in categorical_vars:
    train_df = train_df.loc[train_df[var].isin(levels_to_keep[var])]

# Handling Missing Values

In [10]:
total = train_df.isnull().sum().sort_values(ascending=False)

percent = (train_df.isnull().sum()/train_df.isnull().count()).sort_values(ascending=False)

missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Missing Percent'])

In [11]:
missing_data['Missing Percent'] = missing_data['Missing Percent'].apply(lambda x: x*100)
missing_data.loc[missing_data['Missing Percent'] > 10][:10]

Unnamed: 0,Total,Missing Percent
Academic Pressure,112727,80.179097
Study Satisfaction,112727,80.179097
CGPA,112726,80.178386
Work Pressure,27888,19.835839
Job Satisfaction,27880,19.830149


# Check for missing data and split by group (Student/Working Professional)

In [12]:
students_df = train_df[train_df['Working Professional or Student'] == 'Student']
working_professionals_df = train_df[train_df['Working Professional or Student'] == 'Working Professional']

# Handle missing values in the "Student" dataset

In [13]:
columns_to_drop = ['Work Pressure', 'Job Satisfaction']
students_df = students_df.drop(columns_to_drop, axis=1)

In [14]:
numerical_vars_students = ['Age', 'Academic Pressure', 'CGPA', 'Study Satisfaction',
                           'Work/Study Hours', 'Financial Stress']

categorical_vars_students = ['Gender', 'Working Professional or Student', 'Sleep Duration', 'Dietary Habits', 
                             'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']

In [15]:
columns_to_drop = ['Academic Pressure', 'CGPA', 'Study Satisfaction']
working_professionals_df = working_professionals_df.drop(columns_to_drop, axis=1)

In [16]:
numerical_vars_working = ['Age', 'Work Pressure', 'Job Satisfaction',
                          'Work/Study Hours', 'Financial Stress']

categorical_vars_working = ['Gender', 'Working Professional or Student', 'Sleep Duration', 'Dietary Habits', 
                            'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']

In [17]:
students_df_train = students_df[numerical_vars_students+categorical_vars_students]

In [18]:
working_professionals_df_train = working_professionals_df[numerical_vars_working+categorical_vars_working]

# Handle missing values for numerical variables using SimpleImputer

In [19]:
total = students_df_train.isnull().sum().sort_values(ascending=False)

percent = (students_df_train.isnull().sum()/students_df_train.isnull().count()).sort_values(ascending=False)

missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Missing Percent'])

In [20]:
missing_data

Unnamed: 0,Total,Missing Percent
Study Satisfaction,10,0.000359
Academic Pressure,9,0.000323
CGPA,9,0.000323
Financial Stress,3,0.000108
Age,0,0.0
Work/Study Hours,0,0.0
Gender,0,0.0
Working Professional or Student,0,0.0
Sleep Duration,0,0.0
Dietary Habits,0,0.0


In [21]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')


students_df_train[numerical_vars_students] = imputer.fit_transform(students_df_train[numerical_vars_students])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  students_df_train[numerical_vars_students] = imputer.fit_transform(students_df_train[numerical_vars_students])


In [22]:
total = students_df_train.isnull().sum().sort_values(ascending=False)

percent = (students_df_train.isnull().sum()/students_df_train.isnull().count()).sort_values(ascending=False)

missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Missing Percent'])

In [23]:
missing_data

Unnamed: 0,Total,Missing Percent
Age,0,0.0
Academic Pressure,0,0.0
CGPA,0,0.0
Study Satisfaction,0,0.0
Work/Study Hours,0,0.0
Financial Stress,0,0.0
Gender,0,0.0
Working Professional or Student,0,0.0
Sleep Duration,0,0.0
Dietary Habits,0,0.0


# Handle missing values in the "Working Professional" dataset

In [24]:
total = working_professionals_df_train.isnull().sum().sort_values(ascending=False)

percent = (working_professionals_df_train.isnull().sum()/working_professionals_df_train.isnull().count()).sort_values(ascending=False)

missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Missing Percent'])

In [25]:
missing_data

Unnamed: 0,Total,Missing Percent
Work Pressure,20,0.000177
Job Satisfaction,17,0.000151
Financial Stress,1,9e-06
Age,0,0.0
Work/Study Hours,0,0.0
Gender,0,0.0
Working Professional or Student,0,0.0
Sleep Duration,0,0.0
Dietary Habits,0,0.0
Have you ever had suicidal thoughts ?,0,0.0


In [26]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')

working_professionals_df_train[numerical_vars_working] = imputer.fit_transform(working_professionals_df_train[numerical_vars_working])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  working_professionals_df_train[numerical_vars_working] = imputer.fit_transform(working_professionals_df_train[numerical_vars_working])


In [27]:
total = working_professionals_df_train.isnull().sum().sort_values(ascending=False)

percent = (working_professionals_df_train.isnull().sum()/working_professionals_df_train.isnull().count()).sort_values(ascending=False)

missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Missing Percent'])

In [28]:
missing_data

Unnamed: 0,Total,Missing Percent
Age,0,0.0
Work Pressure,0,0.0
Job Satisfaction,0,0.0
Work/Study Hours,0,0.0
Financial Stress,0,0.0
Gender,0,0.0
Working Professional or Student,0,0.0
Sleep Duration,0,0.0
Dietary Habits,0,0.0
Have you ever had suicidal thoughts ?,0,0.0


# Split the data into features and labels

In [29]:
X_students = students_df_train
y_students = students_df['Depression']

In [30]:
X_working = working_professionals_df_train
y_working = working_professionals_df['Depression']

# Encoding categorical variables

In [31]:
from sklearn.preprocessing import LabelEncoder

for col in categorical_vars_students:
    le = LabelEncoder()
    X_students[col] = le.fit_transform(X_students[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_students[col] = le.fit_transform(X_students[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_students[col] = le.fit_transform(X_students[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_students[col] = le.fit_transform(X_students[col])
A value is trying to be set on a copy of a slice 

In [32]:
for col in categorical_vars_working:
    le = LabelEncoder()
    X_working[col] = le.fit_transform(X_working[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_working[col] = le.fit_transform(X_working[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_working[col] = le.fit_transform(X_working[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_working[col] = le.fit_transform(X_working[col])
A value is trying to be set on a copy of a slice from a

# Standardize the features

In [33]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [34]:
X_students = scaler.fit_transform(X_students)

In [35]:
X_working = scaler.fit_transform(X_working)

# Handle class imbalance using SMOTE

In [36]:
count_class = y_students.value_counts()
count_class

Depression
1    16319
0    11552
Name: count, dtype: int64

In [37]:
count_class = y_working.value_counts()
count_class

Depression
0    103496
1      9227
Name: count, dtype: int64

In [38]:
from imblearn.over_sampling import SMOTE

In [39]:
smote = SMOTE(sampling_strategy='minority')
X_students, y_students = smote.fit_resample(X_students, y_students)

In [40]:
X_working, y_working = smote.fit_resample(X_working, y_working)

# Split the data into training and testing sets

In [41]:
X_train_students, X_test_students, y_train_students, y_test_students = train_test_split(
    X_students, y_students, test_size=0.2, random_state=42
)

In [42]:
X_train_working, X_test_working, y_train_working, y_test_working = train_test_split(
    X_working, y_working, test_size=0.2, random_state=42
)

# Model 1: Random Forest

In [43]:
print("=== Random Forest Model ===")
rf_model_student = RandomForestClassifier(n_estimators=100, random_state=42)

=== Random Forest Model ===


In [44]:
cv_scores_rf = cross_val_score(rf_model_student, X_train_students, y_train_students, cv=5, scoring='accuracy')
print("Cross-Validation Scores (Random Forest):", cv_scores_rf)
print("Mean CV Accuracy (Random Forest):", np.mean(cv_scores_rf))
print("Standard Deviation CV Accuracy (Random Forest):", np.std(cv_scores_rf))

Cross-Validation Scores (Random Forest): [0.85848334 0.86020682 0.85867484 0.85733435 0.86058981]
Mean CV Accuracy (Random Forest): 0.8590578322481809
Standard Deviation CV Accuracy (Random Forest): 0.0011928312142778757


In [45]:
rf_model_student.fit(X_train_students, y_train_students)

In [46]:
y_pred_rf = rf_model_student.predict(X_test_students)
print("\nTest Set Evaluation Student (Random Forest)")
print(classification_report(y_test_students, y_pred_rf))
print("Test Accuracy (Random Forest):", accuracy_score(y_test_students, y_pred_rf))


Test Set Evaluation Student (Random Forest)
              precision    recall  f1-score   support

           0       0.87      0.86      0.86      3255
           1       0.86      0.87      0.86      3273

    accuracy                           0.86      6528
   macro avg       0.86      0.86      0.86      6528
weighted avg       0.86      0.86      0.86      6528

Test Accuracy (Random Forest): 0.8625919117647058


In [47]:
rf_model_working = RandomForestClassifier(n_estimators=100, random_state=42)

In [48]:
cv_scores_rf = cross_val_score(rf_model_working, X_train_working, y_train_working, cv=5, scoring='accuracy')
print("Cross-Validation Scores (Random Forest):", cv_scores_rf)
print("Mean CV Accuracy (Random Forest):", np.mean(cv_scores_rf))
print("Standard Deviation CV Accuracy (Random Forest):", np.std(cv_scores_rf))

Cross-Validation Scores (Random Forest): [0.97487847 0.97581449 0.97557293 0.97533064 0.97659883]
Mean CV Accuracy (Random Forest): 0.9756390708020432
Standard Deviation CV Accuracy (Random Forest): 0.0005709514077750439


In [49]:
rf_model_working.fit(X_train_working, y_train_working)

In [50]:
y_pred_rf = rf_model_working.predict(X_test_working)
print("\nTest Set Evaluation Working Professional (Random Forest)")
print(classification_report(y_test_working, y_pred_rf))
print("Test Accuracy (Random Forest):", accuracy_score(y_test_working, y_pred_rf))


Test Set Evaluation Working Professional (Random Forest)
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     20810
           1       0.98      0.98      0.98     20589

    accuracy                           0.98     41399
   macro avg       0.98      0.98      0.98     41399
weighted avg       0.98      0.98      0.98     41399

Test Accuracy (Random Forest): 0.9792507065388053


# Model 2: XGBoost

In [51]:
param_grid = {
    'n_estimators': [50, 100, 200],  
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

In [52]:
print("=== XGBoost Model ===")
xgb_model_student = XGBClassifier(random_state=42, eval_metric='logloss')

=== XGBoost Model ===


In [53]:
grid_search = GridSearchCV(estimator=xgb_model_student,
                           param_grid=param_grid,
                           cv=3,  # 3-Fold Cross Validation
                           scoring='accuracy',
                           verbose=1,
                           n_jobs=-1)

In [54]:
grid_search.fit(X_train_students, y_train_students)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [55]:
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}


In [56]:
best_xgb_model_student = grid_search.best_estimator_

In [57]:
y_pred_best_xgb = best_xgb_model_student.predict(X_test_students)

In [58]:
print("\nTest Set Evaluation Student (Best XGBoost)")
print(classification_report(y_test_students, y_pred_best_xgb))
print("Test Accuracy (Best XGBoost):", accuracy_score(y_test_students, y_pred_best_xgb))


Test Set Evaluation Student (Best XGBoost)
              precision    recall  f1-score   support

           0       0.87      0.86      0.87      3255
           1       0.86      0.88      0.87      3273

    accuracy                           0.87      6528
   macro avg       0.87      0.87      0.87      6528
weighted avg       0.87      0.87      0.87      6528

Test Accuracy (Best XGBoost): 0.8670343137254902


In [59]:
grid_search.fit(X_train_working, y_train_working)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [60]:
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}


In [61]:
best_xgb_model_working = grid_search.best_estimator_

In [62]:
y_pred_best_xgb = best_xgb_model_working.predict(X_test_working)
print("\nTest Set Evaluation Working (Best XGBoost)")
print(classification_report(y_test_working, y_pred_best_xgb))
print("Test Accuracy (Best XGBoost):", accuracy_score(y_test_working, y_pred_best_xgb))


Test Set Evaluation Working (Best XGBoost)
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     20810
           1       0.98      0.98      0.98     20589

    accuracy                           0.98     41399
   macro avg       0.98      0.98      0.98     41399
weighted avg       0.98      0.98      0.98     41399

Test Accuracy (Best XGBoost): 0.979226551365975


# Model 3: Neural Network (Keras)

In [63]:
model_student = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_students.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

In [64]:
model_student.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [65]:
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [66]:
history_student = model_student.fit(
    X_train_students, y_train_students,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100


In [67]:
test_loss, test_accuracy = model_student.evaluate(X_train_students, y_train_students)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.8470


In [71]:
y_pred_students_nn = model_student.predict(X_test_students)
y_pred_students_nn = (y_pred_students_nn > 0.5).astype(int)
print("\nTest Set Evaluation Student (Neural Network)")
print(classification_report(y_test_students, y_pred_students_nn))
print("Test Accuracy (Neural Network):", accuracy_score(y_test_students, y_pred_students_nn))


Test Set Evaluation Student (Neural Network)
              precision    recall  f1-score   support

           0       0.85      0.82      0.84      3255
           1       0.83      0.86      0.84      3273

    accuracy                           0.84      6528
   macro avg       0.84      0.84      0.84      6528
weighted avg       0.84      0.84      0.84      6528

Test Accuracy (Neural Network): 0.8391544117647058


In [72]:
model_working = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_working.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

In [73]:
model_working.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [74]:
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [75]:
history_working = model_working.fit(
    X_train_working, y_train_working,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100


In [76]:
test_loss, test_accuracy = model_working.evaluate(X_train_working, y_train_working)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.9239


In [78]:
y_pred_working_nn = model_working.predict(X_test_working)
y_pred_working_nn = (y_pred_working_nn > 0.5).astype(int)
print("\nTest Set Evaluation Working Professional (Neural Network)")
print(classification_report(y_test_working, y_pred_working_nn))
print("Test Accuracy (Neural Network):", accuracy_score(y_test_working, y_pred_working_nn))


Test Set Evaluation Working Professional (Neural Network)
              precision    recall  f1-score   support

           0       0.92      0.93      0.92     20810
           1       0.93      0.92      0.92     20589

    accuracy                           0.92     41399
   macro avg       0.92      0.92      0.92     41399
weighted avg       0.92      0.92      0.92     41399

Test Accuracy (Neural Network): 0.9237421193748642


# Test Dataset

In [79]:
students_test = test_df[test_df['Working Professional or Student'] == 'Student']
working_professionals_test = test_df[test_df['Working Professional or Student'] == 'Working Professional']

In [80]:
columns_to_drop = ['Work Pressure', 'Job Satisfaction']
students_df_test = students_test.drop(columns_to_drop, axis=1)

In [81]:
num_vars_students_test = ['Age', 'Academic Pressure', 'CGPA', 'Study Satisfaction',
                           'Work/Study Hours', 'Financial Stress']

cat_vars_students_test = ['Gender', 'Working Professional or Student', 'Sleep Duration', 'Dietary Habits', 
                          'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']

In [82]:
students_df_test = students_df_test[num_vars_students_test+cat_vars_students_test]

In [83]:
columns_to_drop = ['Academic Pressure', 'CGPA', 'Study Satisfaction']
working_professionals_df_test = working_professionals_test.drop(columns_to_drop, axis=1)

In [84]:
num_vars_working_test = ['Age', 'Work Pressure', 'Job Satisfaction',
                          'Work/Study Hours', 'Financial Stress']

cat_vars_working_test = ['Gender', 'Working Professional or Student', 'Sleep Duration', 'Dietary Habits', 
                            'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']

In [85]:
working_professionals_df_test = working_professionals_df_test[num_vars_working_test+cat_vars_working_test]

In [86]:
imputer = SimpleImputer(strategy='mean')
students_df_test[num_vars_students_test] = imputer.fit_transform(students_df_test[num_vars_students_test])
working_professionals_df_test[num_vars_working_test] = imputer.fit_transform(working_professionals_df_test[num_vars_working_test])

In [87]:
for col in cat_vars_students_test:
    le = LabelEncoder()
    students_df_test[col] = le.fit_transform(students_df_test[col].astype(str))

In [88]:
for col in cat_vars_working_test:
    le = LabelEncoder()
    working_professionals_df_test[col] = le.fit_transform(working_professionals_df_test[col].astype(str))

In [89]:
scaler = StandardScaler()

students_df_test_scaled = scaler.fit_transform(students_df_test)
working_professionals_df_test_scaled = scaler.fit_transform(working_professionals_df_test)

# Test RF

In [90]:
y_pred_students = rf_model_student.predict(students_df_test_scaled)

In [91]:
y_pred_working = rf_model_working.predict(working_professionals_df_test_scaled)

In [92]:
students_test['Depression'] = y_pred_students

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  students_test['Depression'] = y_pred_students


In [93]:
working_professionals_test['Depression'] = y_pred_working

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  working_professionals_test['Depression'] = y_pred_working


In [94]:
students_submission_rf = students_test[['id', 'Depression']]
working_submission_rf = working_professionals_test[['id', 'Depression']]

submission_rf = pd.concat([students_submission_rf, working_submission_rf], axis=0)

In [95]:
submission_rf

Unnamed: 0,id,Depression
3,140703,1
8,140708,0
19,140719,1
20,140720,0
21,140721,1
...,...,...
93792,234492,1
93795,234495,0
93796,234496,1
93798,234498,1


In [96]:
submission_rf.to_csv('../data/submission-rf-smote.csv', index=False)

# Test XGB

In [97]:
y_pred_students_xgb = best_xgb_model_student.predict(students_df_test_scaled)

In [98]:
y_pred_working_xgb = best_xgb_model_working.predict(working_professionals_df_test_scaled)

In [99]:
students_test['Depression'] = y_pred_students_xgb

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  students_test['Depression'] = y_pred_students_xgb


In [100]:
working_professionals_test['Depression'] = y_pred_working_xgb

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  working_professionals_test['Depression'] = y_pred_working_xgb


In [101]:
students_submission_xgb = students_test[['id', 'Depression']]
working_submission_xgb = working_professionals_test[['id', 'Depression']]

submission_xgb = pd.concat([students_submission_xgb, working_submission_xgb], axis=0)

In [102]:
submission_xgb

Unnamed: 0,id,Depression
3,140703,0
8,140708,0
19,140719,0
20,140720,0
21,140721,0
...,...,...
93792,234492,1
93795,234495,1
93796,234496,1
93798,234498,1


In [103]:
submission_xgb.to_csv('../data/submission-xgb-smote.csv', index=False)

# Test Neural Network

In [104]:
y_pred_students_nn = model_student.predict(students_df_test_scaled)
y_pred_students_nn = (y_pred_students_nn > 0.5).astype(int)



In [105]:
y_pred_working_nn = model_working.predict(working_professionals_df_test_scaled)
y_pred_working_nn = (y_pred_working_nn > 0.5).astype(int)



In [106]:
students_test['Depression'] = y_pred_students_nn

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  students_test['Depression'] = y_pred_students_nn


In [107]:
working_professionals_test['Depression'] = y_pred_working_nn

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  working_professionals_test['Depression'] = y_pred_working_nn


In [108]:
students_submission_nn = students_test[['id', 'Depression']]
working_submission_nn = working_professionals_test[['id', 'Depression']]

submission_nn = pd.concat([students_submission_nn, working_submission_nn], axis=0)

In [109]:
submission_nn

Unnamed: 0,id,Depression
3,140703,1
8,140708,0
19,140719,1
20,140720,1
21,140721,1
...,...,...
93792,234492,1
93795,234495,0
93796,234496,1
93798,234498,1


In [111]:
submission_nn.to_csv('../data/submission-nn-smote.csv', index=False)