In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import xgboost as xgb

In [3]:
data = pd.read_csv("../data/cleaned_v1.csv")
test = pd.read_csv("../data/test_cleaned_v1.csv")

In [4]:
data.drop(["Gender_Male"], axis=1, inplace=True)
test.drop(["Gender_Male"], axis=1, inplace=True)

In [5]:
data.columns

Index(['Work Pressure', 'CGPA', 'Job Satisfaction', 'Sleep Duration',
       'Dietary Habits', 'Degree', 'Work/Study Hours', 'Financial Stress',
       'Depression', 'Gender_Female', 'Age_Group', 'City_encoded', 'Student',
       'Profession_encoded', 'suicidal_thoughts', 'family_history'],
      dtype='object')

In [6]:
test.columns

Index(['Age', 'Work Pressure', 'CGPA', 'Job Satisfaction', 'Sleep Duration',
       'Dietary Habits', 'Degree', 'Work/Study Hours', 'Financial Stress',
       'Gender_Female', 'Student', 'suicidal_thoughts', 'family_history',
       'City_encoded', 'Profession_encoded'],
      dtype='object')

In [7]:
test = test.rename(columns={"Age": "Age_Group"})

In [8]:
y = data["Depression"]
X = data.drop(["Depression"], axis=1)

In [9]:
test = test[X.columns]

In [10]:
model = LogisticRegression()
model.fit(X, y)

In [11]:
pred = model.predict(test)

In [12]:
pred

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [13]:
test_org = pd.read_csv("../data/test.csv")
test_ids = test_org["id"]
test_ids

0        140700
1        140701
2        140702
3        140703
4        140704
          ...  
93795    234495
93796    234496
93797    234497
93798    234498
93799    234499
Name: id, Length: 93800, dtype: int64

In [14]:
output_df = pd.DataFrame()
output_df["id"] = test_ids
output_df["Depression"] = pred
output_df.head()

Unnamed: 0,id,Depression
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0


In [15]:
#output_df.to_csv("../predictions/model1.csv", index=False)

In [16]:
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(X, y)
pred = xgb_model.predict(test)

In [17]:
output_df = pd.DataFrame()
output_df["id"] = test_ids
output_df["Depression"] = pred
output_df.head()

Unnamed: 0,id,Depression
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0


In [18]:
def create_output_df(preds):
    output_df = pd.DataFrame()
    output_df["id"] = test_ids
    output_df["Depression"] = preds
    return output_df

In [19]:
#output_df.to_csv("../predictions/model2.csv", index=False)

In [20]:
best_params = {
    "C": 3.6348739977385867,
    "max_iter": 5000,
    "penalty": "l2",
    "solver": "newton-cg",
}
log_reg_tuned = LogisticRegression(**best_params)
log_reg_tuned.fit(X, y)
pred = log_reg_tuned.predict(test)

In [21]:
output_df = create_output_df(pred)
output_df.head()

Unnamed: 0,id,Depression
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0


In [22]:
# output_df.to_csv("../predictions/model3.csv", index=False)

In [23]:
best_params = {
    "subsample": 0.5,
    "n_estimators": 500,
    "min_child_weight": 3,
    "max_depth": 2,
    "learning_rate": 0.1,
    "colsample_bytree": 0.7,
}
xgboost_tuned = xgb.XGBClassifier(
    objective="binary:logistic", tree_method="hist", **best_params
)
xgboost_tuned.fit(X, y)
pred = xgboost_tuned.predict(test)

In [24]:
output_df = create_output_df(pred)
output_df.head()

Unnamed: 0,id,Depression
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0


In [None]:
# output_df.to_csv("../predictions/model4.csv", index=False)

In [25]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [26]:
models = []
accuracy_scores = []
for train_index, val_index in skf.split(X, y):
    # Split the data into train and validation sets
    X_strat_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_strat_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Initialize CatBoost Classifier
    cat_model = CatBoostClassifier(
        iterations=500,  # Number of boosting iterations
        learning_rate=0.1,  # Learning rate
        depth=6,  # Tree depth
        loss_function="Logloss",  # Loss function for binary classification
        eval_metric="Accuracy",  # Evaluation metric
        verbose=0,  # Suppress training output
        random_seed=42,
    )

    # Fit the model
    cat_model.fit(X_strat_train, y_strat_train, eval_set=(X_val, y_val), use_best_model=True)

    models.append(cat_model)

    # Make predictions
    y_pred = cat_model.predict(X_val)

    # Calculate accuracy
    acc = accuracy_score(y_val, y_pred)
    accuracy_scores.append(acc)
    print(f"Fold Accuracy: {acc}")

Fold Accuracy: 0.9378109452736318
Fold Accuracy: 0.9377043354655294
Fold Accuracy: 0.9387348969438521
Fold Accuracy: 0.9401918976545842
Fold Accuracy: 0.9388415067519545


In [27]:
mean_accuracy = np.mean(accuracy_scores)
print(f"\nMean Accuracy Across {n_splits} Folds: {mean_accuracy:.4f}")


Mean Accuracy Across 5 Folds: 0.9387


In [32]:
predictions = np.zeros(test.shape[0])

for model in models:
    predictions += model.predict_proba(test)[
        :, 1
    ]  # Collect probabilities for the positive class

# Average predictions across all models
predictions /= len(models)

# Convert probabilities to binary predictions (threshold = 0.5)
binary_predictions = (predictions >= 0.5).astype(int)

print("Predictions on Test Data:", binary_predictions)

Predictions on Test Data: [0 0 0 ... 0 1 0]


In [34]:
output_df = create_output_df(binary_predictions)
output_df.head()

Unnamed: 0,id,Depression
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0


In [35]:
output_df.to_csv("../predictions/model5.csv", index=False)