### 1. Import Dependencies

In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report   
import optuna
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

### 2. Load Data

In [28]:
train_df = pd.read_csv('../Data/Processed/DP_train_encoded.csv')
test_df = pd.read_csv('../Data/Processed/DP_test_encoded.csv')

### 3. Split the data into Train and Test

In [29]:
X = train_df.drop(columns=['Depression', 'id'])
Y = train_df['Depression']
test = test_df.drop(columns=['id'])
test_ids = test_df['id']

In [30]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

### 4. Model Training

#### 4.1 Random Forest Classifier with Cross Validation

In [31]:
model_rfc = RandomForestClassifier(
                                    n_estimators=200,
                                    max_depth=None,
                                    min_samples_split=2,
                                    min_samples_leaf=1,
                                    random_state=42,
                                    n_jobs=-1
                                    )

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(model_rfc, X_train, Y_train, cv=skf, scoring='accuracy', n_jobs=-1)

In [32]:
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores):.4f}")
print(f"Std deviation: {np.std(cv_scores):.4f}")

# 5️⃣ Train final model on full training data
model_rfc.fit(X_train, Y_train)

# 6️⃣ Predict on test set
y_pred = model_rfc.predict(X_test)

Cross-validation scores: [0.93650441 0.93765852 0.93832599 0.93583411 0.93574512]
Mean accuracy: 0.9368
Std deviation: 0.0010


In [34]:
y_hat_pred = model_rfc.predict(test)