In [126]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix , recall_score , f1_score , precision_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
import warnings
warnings.filterwarnings('ignore')

In [127]:
df = pd.read_csv("final_data.csv")
df.head()

Unnamed: 0,Age,Gender,CGPA,Stress_Level,Depression_Score,Anxiety_Score,Sleep_Quality,Physical_Activity,Diet_Quality,Social_Support,...,Residence_Type_With Family,Relationship_Status_In a Relationship,Relationship_Status_Married,Relationship_Status_Single,Course_Business,Course_Computer Science,Course_Engineering,Course_Law,Course_Medical,Course_Others
0,25,0,3.56,3,3,2,3,2,3,2,...,0,0,1,0,0,0,0,0,0,1
1,24,1,2.44,0,3,0,2,1,2,1,...,0,0,0,1,0,0,1,0,0,0
2,19,1,3.74,4,0,3,3,1,2,2,...,0,1,0,0,1,0,0,0,0,0
3,18,0,3.4,3,3,4,3,1,2,3,...,0,0,1,0,1,0,0,0,0,0
4,21,1,3.35,2,4,3,3,2,3,3,...,0,0,0,1,0,0,0,0,1,0


In [128]:
df.Risk.value_counts()

Risk
1    4265
0    2742
Name: count, dtype: int64

In [129]:
df = df.drop(['Stress_Level', 'Depression_Score', 'Anxiety_Score'], axis=1)
df.head()

Unnamed: 0,Age,Gender,CGPA,Sleep_Quality,Physical_Activity,Diet_Quality,Social_Support,Substance_Use,Counseling_Service_Use,Family_History,...,Residence_Type_With Family,Relationship_Status_In a Relationship,Relationship_Status_Married,Relationship_Status_Single,Course_Business,Course_Computer Science,Course_Engineering,Course_Law,Course_Medical,Course_Others
0,25,0,3.56,3,2,3,2,1,1,0,...,0,0,1,0,0,0,0,0,0,1
1,24,1,2.44,2,1,2,1,2,2,0,...,0,0,0,1,0,0,1,0,0,0
2,19,1,3.74,3,1,2,2,1,2,0,...,0,1,0,0,1,0,0,0,0,0
3,18,0,3.4,3,1,2,3,1,1,0,...,0,0,1,0,1,0,0,0,0,0
4,21,1,3.35,3,2,3,3,1,1,0,...,0,0,0,1,0,0,0,0,1,0


In [130]:
X = df.drop('Risk', axis=1)
y = df['Risk']             

X_train , X_test , y_train , y_test =  train_test_split(X, y, test_size=0.2, random_state=42)  

For Random Forest, no scaling is actually required.

Reasons:
Random Forest is tree-based, which splits data based on thresholds, not distances.
Feature ranges do not affect the splitting criteria.
Standardization or normalization does not improve model performance.

Summary:
You can keep numeric features as-it-is, or optionally scale for consistency.
Using StandardScaler or MinMaxScaler has no real impact on Random Forest.

In [131]:
acc = []
recall_list = []
precision_list = []
f_score = []
depths = []

for depth in range(1, 31):
    rf = RandomForestClassifier(
        n_estimators=300,
        max_depth=depth,
        random_state=42
    )
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    
    depths.append(depth)
    acc.append(accuracy_score(y_test, y_pred))
    recall_list.append(recall_score(y_test, y_pred, average='macro'))
    precision_list.append(precision_score(y_test, y_pred, average='macro'))
    f_score.append(f1_score(y_test, y_pred, average='macro'))

In [132]:
results_df = pd.DataFrame({
    'Max Depth': depths,
    'Accuracy': acc,
    'Recall': recall_list,
    'Precision': precision_list,
    'F1-Score': f_score
})

print(results_df.to_string(index=False))

 Max Depth  Accuracy   Recall  Precision  F1-Score
         1  0.611983 0.500000   0.305991  0.379646
         2  0.611983 0.500000   0.305991  0.379646
         3  0.611983 0.500000   0.305991  0.379646
         4  0.611270 0.500763   0.528396  0.386176
         5  0.612696 0.512692   0.562687  0.434507
         6  0.608417 0.516932   0.550236  0.460005
         7  0.618402 0.532827   0.579272  0.490232
         8  0.614123 0.535721   0.569646  0.504688
         9  0.608417 0.530387   0.558271  0.499083
        10  0.617689 0.543008   0.577196  0.517842
        11  0.625535 0.555136   0.591376  0.536311
        12  0.611270 0.538099   0.565741  0.514084
        13  0.618402 0.547291   0.578888  0.526744
        14  0.621969 0.552895   0.585193  0.535113
        15  0.616262 0.547561   0.575667  0.529518
        16  0.616976 0.552180   0.577733  0.538217
        17  0.613409 0.547921   0.571971  0.532919
        18  0.611270 0.545163   0.568393  0.529321
        19  0.619829 0.555856  

In [133]:
best_accuracy_idx = results_df['Accuracy'].idxmax()
best_recall_idx = results_df['Recall'].idxmax()
best_precision_idx = results_df['Precision'].idxmax()
best_f1_idx = results_df['F1-Score'].idxmax()

print(f"Best Accuracy: Depth {results_df.loc[best_accuracy_idx, 'Max Depth']:.0f} - {results_df.loc[best_accuracy_idx, 'Accuracy']:.4f}")
print(f"Best Recall: Depth {results_df.loc[best_recall_idx, 'Max Depth']:.0f} - {results_df.loc[best_recall_idx, 'Recall']:.4f}")
print(f"Best Precision: Depth {results_df.loc[best_precision_idx, 'Max Depth']:.0f} - {results_df.loc[best_precision_idx, 'Precision']:.4f}")
print(f"Best F1-Score: Depth {results_df.loc[best_f1_idx, 'Max Depth']:.0f} - {results_df.loc[best_f1_idx, 'F1-Score']:.4f}")

Best Accuracy: Depth 20 - 0.6270
Best Recall: Depth 27 - 0.5620
Best Precision: Depth 20 - 0.5934
Best F1-Score: Depth 27 - 0.5505


In [134]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42
)

param_grid = {
    'max_depth': [12],
    'min_samples_leaf': [3],
    'max_features': ['sqrt', 'log2', None]
}

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1_macro', 
    cv=5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)
print("Best F1-score (CV):", grid_search.best_score_)

best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best hyperparameters: {'max_depth': 12, 'max_features': None, 'min_samples_leaf': 3}
Best F1-score (CV): 0.5338695872539612
Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.53      0.26      0.35       544
           1       0.65      0.85      0.73       858

    accuracy                           0.62      1402
   macro avg       0.59      0.56      0.54      1402
weighted avg       0.60      0.62      0.59      1402

Confusion Matrix:
[[143 401]
 [127 731]]


In [135]:
low_importance_features = [
    'Course_Others', 'Course_Medical', 'Course_Law', 
    'Course_Engineering', 'Course_Computer_Science', 
    'Course_Business', 'Chronic_Illness'
]

available_features = [col for col in low_importance_features if col in X.columns]
X_reduced = X.drop(columns=available_features , errors='ignore')


X_train_red, X_test_red, y_train, y_test = train_test_split(
    X_reduced, y, test_size=0.2, random_state=42
)

rf_reduced = RandomForestClassifier(n_estimators=100, random_state=42)
rf_reduced.fit(X_train_red, y_train)

y_pred_red = rf_reduced.predict(X_test_red)

print("Accuracy:", accuracy_score(y_test, y_pred_red))
print("Classification Report:\n", classification_report(y_test, y_pred_red))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_red))


Accuracy: 0.5805991440798859
Classification Report:
               precision    recall  f1-score   support

           0       0.40      0.16      0.23       544
           1       0.61      0.84      0.71       858

    accuracy                           0.58      1402
   macro avg       0.51      0.50      0.47      1402
weighted avg       0.53      0.58      0.53      1402

Confusion Matrix:
 [[ 89 455]
 [133 725]]
