In [72]:
import pandas as pd

# Read the cleaned student dataset; the path is relative to the notebook's folder.
df = pd.read_csv(
    filepath_or_buffer='../data/cleaned_student_data.csv',
)
# Leave the frame as the last expression so the notebook renders it.
df

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,...,health,absences,final_grade,need_counselling,log_absences,low_studytime,has_failures,avg_parent_edu,high_goout,high_walc
0,0,18,1,0,0,4,4,0,4,0,...,3,4,11,0,1.609438,1,0,4.0,1,0
1,0,17,1,0,1,1,1,0,2,0,...,3,2,11,0,1.098612,1,0,1.0,0,0
2,0,15,1,1,1,1,1,0,2,2,...,3,6,12,0,1.945910,1,0,1.0,0,0
3,0,15,1,0,1,4,2,1,3,1,...,5,0,14,0,0.000000,0,0,3.0,0,0
4,0,16,1,0,1,3,3,2,2,1,...,5,0,13,0,0.000000,1,0,3.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,0,19,0,0,1,2,3,3,2,0,...,5,4,10,0,1.609438,0,1,2.5,0,0
645,0,18,1,1,1,3,1,4,3,0,...,1,4,16,0,1.609438,1,0,2.0,1,0
646,0,18,1,0,1,1,1,2,2,0,...,5,6,9,1,1.945910,1,0,1.0,0,0
647,1,17,1,1,1,3,1,3,3,0,...,2,6,10,0,1.945910,1,0,2.0,1,1


In [73]:
from sklearn.model_selection import train_test_split

# Feature matrix: everything except the label itself and `final_grade`.
# NOTE(review): `final_grade` is presumably excluded because the label is derived
# from it -- confirm against the data-cleaning notebook.
X = df.drop(['need_counselling', 'final_grade'], axis=1)

# Binary target the models below are trained to predict.
y = df['need_counselling']

In [74]:
# 80/20 hold-out split with a fixed seed; stratifying on y keeps the class
# proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [75]:
# Sanity-check the split: sizes first, then the class proportions of each part.
train_dims, test_dims = X_train.shape, X_test.shape
print("Train shape:", train_dims)
print("Test shape:", test_dims)

train_class_share = y_train.value_counts(normalize=True).to_dict()
test_class_share = y_test.value_counts(normalize=True).to_dict()
print("Target distribution in train:", train_class_share)
print("Target distribution in test:", test_class_share)

Train shape: (519, 35)
Test shape: (130, 35)
Target distribution in train: {0: 0.8458574181117534, 1: 0.15414258188824662}
Target distribution in test: {0: 0.8461538461538461, 1: 0.15384615384615385}


**Logistic Regression**


In [76]:
from sklearn.linear_model import LogisticRegression


# Baseline linear model. class_weight='balanced' weights each class inversely to
# its frequency, which matters here because positives are the minority class.
logreg = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Hard class predictions on the held-out split.
y_pred = logreg.predict(X_test)


In [77]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Compute the three evaluation summaries for the logistic-regression predictions,
# then print them in the same order and format as the other model sections.
logreg_accuracy = accuracy_score(y_test, y_pred)
logreg_report = classification_report(y_test, y_pred)
logreg_confusion = confusion_matrix(y_test, y_pred)

print("\nAccuracy Score: ", logreg_accuracy)
print("\nClassification Report: \n", logreg_report)
print("\nConfusion Matrix: \n", logreg_confusion)


Accuracy Score:  0.8

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.85      0.88       110
           1       0.38      0.50      0.43        20

    accuracy                           0.80       130
   macro avg       0.64      0.68      0.66       130
weighted avg       0.82      0.80      0.81       130


Confusion Matrix: 
 [[94 16]
 [10 10]]


**Random Forest**

In [78]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters for the class-weighted random forest, gathered in one place.
rf_params = {
    'n_estimators': 100,
    'max_depth': None,           # trees are grown without a depth limit
    'class_weight': 'balanced',  # compensate for the minority positive class
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_params)

In [79]:
# Train the forest on the original (non-resampled) training split.
rf_model.fit(
    X=X_train,
    y=y_train,
)

In [80]:
# Hard class predictions of the random forest on the held-out split.
y_pred_rf = rf_model.predict(
    X=X_test,
)

In [81]:
# Evaluate the random-forest predictions: accuracy, per-class report, confusion matrix.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)

print("\nAccuracy Score: ", rf_accuracy)
print("\nClassification Report: \n", rf_report)
print("\nConfusion Matrix: \n", rf_confusion)


Accuracy Score:  0.8384615384615385

Classification Report: 
               precision    recall  f1-score   support

           0       0.85      0.98      0.91       110
           1       0.33      0.05      0.09        20

    accuracy                           0.84       130
   macro avg       0.59      0.52      0.50       130
weighted avg       0.77      0.84      0.78       130


Confusion Matrix: 
 [[108   2]
 [ 19   1]]


<h1>SMOTE</h1>
<h3>Using SMOTE to create synthetic samples of class 1 so that we get balanced training data</h3>

In [82]:
from imblearn.over_sampling import SMOTE

# Oversampler for the minority class; the fixed seed keeps the synthetic
# samples reproducible between runs.
smote = SMOTE(
    random_state=42,
)

In [83]:
# Resample the training split only; the test split is left untouched so the
# evaluation still reflects the original class distribution.
X_train_resampled, y_train_resampled = smote.fit_resample(
    X=X_train,
    y=y_train,
)



In [84]:
# Confirm the class counts of the resampled training labels.
resampled_class_counts = y_train_resampled.value_counts()
print("After SMOTE:\n", resampled_class_counts)


After SMOTE:
 need_counselling
0    439
1    439
Name: count, dtype: int64


In [85]:
# Random forest trained on the SMOTE-resampled data. No class_weight is set
# here because the resampled training labels are already evenly split.
rf_smote_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=None,
).fit(X_train_resampled, y_train_resampled)

# Predictions are made on the original, non-resampled test split.
y_pred_smote = rf_smote_model.predict(X_test)

rf_smote_accuracy = accuracy_score(y_test, y_pred_smote)
rf_smote_report = classification_report(y_test, y_pred_smote)
rf_smote_confusion = confusion_matrix(y_test, y_pred_smote)

print("Accuracy:", rf_smote_accuracy)
print("\nClassification Report:\n", rf_smote_report)
print("\nConfusion Matrix:\n", rf_smote_confusion)


Accuracy: 0.8307692307692308

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.94      0.90       110
           1       0.42      0.25      0.31        20

    accuracy                           0.83       130
   macro avg       0.64      0.59      0.61       130
weighted avg       0.80      0.83      0.81       130


Confusion Matrix:
 [[103   7]
 [ 15   5]]


<h1>XGBoost</h1>

In [86]:
import xgboost as xgb

# Count the training labels once, then look up each class size by its label:
# 0 is the negative class, 1 the positive class.
train_class_counts = y_train.value_counts()
neg = train_class_counts.loc[0]
pos = train_class_counts.loc[1]

# Display the number of positive training samples.
pos

np.int64(80)

In [87]:
# Ratio of negative to positive training samples; passed to XGBoost below so
# that the minority (positive) class receives proportionally more weight.
scale_pos_weight = neg / pos
print(
    f"scale_pos_weight = {scale_pos_weight:.2f}"
)

scale_pos_weight = 5.49


In [88]:
# Gradient-boosted trees for the imbalanced counselling target.
# - scale_pos_weight is the neg / pos ratio of the training labels computed
#   earlier in this notebook; it up-weights the minority positive class.
# - eval_metric='logloss' sets the metric XGBoost reports during training.
# - The former `use_label_encoder=False` argument was removed: the installed
#   XGBoost does not use it and only emitted a "Parameters ... are not used"
#   warning when the model was fitted.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    random_state=42,
)

In [89]:
# Train the boosted model on the original (non-resampled) training split;
# class imbalance is handled through scale_pos_weight rather than resampling.
xgb_model.fit(
    X=X_train,
    y=y_train,
)

Parameters: { "use_label_encoder" } are not used.



In [90]:
# Hard class predictions of the XGBoost model on the held-out split.
y_pred_xgb = xgb_model.predict(X_test)

xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_report = classification_report(y_test, y_pred_xgb)
xgb_confusion = confusion_matrix(y_test, y_pred_xgb)

print("Accuracy:", xgb_accuracy)
print("\nClassification Report:\n", xgb_report)
print("\nConfusion Matrix:\n", xgb_confusion)

Accuracy: 0.823076923076923

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.92      0.90       110
           1       0.40      0.30      0.34        20

    accuracy                           0.82       130
   macro avg       0.64      0.61      0.62       130
weighted avg       0.80      0.82      0.81       130


Confusion Matrix:
 [[101   9]
 [ 14   6]]


In [91]:
import joblib

import os

# Persist the held-out features and the fitted XGBoost model.
# NOTE: joblib files are pickle-based -- only load them from trusted sources.
x_test_path = '../data/X_test.pkl'
xgb_model_path = '../models/xgb_model.pkl'

# joblib.dump does not create missing folders, so make sure both target
# directories exist (a fresh checkout may not contain ../models/).
for artifact_path in (x_test_path, xgb_model_path):
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

joblib.dump(X_test, x_test_path)

# Kept as the last expression so the cell still displays the written file name.
joblib.dump(xgb_model, xgb_model_path)


['../models/xgb_model.pkl']