In [3]:
import pandas as pd

# Load the student dataset (already cleaned, going by the file name). The path
# is relative, so it resolves against the notebook's working directory.
DATA_PATH = '../data/cleaned_student_data.csv'
df = pd.read_csv(DATA_PATH)

In [4]:
from sklearn.model_selection import train_test_split

# Separate the label vector from the feature matrix.
target_column = 'need_counselling'
y = df[target_column]

# The features exclude the label itself and 'final_grade'.
# NOTE(review): presumably 'final_grade' is dropped because the label is
# derived from it (target leakage) — confirm against the cleaning notebook.
X = df.drop(columns=[target_column, 'final_grade'])

In [5]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions equal in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:
# Sanity-check the split: partition sizes and the share of each class.
train_class_share = y_train.value_counts(normalize=True).to_dict()
test_class_share = y_test.value_counts(normalize=True).to_dict()

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("Target distribution in train:", train_class_share)
print("Target distribution in test:", test_class_share)

Train shape: (519, 36)
Test shape: (130, 36)
Target distribution in train: {0: 0.8458574181117534, 1: 0.15414258188824662}
Target distribution in test: {0: 0.8461538461538461, 1: 0.15384615384615385}


**Logistic Regression**


In [7]:
from sklearn.linear_model import LogisticRegression


# Baseline linear model. class_weight='balanced' reweights samples inversely to
# class frequency, compensating for the minority positive class (~15% of rows).
# NOTE(review): the features are fed in as-is; if the cleaning step did not
# already standardize them, scaling may help convergence — confirm.
logreg = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = logreg.predict(X_test)


In [8]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Compute the metrics first, then print them. In the confusion matrix, rows
# are the true classes and columns are the predicted classes.
logreg_accuracy = accuracy_score(y_test, y_pred)
logreg_report = classification_report(y_test, y_pred)
logreg_confusion = confusion_matrix(y_test, y_pred)

print("\nAccuracy Score: ", logreg_accuracy)
print("\nClassification Report: \n", logreg_report)
print("\nConfusion Matrix: \n", logreg_confusion)


Accuracy Score:  0.8

Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.84      0.88       110
           1       0.40      0.60      0.48        20

    accuracy                           0.80       130
   macro avg       0.66      0.72      0.68       130
weighted avg       0.84      0.80      0.82       130


Confusion Matrix: 
 [[92 18]
 [ 8 12]]


**Random Forest**

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Untuned baseline forest: 100 trees with no depth cap, and class weights
# balanced to offset the skewed label distribution.
rf_settings = {
    "n_estimators": 100,
    "max_depth": None,
    "class_weight": 'balanced',
    "random_state": 42,
}
rf_model = RandomForestClassifier(**rf_settings)

In [10]:
# Train on the original (non-resampled) training split; fit() returns the
# estimator, so the cell displays its repr.
rf_model.fit(X=X_train, y=y_train)

In [11]:
# Hard class predictions from the class-weighted forest on the held-out rows.
y_pred_rf = rf_model.predict(X=X_test)

In [12]:
# Same three metrics as for the logistic regression, for a side-by-side comparison.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)

print("\nAccuracy Score: ", rf_accuracy)
print("\nClassification Report: \n", rf_report)
print("\nConfusion Matrix: \n", rf_confusion)


Accuracy Score:  0.8538461538461538

Classification Report: 
               precision    recall  f1-score   support

           0       0.86      0.98      0.92       110
           1       0.60      0.15      0.24        20

    accuracy                           0.85       130
   macro avg       0.73      0.57      0.58       130
weighted avg       0.82      0.85      0.81       130


Confusion Matrix: 
 [[108   2]
 [ 17   3]]


<h1>SMOTE</h1>
<h3>Using SMOTE to create synthetic samples of class 1 so that the training data is balanced</h3>

In [13]:
from imblearn.over_sampling import SMOTE

# Oversampler with a fixed seed, so the synthetic minority rows it generates
# are the same on every run.
smote_seed = 42
smote = SMOTE(random_state=smote_seed)

In [14]:
# Resample the training split only; the test split is left untouched so the
# evaluation still reflects the real class distribution.
X_train_resampled, y_train_resampled = smote.fit_resample(X=X_train, y=y_train)



In [15]:
# Confirm that both classes now have the same number of training rows.
resampled_class_counts = y_train_resampled.value_counts()
print("After SMOTE:\n", resampled_class_counts)


After SMOTE:
 need_counselling
0    439
1    439
Name: count, dtype: int64


In [16]:
# No class_weight here: the SMOTE-resampled training set is already balanced,
# so the forest is trained with uniform class weights.
rf_smote_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Evaluate on the original, non-resampled test split.
y_pred_smote = rf_smote_model.predict(X_test)

smote_accuracy = accuracy_score(y_test, y_pred_smote)
smote_report = classification_report(y_test, y_pred_smote)
smote_confusion = confusion_matrix(y_test, y_pred_smote)

print("Accuracy:", smote_accuracy)
print("\nClassification Report:\n", smote_report)
print("\nConfusion Matrix:\n", smote_confusion)


Accuracy: 0.823076923076923

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.93      0.90       110
           1       0.38      0.25      0.30        20

    accuracy                           0.82       130
   macro avg       0.63      0.59      0.60       130
weighted avg       0.80      0.82      0.81       130


Confusion Matrix:
 [[102   8]
 [ 15   5]]


<h1>XGBoost</h1>

In [19]:
import xgboost as xgb

# Class distribution in the training data. The counts are computed once and
# looked up by label with .loc (0 = negative/majority, 1 = positive/minority),
# instead of calling value_counts() twice and relying on [] to be label-based.
train_class_counts = y_train.value_counts()
neg, pos = train_class_counts.loc[0], train_class_counts.loc[1]

pos

np.int64(80)

In [20]:
# Ratio of negative to positive training samples; it is passed to the XGBoost
# classifier below as its positive-class weight.
scale_pos_weight = neg / pos
print(f"scale_pos_weight = {scale_pos_weight:.2f}")

scale_pos_weight = 5.49


In [21]:
# Gradient-boosted trees with the positive class up-weighted by the
# negative/positive ratio computed above.
# `use_label_encoder` is intentionally not passed: the installed XGBoost no
# longer recognizes it and warns that the parameter is "not used" on fit().
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    random_state=42,
)

In [22]:
# Train on the original (non-resampled) split; the class imbalance is handled
# through scale_pos_weight rather than through oversampling.
xgb_model.fit(X=X_train, y=y_train)

Parameters: { "use_label_encoder" } are not used.



In [24]:
# Hard class predictions on the held-out rows, then the same metrics as the
# SMOTE forest above.
y_pred_xgb = xgb_model.predict(X=X_test)

xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_report = classification_report(y_test, y_pred_xgb)
xgb_confusion = confusion_matrix(y_test, y_pred_xgb)

print("Accuracy:", xgb_accuracy)
print("\nClassification Report:\n", xgb_report)
print("\nConfusion Matrix:\n", xgb_confusion)

Accuracy: 0.823076923076923

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.91      0.90       110
           1       0.41      0.35      0.38        20

    accuracy                           0.82       130
   macro avg       0.65      0.63      0.64       130
weighted avg       0.81      0.82      0.82       130


Confusion Matrix:
 [[100  10]
 [ 13   7]]
