In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
## Load the student-performance CSV into a DataFrame.
## The space before the underscore is part of the file-name literal; keep it as written.
df = pd.read_csv("Student_performance_data _.csv")

## Preview the first five rows of the dataframe
df.head(n=5)

In [None]:
## Structural overview: (rows, columns) first, then the per-column summary from df.info()
n_rows, n_cols = df.shape
print("Shape:", (n_rows, n_cols))

df.info()

In [None]:
## Binary target: compare every GPA to 2.5 (True/False), then cast so True = 1 (AtRisk) and False = 0
df["AtRisk"] = df["GPA"].lt(2.5).astype(int)

## Display the first 10 rows of GPA next to the derived AtRisk column as a sanity check
df.loc[:, ["GPA", "AtRisk"]].head(10)

In [None]:
target = 'AtRisk'

## Columns excluded from the inputs: the ID, the target itself, and the columns that leak target info
## (AtRisk is computed directly from GPA in an earlier cell)
drop_cols = ["StudentID", "GPA", "GradeClass", target]

## Every remaining column, in its original order, is a feature
is_feature = ~df.columns.isin(drop_cols)
feature_cols = df.columns[is_feature].tolist()

x = df.loc[:, feature_cols]
y = df[target]

## Sanity check
print("Features:", feature_cols)
print("X shape:", x.shape, "Y shape:", y.shape)

In [None]:
## Split into 80% train / 20% test.
##   x = features, y = target
##   stratify = y     -> keep the same proportion of classes in the train and test sets
##   random_state = 42 -> fixed random seed so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print("Train:", x_train.shape, "Test:", x_test.shape)
print("AtRisk rate (train):", y_train.mean(), "AtRisk rate (test):", y_test.mean())

## The closer the train mean and the test mean are to each other, the better each set represents the data

In [None]:
## Pipeline - every time data goes in, run these steps in this order.
## Each step is a (name, estimator) tuple, which is the form the Pipeline API documents.
model = Pipeline(steps=[
    ("scaler", StandardScaler()),  ## Standardize features by removing the mean and scaling to unit variance
    ("clf", LogisticRegression(max_iter=2000)),  ## Logistic Regression classifier with max iterations set to 2000
])

model.fit(x_train, y_train)  ## Train the model

## scaler - Rescales numeric features so they are comparable. Each feature ends up with mean = 0 and std = 1.
## clf    - Learns a weighted formula and pushes it through a sigmoid function to output a probability between 0 and 1.
##          Probability > 0.5 = AtRisk, otherwise not AtRisk.
##          max_iter = 2000 prevents the convergence warning and makes training stable:
##          the solver guesses weights and improves them repeatedly until convergence.
## fit    - Fits the scaler using only the training data, then trains the logistic regression on the
##          scaled training data. This prevents data leakage and keeps the evaluation honest.


In [None]:
y_pred = model.predict(x_test)  ## Take students the model has never seen before and guess whether they are AtRisk or not

print("Accuracy:", accuracy_score(y_test, y_pred))  ## Out of all test students, how many did we get right?
## Author's note: about 70% of students are AtRisk, so a model that always predicts AtRisk
## would already get about 70% - accuracy alone is not enough.

## The label and the result are printed with separate print() calls: passing both to one print()
## inserts a separator space in front of the result and pushes its first row out of alignment.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
## [[TN, FP] --> True Negative - correctly predicted notAtRisk, False Positive - incorrectly predicted AtRisk (fine)
##  [FN, TP]] --> False Negative - incorrectly predicted notAtRisk (not fine), True Positive - correctly predicted AtRisk

print("\nReport:")
print(classification_report(y_test, y_pred))  ## Precision, recall, f1-score for each class
## precision (1) - Of all students predicted AtRisk, how many were actually AtRisk?
## recall (1)    - Of all students who were actually AtRisk, how many did we correctly identify?
##                 Low recall --> the model misses struggling students
## f1-score      - Balance between precision and recall.

## If recall for class 1 is below 0.6, we need to fix it; if it's above 0.6, we are good.

In [None]:
clf = model.named_steps['clf']  ## Trained logistic regression model inside the pipeline
weights = pd.Series(clf.coef_[0], index=feature_cols).sort_values()
## clf.coef_[0] is the weight of each feature, in the same order as the feature_cols list
## pd.Series labels each weight with its feature name so it is easier to read
## .sort_values() sorts the weights in ascending order; negative weights belong to features
## that decrease the likelihood of being AtRisk

## weights is already sorted ascending, so the first 10 entries are the most negative ones:
## which features most strongly reduce the chance of being AtRisk?
weights.head(10)

In [None]:
one_student = x_test.iloc[[0]]  ## Select one row from the test set. [[0]] keeps it as a df, not a Series: 2d [rows, features]
pred = model.predict(one_student)[0]  ## Predict whether the student is AtRisk (1) or not (0)

class_probs = model.predict_proba(one_student)[0]
## model.predict_proba returns a 2d array with one row per student and one column per class,
## with columns in the order given by model.classes_. [0] takes the row for our single student.
class_labels = list(model.classes_)

## Look columns up by class label instead of assuming the label equals the column position.
prob = class_probs[class_labels.index(pred)]    ## Probability of the predicted class (the model's confidence)
risk_prob = class_probs[class_labels.index(1)]  ## Probability of class 1 (AtRisk), whatever was predicted

print("Prediction (1 = AtRisk; 0 = Not AtRisk):", pred)
## Report the class-1 probability under the "Risk" label; the predicted-class probability would
## show a high "risk" for a student predicted Not AtRisk.
print("Risk Probability:", risk_prob)
print("Confidence in Prediction:", prob)
print("\nStudent Features:\n", one_student)