In [15]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
## Load the student-performance CSV into a DataFrame.
## The path is relative, so it is resolved against the current working directory.
## The space before the final underscore is part of the literal and is kept exactly as-is.
csv_path = 'Student_performance_data _.csv'
df = pd.read_csv(csv_path)

## Preview the first five rows (5 is also head()'s default).
df.head(5)

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


In [20]:
## Structural overview: (rows, columns) first, then per-column non-null counts and dtypes.
print(f"Shape: {df.shape}")

df.info()

Shape: (2392, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   StudentID          2392 non-null   int64  
 1   Age                2392 non-null   int64  
 2   Gender             2392 non-null   int64  
 3   Ethnicity          2392 non-null   int64  
 4   ParentalEducation  2392 non-null   int64  
 5   StudyTimeWeekly    2392 non-null   float64
 6   Absences           2392 non-null   int64  
 7   Tutoring           2392 non-null   int64  
 8   ParentalSupport    2392 non-null   int64  
 9   Extracurricular    2392 non-null   int64  
 10  Sports             2392 non-null   int64  
 11  Music              2392 non-null   int64  
 12  Volunteering       2392 non-null   int64  
 13  GPA                2392 non-null   float64
 14  GradeClass         2392 non-null   float64
dtypes: float64(3), int64(12)
memory usage: 280.4 KB


In [32]:
## Binary label: 1 when GPA is strictly below 2.5, otherwise 0.
gpa_below_cutoff = df["GPA"].lt(2.5)  ## boolean mask, one True/False per student
df["AtRisk"] = gpa_below_cutoff.astype(int)  ## True -> 1, False -> 0

## Spot-check the label next to the GPA it was derived from (first 10 rows).
df.loc[:, ["GPA", "AtRisk"]].head(10)

Unnamed: 0,GPA,AtRisk
0,2.929196,0
1,3.042915,0
2,0.112602,1
3,2.054218,1
4,1.288061,1
5,3.084184,0
6,2.748237,0
7,1.360143,1
8,2.896819,0
9,3.573474,0


In [None]:
target = 'AtRisk'

## Columns excluded from the predictors: the row identifier, the label itself,
## GPA (AtRisk is computed directly from it) and GradeClass (treated here as
## leaking target information as well).
drop_cols = ["StudentID", "GPA", "GradeClass", "AtRisk"]

## Every remaining column becomes a feature, in the DataFrame's own column order.
keep_mask = ~df.columns.isin(drop_cols)
feature_cols = df.columns[keep_mask].tolist()

x = df.loc[:, feature_cols]
y = df.loc[:, target]

## Sanity check: which features were kept, and that x and y have the same number of rows.
print("Features:", feature_cols)
print("X shape:", x.shape, "Y shape:", y.shape)

Features: ['Age', 'Gender', 'Ethnicity', 'ParentalEducation', 'StudyTimeWeekly', 'Absences', 'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports', 'Music', 'Volunteering']
X shape: (2392, 12) Y shape: (2392,)


In [41]:
## Hold out 20% of the rows for testing (80% train / 20% test).
## random_state = 42 seeds the shuffle so the split is reproducible;
## stratify = y keeps the AtRisk proportion approximately equal in both partitions.
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_kwargs)

print("Train:", x_train.shape, "Test:", x_test.shape)
print("AtRisk rate (train):", y_train.mean(), "AtRisk rate (test):", y_test.mean())

## The closer the train rate and the test rate are, the better each partition
## represents the class balance of the full data set.

Train: (1913, 12) Test: (479, 12)
AtRisk rate (train): 0.7046523784631469 AtRisk rate (test): 0.7056367432150313


In [None]:
## Pipeline: every time data goes in, the steps below run in this order.
##   1. "scaler" - StandardScaler rescales each feature to mean 0 and standard deviation 1
##      so that the features are on comparable scales.
##   2. "clf"    - LogisticRegression learns a weighted formula over the scaled features
##      and pushes it through a sigmoid to output a probability between 0 and 1.
##      Probability > 0.5 = AtRisk, otherwise not AtRisk.
##      max_iter = 2000 raises the solver's iteration cap: the solver guesses weights
##      and improves them repeatedly until convergence, and the larger cap gives it
##      room to get there without a convergence warning.
## Each step is a (name, estimator) tuple, which is the form the Pipeline API documents.
model = Pipeline(steps = [
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter = 2000)),
])

## fit() fits the scaler using only the training data and then trains the logistic
## regression on the scaled training data. Keeping the test set out of both steps
## prevents data leakage and keeps the later evaluation honest.
model.fit(x_train, y_train)


0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('scaler', ...), ['clf', LogisticRegre...max_iter=2000)]]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"copy  copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.",True
,"with_mean  with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.",True
,"with_std  with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).",True

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. 
seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [None]:
## Predict AtRisk / not AtRisk for the held-out students, which were not used in fit().
y_pred = model.predict(x_test)

## Accuracy: out of all test students, what share did we get right?
## About 70% of students are AtRisk, so a model that always predicts AtRisk already
## scores ~0.70 - accuracy alone is not enough.
print("Accuracy:", accuracy_score(y_test, y_pred))

## Confusion matrix layout (rows = actual class, columns = predicted class):
##   [[TN, FP]   TN - correctly predicted not AtRisk;  FP - incorrectly predicted AtRisk (fine)
##    [FN, TP]]  FN - incorrectly predicted not AtRisk (not fine);  TP - correctly predicted AtRisk
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

## Precision, recall and f1-score for each class.
##   precision (1) - of all students predicted AtRisk, how many were actually AtRisk?
##   recall (1)    - of all students who were actually AtRisk, how many did we identify?
##                   Low recall --> the model misses struggling students.
##   f1-score      - balance between precision and recall.
## Rule of thumb used here: if recall for class 1 is below 0.6 the model needs fixing;
## above 0.6 is considered acceptable.
## The report is started on its own line so that its column headers line up with the
## values underneath; printing it on the same line as the label shifts the header right.
print("\nReport:\n" + classification_report(y_test, y_pred))

Accuracy: 0.9269311064718163

Confusion Matrix:
 [[121  20]
 [ 15 323]]

Report               precision    recall  f1-score   support

           0       0.89      0.86      0.87       141
           1       0.94      0.96      0.95       338

    accuracy                           0.93       479
   macro avg       0.92      0.91      0.91       479
weighted avg       0.93      0.93      0.93       479



In [None]:
## Trained logistic regression step inside the pipeline.
clf = model.named_steps['clf']

## clf.coef_[0] holds one weight per feature, in the same order as feature_cols;
## wrapping it in a Series labels each weight with its feature name.
## sort_values() orders the weights ascending, so the most negative weights (features
## that decrease the likelihood of being AtRisk) come first.
weights = pd.Series(clf.coef_[0], index=feature_cols).sort_values()

## `weights` is already sorted, so head(10) is the 10 lowest weights without sorting again.
## Only the negative entries reduce the chance of being AtRisk; entries near zero have
## little effect, and any positive entry that shows up in the list increases it.
weights.head(10)

ParentalSupport     -1.272695
StudyTimeWeekly     -1.236855
Tutoring            -0.894937
Extracurricular     -0.846089
Sports              -0.608827
Music               -0.397935
Age                 -0.077018
Gender              -0.065521
Ethnicity           -0.039360
ParentalEducation    0.005495
dtype: float64

In [None]:
## Select one row from the test set. [[0]] keeps it as a one-row DataFrame rather than
## a Series, i.e. a 2-D [rows, features] input for predict() / predict_proba().
one_student = x_test.iloc[[0]]

## Predicted class for that student (1 = AtRisk, 0 = Not AtRisk).
pred = model.predict(one_student)[0]

## predict_proba returns a 2-D array: one row per student, one probability per class,
## with columns ordered like model.classes_. [0] takes the row for our single student.
class_probs = model.predict_proba(one_student)[0]

## The value printed as "Risk Probability" must always be P(AtRisk), i.e. class 1.
## Indexing by `pred` would instead report P(Not AtRisk) whenever the prediction is 0,
## so the class-1 column is looked up explicitly.
at_risk_idx = list(model.classes_).index(1)
prob = class_probs[at_risk_idx]

print("Prediction (1 = AtRisk; 0 = Not AtRisk):", pred)
print("Risk Probability:", prob)
print("\nStudent Features:\n", one_student)

Prediction (1 = AtRisk; 0 = Not AtRisk): 1
Risk Probability: 0.9372492474159437

Student Features:
       Age  Gender  Ethnicity  ParentalEducation  StudyTimeWeekly  Absences  \
1905   16       1          0                  0         9.396909        13   

      Tutoring  ParentalSupport  Extracurricular  Sports  Music  Volunteering  
1905         0                3                1       0      0             1  
