In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
sb.set()
sb.set_palette(sb.color_palette("Set2"))

# Additional Libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Import the Dataset
df = pd.read_csv('datasets/student-dropout-academic-success-raw.csv')
clean_df = df.drop(df[df.Target == "Enrolled"].index)
student_df = clean_df.iloc[:,[1, 13, 14, 15, 16, 17, 22, 23, 28, 29, 34]].copy()
student_df["Target"] = student_df["Target"].map({
    "Dropout": 1,
    "Graduate": 0
})

In [2]:
predictors = student_df.iloc[:, 0:10] # Predictors
response = student_df.iloc[:, -1] # Response = Target

# Partition Dataset into 2 random portions - 80% Train, 20% Test
predictors_train, predictors_test, response_train, response_test = train_test_split(predictors, response, test_size = 0.2)

In [3]:
# Check sample sizes
print("Train Set :", predictors_train.shape, response_train.shape)
print("Test Set  :", predictors_test.shape, response_test.shape)

Train Set : (3539, 10) (3539,)
Test Set  : (885, 10) (885,)


#### Machine Learning Model

**Models:**
- Logistic Regression
- Decision Tree Classifier
- Random Forest Classifier
- AdaBoost
- XGBoost

**Metric:** 
- Accuracy Score with CV
- Precision Score
- Recall Score
- Explained Variance (R^2)
- Root Mean Squared Error (RMSE)

#### Logistic Regression

In [8]:
logreg = LogisticRegression(solver='lbfgs', max_iter=1000)
logreg.fit(predictors_train, response_train)

LogisticRegression(max_iter=1000)

In [9]:
# Coefficients of the Logistic Regression line
print('Coefficients \t: a = ', logreg.coef_)
print('Intercept \t: b = ', logreg.intercept_)

Coefficients 	: a =  [[ 0.02384679  0.37421748 -1.58463013  0.16639068 -0.4497374   0.0297742
   0.08579978  0.01161992 -0.29973872 -0.0791188 ]
 [ 0.01750594  0.26851145  0.34186066  0.03204492 -0.26790359 -0.02391971
  -0.09799599  0.00962851 -0.05510726  0.06279351]
 [-0.04135273 -0.64272893  1.24276947 -0.1984356   0.71764099 -0.00585448
   0.01219621 -0.02124844  0.35484599  0.01632529]]
Intercept 	: b =  [ 2.03399156 -0.15349066 -1.8805009 ]


In [10]:
response_train_pred = logreg.predict(predictors_train)
response_test_pred = logreg.predict(predictors_test)

In [11]:

scores = cross_val_score(logreg, predictors_train, response_train, cv=10)
mse_train = np.mean(np.square(np.array(response_train) - np.array(response_train_pred)))

print("Explained Variance (R^2):", logreg.score(predictors_train, response_train))
print("Mean Squared Error (MSE):", mse_train)
print("Root Mean Squared Error (RMSE):", np.sqrt(mse_train), "\n")

print("Accuracy Score without CV: ",accuracy_score(response_test, response_test_pred))
print("Accuracy Score with CV: ",scores.mean())
print("Precision Score: ", precision_score(response_test, response_test_pred, average = 'macro'))
print("Recall Score: ", recall_score(response_test, response_test_pred, average = 'macro'))
print("F1 Score: ", f1_score(response_test, response_test_pred, average = 'macro'))

Explained Variance (R^2): 0.7428652161627578
Mean Squared Error (MSE): 0.5046623339926533
Root Mean Squared Error (RMSE): 0.710395899476238 

Accuracy Score without CV:  0.7107344632768362
Accuracy Score with CV:  0.7428674316992365
Precision Score:  0.617612706164108
Recall Score:  0.5924676648001097
F1 Score:  0.5807360533358834


#### Decision Tree Classifier

In [12]:
dectree = DecisionTreeClassifier()
dectree.fit(predictors_train, response_train)

DecisionTreeClassifier()

In [13]:
response_train_pred = dectree.predict(predictors_train)
response_test_pred = dectree.predict(predictors_test)

In [14]:
scores = cross_val_score(dectree, predictors_train, response_train, cv=10)
mse_train = np.mean(np.square(np.array(response_train) - np.array(response_train_pred)))

print("Explained Variance (R^2):", dectree.score(predictors_train, response_train))
#print("Mean Squared Error (MSE):", mse_train)
print("Root Mean Squared Error (RMSE):", np.sqrt(mse_train), "\n")

#print("Accuracy Score without CV: ",accuracy_score(response_test, response_test_pred))
print("Accuracy Score with CV: ",scores.mean())
print("Precision Score: ", precision_score(response_test, response_test_pred, average = 'macro'))
print("Recall Score: ", recall_score(response_test, response_test_pred, average = 'macro'))
#print("F1 Score: ", f1_score(response_test, response_test_pred, average = 'macro'))

Explained Variance (R^2): 0.9819157954224357
Mean Squared Error (MSE): 0.04944899689177734
Root Mean Squared Error (RMSE): 0.22237130411043898 

Accuracy Score without CV:  0.6621468926553672
Accuracy Score with CV:  0.6634712952737633
Precision Score:  0.6015466713579921
Recall Score:  0.6001588418910226
F1 Score:  0.6008109009571571


#### Random Forest Classifier

In [15]:
randfclf = RandomForestClassifier(max_depth = 10, random_state = 0)
randfclf.fit(predictors_train, response_train)

RandomForestClassifier(max_depth=10, random_state=0)

In [16]:
response_train_pred = randfclf.predict(predictors_train)
response_test_pred = randfclf.predict(predictors_test)

In [17]:
scores = cross_val_score(randfclf, predictors_train, response_train, cv=10)
mse_train = np.mean(np.square(np.array(response_train) - np.array(response_train_pred)))

print("Explained Variance (R^2):", randfclf.score(predictors_train, response_train))
print("Mean Squared Error (MSE):", mse_train)
print("Root Mean Squared Error (RMSE):", np.sqrt(mse_train), "\n")

print("Accuracy Score without CV: ",accuracy_score(response_test, response_test_pred))
print("Accuracy Score with CV: ",scores.mean())
print("Precision Score: ", precision_score(response_test, response_test_pred, average = 'macro'))
print("Recall Score: ", recall_score(response_test, response_test_pred, average = 'macro'))
print("F1 Score: ", f1_score(response_test, response_test_pred, average = 'macro'))

Explained Variance (R^2): 0.8736931336535745
Mean Squared Error (MSE): 0.25939530940943767
Root Mean Squared Error (RMSE): 0.5093086582902726 

Accuracy Score without CV:  0.7288135593220338
Accuracy Score with CV:  0.7541652662409373
Precision Score:  0.6596718963373168
Recall Score:  0.6254292685627512
F1 Score:  0.6274889756420006
