# Building a Student Intervention System

In [1]:
# Import libraries
import time
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import f1_score, make_scorer

In [2]:
# Read student data
student_data = pd.read_csv("student-data.csv")
print "Student data read successfully!"

Student data read successfully!


In [3]:
n_students = student_data.shape[0]
n_features = student_data.shape[1] - 1
n_passed = student_data[student_data['passed'] == 'yes'].shape[0]
n_failed = student_data[student_data['passed'] == 'no'].shape[0]
grad_rate = (float(n_passed) / n_students) * 100
print "Total number of students: {}".format(n_students)
print "Number of students who passed: {}".format(n_passed)
print "Number of students who failed: {}".format(n_failed)
print "Number of features: {}".format(n_features)
print "Graduation rate of the class: {:.2f}%".format(grad_rate)

Total number of students: 395
Number of students who passed: 265
Number of students who failed: 130
Number of features: 30
Graduation rate of the class: 67.09%


In [4]:
# Extract feature (X) and target (y) columns
feature_cols = list(student_data.columns[:-1])  # all columns but last are features
target_col = student_data.columns[-1]  # last column is the target/label
print "Feature column(s):-\n{}".format(feature_cols)
print "Target column: {}".format(target_col)

X_all = student_data[feature_cols]  # feature values for all students
y_all = student_data[target_col]  # corresponding targets/labels
print "\nFeature values:-"
print X_all.head()  # print the first 5 rows

Feature column(s):-
['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']
Target column: passed

Feature values:-
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher   
1     GP   F   17       U     GT3       T     1     1  at_home     other   
2     GP   F   15       U     LE3       T     1     1  at_home     other   
3     GP   F   15       U     GT3       T     4     2   health  services   
4     GP   F   16       U     GT3       T     3     3    other     other   

    ...    higher internet  romantic  famrel  freetime goout Dalc Walc health  \
0   ...       yes       no        no       4         3     4    1    1      3   
1   ...    

In [5]:
# Preprocess feature columns
def preprocess_features(X):
    outX = pd.DataFrame(index=X.index)  # output dataframe, initially empty

    # Check each column
    for col, col_data in X.iteritems():
        # If data type is non-numeric, try to replace all yes/no values with 1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])
        # Note: This should change the data type for yes/no columns to int

        # If still non-numeric, convert to one or more dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix=col)  # e.g. 'school' => 'school_GP', 'school_MS'

        outX = outX.join(col_data)  # collect column(s) in output dataframe

    return outX

X_all = preprocess_features(X_all)
print "Processed feature columns ({}):-\n{}".format(len(X_all.columns), list(X_all.columns))

Processed feature columns (48):-
['school_GP', 'school_MS', 'sex_F', 'sex_M', 'age', 'address_R', 'address_U', 'famsize_GT3', 'famsize_LE3', 'Pstatus_A', 'Pstatus_T', 'Medu', 'Fedu', 'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home', 'reason_other', 'reason_reputation', 'guardian_father', 'guardian_mother', 'guardian_other', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']


In [6]:
num_all = student_data.shape[0] 
num_train = 300  # about 75% of the data
num_test = num_all - num_train

# Shuffle the data or randomly select samples to avoid any bias due to ordering in the dataset
X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                    y_all,
                                                    test_size=num_test,
                                                    random_state=88,
                                                    stratify=y_all)

print "Training set: {} samples".format(X_train.shape[0])
print "Test set: {} samples".format(X_test.shape[0])

Training set: 300 samples
Test set: 95 samples


In [7]:
# Train a model
def train_classifier(clf, X_train, y_train):
    print "Training {}...".format(clf.__class__.__name__)
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    print "Done!\nTraining time (secs): {:.3f}".format(end - start)

# Choose a model and instantiate an object
clf = LinearSVC()

# Fit model to training data
train_classifier(clf, X_train, y_train)  # note: using entire training set here

Training LinearSVC...
Done!
Training time (secs): 0.022


In [8]:
# Predict on training set and compute F1 score
def predict_labels(clf, features, target):
    print "Predicting labels using {}...".format(clf.__class__.__name__)
    start = time.time()
    y_pred = clf.predict(features)
    end = time.time()
    print "Done!\nPrediction time (secs): {:.3f}".format(end - start)
    return f1_score(target.values, y_pred, pos_label='yes')

train_f1_score = predict_labels(clf, X_train, y_train)
print "F1 score for training set: {}".format(train_f1_score)

Predicting labels using LinearSVC...
Done!
Prediction time (secs): 0.002
F1 score for training set: 0.841870824053


In [9]:
# Predict on test data
print "F1 score for test set: {}".format(predict_labels(clf, X_test, y_test))

Predicting labels using LinearSVC...
Done!
Prediction time (secs): 0.000
F1 score for test set: 0.763888888889


In [10]:
# Train and predict using different training set sizes
def train_predict(clf, X_train, y_train, X_test, y_test):
    print "------------------------------------------"
    print "Training set size: {}".format(len(X_train))
    train_classifier(clf, X_train, y_train)
    print "F1 score for training set: {}".format(predict_labels(clf, X_train, y_train))
    print "F1 score for test set: {}".format(predict_labels(clf, X_test, y_test))

# Use the run_test function on LinearSVC model, with 300/200/100 train size 
train_predict(LinearSVC(), X_train, y_train, X_test, y_test)
train_predict(LinearSVC(), X_train[0:200], y_train[0:200], X_test, y_test)
train_predict(LinearSVC(), X_train[0:100], y_train[0:100], X_test, y_test)

------------------------------------------
Training set size: 300
Training LinearSVC...
Done!
Training time (secs): 0.023
Predicting labels using LinearSVC...
Done!
Prediction time (secs): 0.000
F1 score for training set: 0.832917705736
Predicting labels using LinearSVC...
Done!
Prediction time (secs): 0.000
F1 score for test set: 0.734375
------------------------------------------
Training set size: 200
Training LinearSVC...
Done!
Training time (secs): 0.014
Predicting labels using LinearSVC...
Done!
Prediction time (secs): 0.000
F1 score for training set: 0.857142857143
Predicting labels using LinearSVC...
Done!
Prediction time (secs): 0.000
F1 score for test set: 0.727272727273
------------------------------------------
Training set size: 100
Training LinearSVC...
Done!
Training time (secs): 0.005
Predicting labels using LinearSVC...
Done!
Prediction time (secs): 0.000
F1 score for training set: 0.924242424242
Predicting labels using LinearSVC...
Done!
Prediction time (secs): 0.000


In [11]:
# Now repeat the above process with 3 other algorithms: 

# K-Nearest Neighbors 
train_predict(KNeighborsClassifier(), X_train, y_train, X_test, y_test)
train_predict(KNeighborsClassifier(), X_train[0:200], y_train[0:200], X_test, y_test)
train_predict(KNeighborsClassifier(), X_train[0:100], y_train[0:100], X_test, y_test)

# Random Forests:
train_predict(RandomForestClassifier(), X_train, y_train, X_test, y_test)
train_predict(RandomForestClassifier(), X_train[0:200], y_train[0:200], X_test, y_test)
train_predict(RandomForestClassifier(), X_train[0:100], y_train[0:100], X_test, y_test)

# Naive Bayes:
train_predict(GaussianNB(), X_train, y_train, X_test, y_test)
train_predict(GaussianNB(), X_train[0:200], y_train[0:200], X_test, y_test)
train_predict(GaussianNB(), X_train[0:100], y_train[0:100], X_test, y_test)

------------------------------------------
Training set size: 300
Training KNeighborsClassifier...
Done!
Training time (secs): 0.001
Predicting labels using KNeighborsClassifier...
Done!
Prediction time (secs): 0.006
F1 score for training set: 0.87414187643
Predicting labels using KNeighborsClassifier...
Done!
Prediction time (secs): 0.003
F1 score for test set: 0.771428571429
------------------------------------------
Training set size: 200
Training KNeighborsClassifier...
Done!
Training time (secs): 0.000
Predicting labels using KNeighborsClassifier...
Done!
Prediction time (secs): 0.003
F1 score for training set: 0.857142857143
Predicting labels using KNeighborsClassifier...
Done!
Prediction time (secs): 0.002
F1 score for test set: 0.753623188406
------------------------------------------
Training set size: 100
Training KNeighborsClassifier...
Done!
Training time (secs): 0.000
Predicting labels using KNeighborsClassifier...
Done!
Prediction time (secs): 0.001
F1 score for training 

In [12]:
# Make a scorer to evaluate performance
f1_scorer = make_scorer(f1_score,
                        pos_label="yes")

# Make an StratifiedShuffleSplit iterator for cross-validation in GridSearchCV
sss = StratifiedShuffleSplit(y_train,
                             n_iter=20,
                             test_size=0.5,
                             random_state=88)

# Parameters to tune
params = {'n_estimators': [10, 20],
          'max_features': ["auto", "sqrt", "log2"]}

#Make the estimator using GridSearchCV and run cross-validation
print 'GridSearching with cross-validation...'
classifier = GridSearchCV(RandomForestClassifier(),
                          param_grid=params,
                          scoring=f1_scorer,
                          cv=sss,
                          verbose=1)

# Fit the model and tune the parameters
classifier.fit(X_train, y_train)

# Calculate feature scores
est = classifier.best_estimator_
scr = classifier.best_score_
print "\nThe best estimator is:\n{}\nwith a F1 score of {}".format(est, scr)

# Print the F1 score with the hold-out test data
print "\nRun the model again on test data:"
predict_labels(classifier, X_test, y_test)

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    1.0s
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    2.6s finished


GridSearching with cross-validation...
Fitting 20 folds for each of 6 candidates, totalling 120 fits

The best estimator is:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
with a F1 score of 0.766422931671

Run the model again on test data:
Predicting labels using GridSearchCV...
Done!
Prediction time (secs): 0.001


0.78321678321678323