In [36]:
#Packages
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")
sns.set_color_codes("pastel")
import warnings
warnings.filterwarnings('ignore')

In [37]:
student_datam = pd.read_csv("C:\\Users\\Suyash\\Downloads\\student-mat.csv")
student_datap = pd.read_csv("C:\\Users\\Suyash\\Downloads\\student-por.csv")
student_data = pd.concat([student_datam,student_datap])
#Cleaning duplicates
student_data=student_data.drop_duplicates(["school","sex","age","address","famsize","Pstatus","Medu","Fedu","Mjob","Fjob","reason","nursery","internet"])
student_data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [38]:
fig, ax = plt.subplots(figsize=(5, 4))
sns.distplot(student_data['Walc'],  
             hist_kws={"alpha": 1, "color": "#a2cffe"}, 
             kde=False, bins=4)
ax = ax.set(ylabel="Students", xlabel="Weekend Alcohol Consumption")

In [39]:
plot1 = sns.factorplot(x="Walc", y="health", hue="sex", data=student_data)
plot1.set(ylabel="Health", xlabel="Weekend Alcohol Consumption")

plot2 = sns.factorplot(x="Dalc", y="health", hue="sex", data=student_data)
plot2.set(ylabel="Health", xlabel="Workday Alcohol Consumption")

<seaborn.axisgrid.FacetGrid at 0x11f7c5ecc18>

In [40]:
plot1 = sns.factorplot(x="G3", y="Walc", data=student_data)
plot1.set(ylabel="Final Grade", xlabel="Weekend Alcohol Consumption")

plot2 = sns.factorplot(x="G3", y="Dalc", data=student_data)
plot2.set(ylabel="Final Grade", xlabel="Workday Alcohol Consumption")

<seaborn.axisgrid.FacetGrid at 0x11f7c647dd8>

In [41]:
#First, decide how many training vs test samples you want
from sklearn.cross_validation import train_test_split

y = student_data['G3']
X = student_data.drop(['G3'], axis=1)
X = pd.get_dummies(X)
num_all = student_data.shape[0]  # same as len(student_data)
num_train = int(0.75*num_all)  # about 75% of the data
num_test = num_all - num_train

# TODO: Then, select features (X) and corresponding labels (y) for the training and test sets
# Note: Shuffle the data or randomly select samples to avoid any bias due to ordering in the dataset
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.25, train_size = 0.75, random_state=0)

print (X_train.shape[0])
print (X_test.shape[0])

496
166


First we will look at cross-validation score from different machine learning models, The models I have taken are DecisionTreeRegressor , LinearRegression , Ridge and Lasso

In [42]:
names = ['DecisionTreeRegressor', 'LinearRegression', 'Ridge', 'Lasso']

clf_list = [DecisionTreeRegressor(),
            LinearRegression(),
            Ridge(),
            Lasso()]
for name, clf in zip(names, clf_list):
    print(name, end=': ')
    print(cross_val_score(clf, X, y, cv=5).mean())


DecisionTreeRegressor: 0.597211835144
LinearRegression: 0.797659207321
Ridge: 0.798160300943
Lasso: 0.824593486283


As seen The Ridge and Lasso model gave the best crossvalidation score. We will further analyze the data through F1 score and training time. The F1 Score is the 2*((precision*recall)/(precision+recall)). We have selected F1 score for analysis as it gives more accurate measure than precision and recall. A model can have very high precission but poor recall and vice-versa. So Analysis has to be done which takes into account both. Hence we select F1 score for analysis. 

In [43]:
# Train a model
import time

def train_classifier(clf, X_train, y_train):
    print (clf.__class__.__name__)
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    print ("Done!\nTraining time (secs): ",(end - start))

# TODO: Choose a model, import it and instantiate an object
from sklearn import tree
clf = tree.DecisionTreeRegressor()

# Fit model to training data
train_classifier(clf, X_train, y_train)  # note: using entire training set here
#print clf  # you can inspect the learned model by printing it

DecisionTreeRegressor
Done!
Training time (secs):  0.025432586669921875


In [44]:
# Predict on training set and compute F1 score
from sklearn.metrics import f1_score

def predict_labels(clf, features, target):
    print ("Predicting labels using {}...",(clf.__class__.__name__))
    start = time.time()
    y_pred = clf.predict(features)
    end = time.time()
    print ("Done!\nPrediction time (secs): ",(end - start))
    return f1_score(target.values, y_pred, pos_label='yes', average= 'weighted', labels=np.unique(y_pred))

train_f1_score = predict_labels(clf, X_train, y_train)
print ("F1 score for training set: {}",(train_f1_score))
print ("F1 score for test set: {}",(predict_labels(clf, X_test, y_test)))

Predicting labels using {}... DecisionTreeRegressor
Done!
Prediction time (secs):  0.0
F1 score for training set: {} 1.0
Predicting labels using {}... DecisionTreeRegressor
Done!
Prediction time (secs):  0.0010001659393310547
F1 score for test set: {} 0.37006497088


In [45]:

# Train and predict from training set
def train_predict(clf, X_train, y_train, X_test, y_test):
    print("------------------------------------------")
    print("Training set size: ",len(X_train))
    train_classifier(clf, X_train, y_train)
    print("F1 score for training set: ",predict_labels(clf, X_train, y_train))
    print("F1 score for test set: ",predict_labels(clf, X_test, y_test))

clf = tree.DecisionTreeRegressor()
train_predict(clf, X_train, y_train, X_test, y_test)

------------------------------------------
Training set size:  496
DecisionTreeRegressor
Done!
Training time (secs):  0.01579904556274414
Predicting labels using {}... DecisionTreeRegressor
Done!
Prediction time (secs):  0.004019498825073242
F1 score for training set:  1.0
Predicting labels using {}... DecisionTreeRegressor
Done!
Prediction time (secs):  0.0
F1 score for test set:  0.348378626164


The first model which we took was Decision Tree Regressor. The model took very less training time but has not very good f1 score

In [46]:
# Train and predict using SVC
from sklearn.svm import SVC
clf = SVC()
train_predict(clf, X_train, y_train, X_test, y_test)

------------------------------------------
Training set size:  496
SVC
Done!
Training time (secs):  0.13573741912841797
Predicting labels using {}... SVC
Done!
Prediction time (secs):  0.06694173812866211
F1 score for training set:  0.668797528787
Predicting labels using {}... SVC
Done!
Prediction time (secs):  0.017154932022094727
F1 score for test set:  0.426485711805


In [47]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
train_predict(clf, X_train, y_train, X_test, y_test)


------------------------------------------
Training set size:  496
GaussianNB
Done!
Training time (secs):  0.010929107666015625
Predicting labels using {}... GaussianNB
Done!
Prediction time (secs):  0.0078582763671875
F1 score for training set:  0.211902025928
Predicting labels using {}... GaussianNB
Done!
Prediction time (secs):  0.008934974670410156
F1 score for test set:  0.0862638189454


In [48]:
from sklearn import ensemble
clf = ensemble.RandomForestClassifier()
train_predict(clf, X_train, y_train, X_test, y_test)

------------------------------------------
Training set size:  496
RandomForestClassifier
Done!
Training time (secs):  0.06612944602966309
Predicting labels using {}... RandomForestClassifier
Done!
Prediction time (secs):  0.009911060333251953
F1 score for training set:  0.991889874793
Predicting labels using {}... RandomForestClassifier
Done!
Prediction time (secs):  0.010133743286132812
F1 score for test set:  0.307916284879


Thus we used different model to train the dataset. The SVC model gave the best F1 score, although it took little longer than all other models. But still training time was not very large considering the amount of data we have at present. Therefore SVC model can be used to train the datasets. Next we use grid-search to optimize the parameters of this model.

In [49]:
# TODO: Fine-tune your model and report the best F1 score
from sklearn import grid_search
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.cross_validation import StratifiedShuffleSplit


#cv = StratifiedShuffleSplit(y_train, random_state=42)
cv=2
clf = SVC()
param_grid = [
  {'C': [1, 10, 100, 200, 300, 400, 500, 600, 700],
   'gamma': [1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
   'kernel': ['rbf'], 'tol':[1e-3, 1e-4, 1e-5, 1e-6]
  }
 ]

regressor = grid_search.GridSearchCV(clf, param_grid,cv=cv, scoring='f1_weighted')
regressor.fit(X_train, y_train)
reg = regressor.best_estimator_
print (reg)
train_f1_score = predict_labels(reg, X_train, y_train)
print ("F1 score for training set: {}",train_f1_score)

print ("F1 score for test set: {}",predict_labels(reg, X_test, y_test))

SVC(C=200, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Predicting labels using {}... SVC
Done!
Prediction time (secs):  0.056917428970336914
F1 score for training set: {} 0.636237060432
Predicting labels using {}... SVC
Done!
Prediction time (secs):  0.015566110610961914
F1 score for test set: {} 0.340918454673


The model was optimized using grid search and final F1 score and training time were printed.
----End of Analysis-----