In [17]:
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Load the wine dataset
data = pd.read_csv('wine_original.csv')
labels = data['class']
del data['class']

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=5)

In [18]:
X_train

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315,Proline
54,13.74,1.67,2.25,16.4,118,2.60,2.90,0.21,1.62,5.850000,0.92,3.20,1060
140,12.93,2.81,2.70,21.0,96,1.54,0.50,0.53,0.75,4.600000,0.77,2.31,600
148,13.32,3.24,2.38,21.5,92,1.93,0.76,0.45,1.25,8.420000,0.55,1.62,650
81,12.72,1.81,2.20,18.8,86,2.20,2.53,0.26,1.77,3.900000,1.16,3.14,714
88,11.64,2.06,2.46,21.6,84,1.95,1.69,0.48,1.35,2.800000,1.00,2.75,680
61,12.64,1.36,2.02,16.8,100,2.02,1.41,0.53,0.62,5.750000,0.98,1.59,450
136,12.25,4.72,2.54,21.0,89,1.38,0.47,0.53,0.80,3.850000,0.75,1.27,720
46,14.38,3.59,2.28,16.0,102,3.25,3.17,0.27,2.19,4.900000,1.04,3.44,1065
171,12.77,2.39,2.28,19.5,86,1.39,0.51,0.48,0.64,9.899999,0.57,1.63,470
63,12.37,1.13,2.16,19.0,87,3.50,3.10,0.19,1.87,4.450000,1.22,2.87,420


In [19]:
# Initialize Gaussian Naive Bayes
gnb = GaussianNB()
# Train the classifier
gnb.fit(X_train, y_train)
# Make predictions on test data
y_pred = gnb.predict(X_test)
y_train_pred = gnb.predict(X_train)

# print the accuracy
print ('Training accuracy = ' + str(np.sum(y_train_pred == y_train)/len(y_train)))
print ('Test accuracy = ' + str(np.sum(y_pred == y_test)/len(y_test)))

Training accuracy = 0.992957746479
Test accuracy = 0.916666666667


In [20]:
X_train, X_valid, y_train, y_valid = train_test_split(data, labels, test_size=0.2, random_state=5)

alphas = [0.1, 0.5, 1, 2, 3, 4, 5, 10, 100]
best_alpha = 0.1
best_acc = 0.0

for alpha in alphas:
    # Initialize
    clf = MultinomialNB(alpha=alpha)
    # Train
    clf.fit(X_train, y_train)
    # Make predictions on validation data
    y_pred = clf.predict(X_valid)
    accuracy = np.sum(y_pred == y_valid)/len(y_valid)
    print ('Validation accuracy = ' + str(accuracy) + ' at alpha = ' + str(alpha))
    if accuracy > best_acc:
        best_acc = accuracy
        best_alpha = alpha

print ('Best alpha = ' + str(best_alpha))        

Validation accuracy = 0.777777777778 at alpha = 0.1
Validation accuracy = 0.777777777778 at alpha = 0.5
Validation accuracy = 0.777777777778 at alpha = 1
Validation accuracy = 0.777777777778 at alpha = 2
Validation accuracy = 0.75 at alpha = 3
Validation accuracy = 0.75 at alpha = 4
Validation accuracy = 0.75 at alpha = 5
Validation accuracy = 0.75 at alpha = 10
Validation accuracy = 0.722222222222 at alpha = 100
Best alpha = 0.1


In [21]:
X_train = np.concatenate((X_train, X_valid))
y_train = np.concatenate((y_train, y_valid))

clf = MultinomialNB(alpha=best_alpha)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)

# print the accuracy
print ('Training accuracy = ' + str(np.sum(y_train_pred == y_train)/len(y_train)))
print ('Test accuracy = ' + str(np.sum(y_pred == y_test)/len(y_test)))

Training accuracy = 0.870786516854
Test accuracy = 0.777777777778


In [None]:
# Practice Problem.
# 1. In the code stub use Gaussian Naive Bayes model to predict the accuracy on the test iris data (use 2 features)
# 2. In the code stub use Multinomial Naive Bayes model to predict the accuracy on the test iris data (use 2 features)

In [25]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=5)

# Intialize
clf = LinearDiscriminantAnalysis()
# Train
clf.fit(X_train, y_train)
# Test
y_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)

# print the accuracy
print ('Training accuracy = ' + str(np.sum(y_train_pred == y_train)/len(y_train)))
print ('Test accuracy = ' + str(np.sum(y_pred == y_test)/len(y_test)))

Training accuracy = 1.0
Test accuracy = 0.944444444444


In [24]:
# Note that we tuned no parameters for the discrimant analysis we did (both Linear and Quadratic), and yet ended up with an accuracy better than Naive Bayes. 
# Exercise for the reader: 
#     optimize 'solver' for Linear discriminant analysis
#     optimize 'reg_param' for Quadratic discriminant analysis
# Refer:
#     http://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html
#     http://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html