In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
wine = pd.read_csv('winequality-red.csv')

In [3]:
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
bins = [1,4,6,10]
labels = [0,1,2]
wine['quality_categorical'] = pd.cut(wine['quality'],bins=bins,labels=labels,include_lowest=True )

In [5]:
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_categorical
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1


In [6]:
target = wine['quality_categorical']
features = wine.drop(['quality','quality_categorical'], axis=1)

In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(features,target,test_size=0.2,random_state=0)
print("Training set has {} samples".format(X_train.shape[0]))
print("Testing set has {} samples".format(X_test.shape[0]))

Training set has 1279 samples
Testing set has 320 samples


In [8]:
import time
from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score

def train_predict_evaluate(learner,sample_size,X_train,y_train,X_test,y_test):
    results={}
    #start = time()
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
    #end=time()
    
    #results['train_time'] =end-start
    #start = time()
    predictions_train =learner.predict(X_train[:300])
    predictions_test =learner.predict(X_test)
    #end = time()
    #results['pred_time'] = end-start
    results['acc_train'] = accuracy_score(y_train[:300], predictions_train)
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    results['f_train'] = fbeta_score(y_train[:300], predictions_train,beta=0.5, average='micro')
    results['f_test'] = fbeta_score(y_test, predictions_test,beta=0.5, average='micro')
    
    print("{} trained on {} samples".format(learner.__class__.__name__, sample_size))
    return results

In [9]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
clf_A = GaussianNB()
clf_B = DecisionTreeClassifier(max_depth=None, random_state=None)
clf_C = RandomForestClassifier(max_depth=None, random_state=None)
samples_100 = len(y_train)
samples_10 = int(len(y_train)*10/100)
samples_1 = int(len(y_train)*1/100)
results={}
for clf in [clf_A, clf_B,clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name]={}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = \
        train_predict_evaluate(clf, samples,X_train,y_train,X_test, y_test)
print(results)
        

  from numpy.core.umath_tests import inner1d


GaussianNB trained on 12 samples
GaussianNB trained on 127 samples
GaussianNB trained on 1279 samples
DecisionTreeClassifier trained on 12 samples
DecisionTreeClassifier trained on 127 samples
DecisionTreeClassifier trained on 1279 samples
RandomForestClassifier trained on 12 samples
RandomForestClassifier trained on 127 samples
RandomForestClassifier trained on 1279 samples
{'RandomForestClassifier': {0: {'acc_train': 0.84, 'f_train': 0.84, 'acc_test': 0.840625, 'f_test': 0.840625}, 1: {'acc_train': 0.8866666666666667, 'f_train': 0.8866666666666667, 'acc_test': 0.834375, 'f_test': 0.834375}, 2: {'acc_train': 0.9866666666666667, 'f_train': 0.9866666666666666, 'acc_test': 0.88125, 'f_test': 0.88125}}, 'GaussianNB': {0: {'acc_train': 0.8266666666666667, 'f_train': 0.8266666666666667, 'acc_test': 0.85, 'f_test': 0.85}, 1: {'acc_train': 0.8433333333333334, 'f_train': 0.8433333333333334, 'acc_test': 0.8, 'f_test': 0.8}, 2: {'acc_train': 0.77, 'f_train': 0.77, 'acc_test': 0.7625, 'f_test': 0

In [10]:
model = RandomForestClassifier(max_depth=None, random_state=None)
model=model.fit(X_train, y_train)
importances = model.feature_importances_
print(X_train.columns)
print(importances)
#vs.feature_plot(importances, X_train, y_train)

Index([u'fixed acidity', u'volatile acidity', u'citric acid',
       u'residual sugar', u'chlorides', u'free sulfur dioxide',
       u'total sulfur dioxide', u'density', u'pH', u'sulphates', u'alcohol'],
      dtype='object')
[0.08863108 0.11834923 0.07351152 0.07606819 0.0878049  0.0538491
 0.07654855 0.08468801 0.0653972  0.11912653 0.15602569]


In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
clf=RandomForestClassifier(max_depth=None, random_state=None)

parameters = {"n_estimators": [10, 20, 30], 'max_features': [3,4,5,None], 'max_depth': [5,6,7,None]}
scorer = make_scorer(fbeta_score, beta=0.5, average= "micro")
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)
grid_fit = grid_obj.fit(X_train,y_train)
best_clf = grid_fit.best_estimator_
#comparing the two models
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)
print("Unoptimized model \n-----")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test,predictions, beta=0.5, average='micro')))
print(" \nunoptimized models \n------")
print(best_clf)
print(" \n Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta=0.5, average="micro")))

Unoptimized model 
-----
Accuracy score on testing data: 0.8875
F-score on testing data: 0.8875
 
unoptimized models 
------
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
 
 Final accuracy score on the testing data: 0.8938
Final F-score on the testing data: 0.8938


In [12]:
from sklearn.externals import joblib

joblib.dump(clf, 'model.pkl')

['model.pkl']

In [20]:
wine_data = [[8, 0.2, 0.6, 1.8, 1.3, 0.4,0.065, 3, 16, 0.92, 9.5 ],
            [8,0,0.16, 1.8, 0.065, 3, 16, 0.9962, 3.42, 0.92, 1],
            [3,2.4,0.3, 1.0, 1.3, 2.5, 3.4, 1.5, 2.3, 4.2,0.875]]

In [21]:
for i,data in enumerate(best_clf.predict(wine_data)):
    print("The predicted quality for wine {} is {}".format(i+1,data))

The predicted quality for wine 1 is 1
The predicted quality for wine 2 is 1
The predicted quality for wine 3 is 1
