In [None]:
import numpy as np
import pandas as pd
import iqplot
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import holoviews as hv
hv.extension('bokeh')

import bokeh.io
bokeh.io.output_notebook()

In [None]:
# Read the wine-quality table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/WineQT.csv')
df.head(n=5)

In [None]:
# Scatter plot with 'fixed acidity' and 'residual sugar' as the two key dimensions.
hv.Points(df, kdims=['fixed acidity', 'residual sugar'])

In [None]:
# Jittered strip plot of the 'fixed acidity' column, rendered inline via Bokeh.
p = iqplot.strip(df, q="fixed acidity", jitter=True)
bokeh.io.show(p)

In [None]:
# Jittered strip plot of the 'volatile acidity' column, rendered inline via Bokeh.
p = iqplot.strip(df, q="volatile acidity", jitter=True)
bokeh.io.show(p)

In [None]:
# Jittered strip plot of the 'alcohol' column, rendered inline via Bokeh.
p = iqplot.strip(df, q="alcohol", jitter=True)
bokeh.io.show(p)

In [None]:
# Collect every column label of df into a plain Python list.
names = df.columns.tolist()
print(names)

In [None]:
# Predictor columns: everything in df except 'Id' and the target 'quality'.
# The list is rebuilt from df.columns instead of calling names.remove(...),
# because remove() mutates the list in place and raises ValueError when this
# cell is executed a second time (the entries are already gone).
names = [col for col in df.columns if col not in ('Id', 'quality')]
print(names)

In [None]:
# Baseline: random forest on all predictor columns, scored on held-out rows.
Y = df["quality"]
features = names
X = pd.get_dummies(df[features])

# Score on a seeded 30% hold-out split (same test_size as the search cells
# below). Predicting on the rows the model was fitted on reports training
# accuracy, which overstates how well the model generalises.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=1
)

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
print("Accuracy:",metrics.accuracy_score(y_test, predictions))

In [None]:
def subsets(numbers):
    """Return every subset of ``numbers`` as a list of lists.

    Parameters
    ----------
    numbers : iterable
        The items to combine. Any iterable is accepted (list, tuple, str,
        ...); it is not modified.

    Returns
    -------
    list of list
        All ``2 ** len(numbers)`` subsets. The empty subset comes first, and
        subsets that do not contain an earlier item precede those that do,
        e.g. ``subsets([1, 2]) == [[], [2], [1], [1, 2]]``.
    """
    # Built iteratively from the last item to the first. This reproduces the
    # ordering of the recursive formulation, avoids the recursion limit, and
    # does not rely on ``numbers == []`` (which is never true for tuples or
    # strings and made those inputs recurse forever).
    result = [[]]
    for item in reversed(list(numbers)):
        result = result + [[item] + tail for tail in result]
    return result

def subsets_of_given_size(numbers, n):
    """Return every subset of ``numbers`` that has exactly ``n`` items.

    Parameters
    ----------
    numbers : iterable
        The items to choose from; not modified.
    n : int
        Required subset size.

    Returns
    -------
    list of list
        The size-``n`` subsets, in the same relative order in which they
        appear in ``subsets(numbers)``. Empty when ``n`` is negative or
        larger than the number of items.
    """
    # Local import keeps this helper self-contained.
    from itertools import combinations

    items = list(numbers)
    # No subset can have a negative, fractional or oversized length.
    if n < 0 or n > len(items) or n != int(n):
        return []
    # Generate only the C(len, n) wanted subsets instead of filtering all
    # 2**len of them. Reversing the lexicographic order of combinations()
    # yields the ordering used by subsets().
    chosen = list(combinations(items, int(n)))
    chosen.reverse()
    return [list(group) for group in chosen]

In [None]:
# Every non-empty subset of the predictor names; the empty subset is dropped
# because no model can be fitted on zero columns.
combo = [subset for subset in subsets(names) if subset != []]

In [None]:
# Exhaustive search over feature subsets and random-forest hyperparameters,
# keeping the combination with the highest hold-out accuracy.
m = 0          # best accuracy seen so far
res = [0, 0]   # [n_estimators, max_depth] that produced it
cats = []      # feature subset that produced it
Y = df["quality"]

for feature in combo:
    X = pd.get_dummies(df[feature])
    # One seeded split per feature subset. Drawing a fresh unseeded split for
    # every (n_estimators, max_depth) pair scored each model on different
    # rows, so the accuracies were neither reproducible nor comparable.
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.3, random_state=1
    )
    for i in range(1, 15):          # n_estimators: 1..14
        for j in range(1, 12):      # max_depth: 1..11
            model = RandomForestClassifier(n_estimators=i, max_depth=j, random_state=1)
            model.fit(x_train, y_train)
            predictions = model.predict(x_test)
            acc = metrics.accuracy_score(y_test, predictions)
            if acc > m:
                m = acc
                res = [i, j]
                cats = feature

print(res, m, cats)

In [None]:
# Fit an SGD regressor on every feature subset and keep the subset with the
# lowest hold-out mean squared error.
# Start from +inf so the first subset always becomes the initial best; with a
# finite start value such as 100, `feature` was never assigned when every
# MSE exceeded it.
err = float("inf")
n_iter = 100
feature = []   # best feature subset found so far
Y = df["quality"]

for features in combo:
    X = pd.get_dummies(df[features])
    # Seeded split and seeded regressor make the search reproducible and
    # score every subset on the same held-out rows.
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.3, random_state=1
    )
    clf_ = SGDRegressor(max_iter=n_iter, random_state=1)
    clf_.fit(x_train, y_train)
    y_pred_sksgd = clf_.predict(x_test)
    mse = mean_squared_error(y_test, y_pred_sksgd)

    if mse < err:
        err = mse
        feature = features

# Report the best subset (`feature`), not the loop variable `features`, which
# only holds the last subset that was tried.
print(err, feature)
# NOTE(review): 1 - MSE is kept for output compatibility; it is not an
# accuracy and can be negative -- confirm what this figure is meant to convey.
print(1 - err)

In [None]:
# Fit an RBF-kernel SVC on every feature subset and keep the subset with the
# highest hold-out accuracy.
acc = 0     # best accuracy seen so far
feat = []   # feature subset that produced it
Y = df["quality"]

for features in combo:
    X = pd.get_dummies(df[features])
    classifier = SVC(kernel='rbf', random_state = 1)
    # Seeded split: every subset is scored on the same held-out rows and the
    # search gives the same answer on every run.
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.3, random_state=1
    )
    classifier.fit(x_train, y_train)
    Y_pred = classifier.predict(x_test)
    cm = confusion_matrix(y_test, Y_pred)
    # Correct predictions sit on the confusion-matrix diagonal.
    accuracy = float(cm.diagonal().sum())/len(y_test)

    if accuracy > acc:
        acc = accuracy
        feat = features

# Report the best subset (`feat`), not the loop variable `features`, which
# only holds the last subset that was tried.
print(acc, feat)

In [None]:
# Fit a Gaussian naive Bayes classifier on every feature subset and keep the
# subset with the highest hold-out accuracy.
acc = 0     # best accuracy seen so far
feat = []   # feature subset that produced it
Y = df["quality"]

for features in combo:
    X = pd.get_dummies(df[features])
    # Seeded split: every subset is scored on the same held-out rows and the
    # search gives the same answer on every run.
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.3, random_state=1
    )
    classifier = GaussianNB()
    classifier.fit(x_train, y_train)
    Y_pred = classifier.predict(x_test)
    cm = confusion_matrix(y_test, Y_pred)
    # Correct predictions sit on the confusion-matrix diagonal.
    accuracy = float(cm.diagonal().sum())/len(y_test)

    if accuracy > acc:
        acc = accuracy
        feat = features

# Report the best subset (`feat`), not the loop variable `features`, which
# only holds the last subset that was tried.
print(acc, feat)

In [None]:
# Fit k-nearest-neighbours classifiers (k = 1..25) on every feature subset and
# keep the (subset, k) pair with the highest hold-out accuracy.
acc = 0     # best accuracy seen so far
feat = []   # feature subset that produced it
n = None    # n_neighbors that produced it
Y = df["quality"]

for features in combo:
    X = pd.get_dummies(df[features])
    # Seeded split: every subset and every k is scored on the same held-out
    # rows and the search gives the same answer on every run.
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.3, random_state=1
    )
    # `k` replaces the inner `i`, which reused the name of the outer index.
    for k in range(1, 26):
        classifier = KNeighborsClassifier(n_neighbors = k)
        classifier.fit(x_train, y_train)
        Y_pred = classifier.predict(x_test)
        cm = confusion_matrix(y_test, Y_pred)
        # Correct predictions sit on the confusion-matrix diagonal.
        accuracy = float(cm.diagonal().sum())/len(y_test)

        if accuracy > acc:
            acc = accuracy
            feat = features
            n = k

# Report the best subset and k (`feat`, `n`). The loop variables only hold the
# last subset tried and the last k (always 25).
print(acc, feat, n)