In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.inspection import partial_dependence
from sklearn.inspection import plot_partial_dependence
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#pip3 install  scikit-plot
import scikitplot as skplt


%matplotlib inline 
plt.rcParams['figure.figsize'] = [10, 5]

1. Read data frame and rename columns

In [None]:
# Load the tab-separated wine data.
# NOTE(review): read_csv consumes the first line of Wine.txt as a header row, which is
# then replaced by the names below -- confirm the file really starts with a header line.
df = pd.read_csv("Wine.txt", sep='\t')

# Names of the 13 feature columns, in file order. The last name used to carry an
# accidental trailing space ('OD280_OD315 '), which leaked into every label lookup.
features = ['alcohol', 'malic_acid', 'ash', 'ash_alcalinity', 'magnesium', 'phenols', 'flavanoids',
            'nonflavanoid', 'proanthocyanins', 'color', 'hue', 'OD280_OD315', 'proline']

# The final column 'd' is the class label used as the target throughout the notebook.
# The assignment raises ValueError if the file does not have len(features) + 1 columns.
df.columns = features + ['d']

2. Look at first rows to check dataframe is loaded

In [None]:
df.head()

3. Check we have no NaN values in dataframe and check column types are not "objects"

In [None]:
df.info()

4. Split into train set and test set (validation folds are taken from the train set later with K-fold)

In [None]:
# Seed NumPy's global RNG so that the np.random.permutation shuffle used by
# split_train_test is reproducible.
np.random.seed(42)
def split_train_test(data, test_ratio=0.2):
    """Randomly partition ``data`` into a train part and a test part.

    Row positions are shuffled with ``np.random.permutation`` (NumPy's global
    RNG). The first ``int(len(data) * test_ratio)`` shuffled positions form the
    test part, the remaining ones the train part.

    Returns
    -------
    tuple
        ``(train, test)``, both selected from ``data`` with ``.iloc``.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    test_positions, train_positions = order[:n_test], order[n_test:]
    return data.iloc[train_positions], data.iloc[test_positions]
# Hold out 20% of the rows as the test set; everything else is the train set.
train_set, test_set = split_train_test(df, test_ratio=0.2)

shape_report = "train set shape:{}\n test set shape:{}\n".format(train_set.shape, test_set.shape)
print(shape_report)

5. Copy train set to make sure we don't mutate it by chance

In [None]:
# Explore on a copy so that train_set itself is never modified by the analysis cells.
wines = train_set.copy()

6. Check correlations between features visually

In [None]:
# Summary statistics of the train copy; shown as the cell output.
wines.describe()

In [None]:
sns.set(style="white")

# Pairwise correlations between all columns of the train copy (target 'd' included).
corr = wines.corr()

# Generate a mask for the upper triangle (diagonal included) so each pair is drawn once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio.
# vmax=.3 caps the colour scale, so every correlation above 0.3 gets the same colour.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

7. We see that
  a) i1 and i13 (alcohol and proline) correlate with the target
  b) i1 and i13 don't correlate with each other

In [None]:
# One 50-bin histogram per column of the train copy, drawn on a 20x15 figure.
wines.hist(bins=50, figsize=(20,15)) 
plt.show()

8. Check visually whether correlated features really separate target well

In [None]:
# Alcohol distribution per class: box plot with the individual samples drawn on top.
ax = sns.boxplot(data=wines, x="d", y="alcohol")
ax = sns.swarmplot(data=wines, x="d", y="alcohol", color=".25", ax=ax)

In [None]:
# Proline distribution per class: box plot with the individual samples drawn on top.
ax = sns.boxplot(data=wines, x="d", y="proline")
ax = sns.swarmplot(data=wines, x="d", y="proline", color=".25", ax=ax)

11. We will need a custom transformer for dropping non-relevant features.

In [None]:
#Custom Transformer that extracts columns passed as argument to its constructor
class ColumnSelector( BaseEstimator, TransformerMixin ):
    """Keep only the columns listed in ``feature_names``.

    Parameters
    ----------
    feature_names : list
        Column labels to select from the input in ``transform``.
    """

    #Class Constructor
    def __init__( self, feature_names ):
        # BaseEstimator.get_params() (used by clone() and by the estimator repr)
        # reads every constructor argument back from an attribute of the SAME name,
        # so the value must be stored as `feature_names`, not under a private alias.
        self.feature_names = feature_names

    #Return self nothing else to do here
    def fit( self, X, y = None ):
        """Stateless: there is nothing to learn, so just return ``self``."""
        return self

    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        """Return ``X`` indexed by the configured column labels."""
        return X[ self.feature_names ]

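12. Construct pipeline using dropper and standard scaler transformers. 
We need scaling because many classifiers are sensitive to the scale of the features (StandardScaler standardizes each feature to zero mean and unit variance, not to a 0..1 range)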

In [None]:
def try_model(tset, vset, depth, pipe,clf):
    """Fit ``clf`` on one fold and return its macro-averaged F1 on the held-out fold.

    Parameters
    ----------
    tset : DataFrame
        Training fold; must contain the ``features`` columns and the target ``'d'``.
    vset : DataFrame
        Validation fold with the same columns.
    depth : int
        Unused here (the caller already bakes the depth into ``clf``); kept so
        existing calls keep working.
    pipe : Pipeline
        Preprocessing pipeline; it is fitted on the training fold only.
    clf : estimator
        Classifier exposing ``fit`` and ``predict``.

    Returns
    -------
    float
        Macro-averaged F1 score on ``vset``.
    """
    # Imported locally under an alias: a later cell of this notebook defines its own
    # `f1_score(mtrx)`, which shadows the sklearn function at module level once run.
    from sklearn.metrics import f1_score as sk_f1_score

    X_train, y_train = tset[features], tset['d']
    X_val, y_val = vset[features], vset['d']

    clf.fit(pipe.fit_transform(X_train), y_train)

    # transform(), not fit_transform(): the validation fold must be preprocessed with
    # the statistics learned on the training fold, otherwise the score is computed on
    # differently scaled data and validation information leaks into preprocessing.
    val_predictions = clf.predict(pipe.transform(X_val))
    return sk_f1_score(y_val, val_predictions, average='macro')

# 4-fold cross-validation over the train set, reused by the depth sweeps.
kf = KFold(n_splits=4)

# Candidate tree depths for the random forest.
depths = np.arange(1, 10)

# Mean macro-F1 over the folds, one entry per candidate depth.
scores = []
for d in depths:
    fold_scores = [
        try_model(train_set.iloc[fit_rows], train_set.iloc[val_rows], d,
                  Pipeline([('std_scaler', StandardScaler())]),
                  RandomForestClassifier(max_depth=d, random_state=0))
        for fit_rows, val_rows in kf.split(train_set)
    ]
    scores.append(np.array(fold_scores).mean())


plt.scatter(x=depths, y=scores)
plt.xticks(depths)

In [None]:
# Candidate depths for a single decision tree.
depths = np.arange(1, 20)

# Mean macro-F1 over the folds of `kf`, one entry per candidate depth.
scores = []
for d in depths:
    fold_scores = [
        try_model(train_set.iloc[fit_rows], train_set.iloc[val_rows], d,
                  Pipeline([('std_scaler', StandardScaler())]),
                  DecisionTreeClassifier(random_state=0, max_depth=d))
        for fit_rows, val_rows in kf.split(train_set)
    ]
    scores.append(np.array(fold_scores).mean())

plt.scatter(x=depths, y=scores)
plt.xticks(depths)

In [None]:
# Show the first five rows of the train set as the cell output.
train_set.head(5)

In [None]:
# Depth chosen for the random forest; refit on the full train set and chart
# the importance the forest assigns to each feature.
depth = 4

X = train_set[features]
y = train_set['d']

X_scaled = Pipeline([('std_scaler', StandardScaler())]).fit_transform(X)
rf = RandomForestClassifier(max_depth=depth, random_state=0)
rf.fit(X_scaled, y)

# One row per feature, most important first.
feature_importances = pd.DataFrame(
    rf.feature_importances_,
    index=X.columns,
    columns=['importance'],
).sort_values('importance', ascending=False)

ax = feature_importances.plot.bar()

In [None]:
#So best params for the decision tree:
depth = 3

X = train_set[features]
y = train_set['d']

# max_depth must come from `depth`. The loop variable `d` left behind by the
# depth-sweep cells holds whatever value the last sweep ended on, not the chosen depth.
#
# The tree is fitted once, on the raw features: this is the state the cell always
# ended in (a fit replaces any earlier fit), and it keeps the thresholds shown in
# the plot in the original units of each feature.
dt = DecisionTreeClassifier(random_state=0, max_depth=depth).fit(X, y)

tree.plot_tree(dt, feature_names=features)

# One row per feature, most important first.
feature_importances = pd.DataFrame(dt.feature_importances_,
                                   index = X.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)

ax = feature_importances.plot.bar()

In [None]:
# Keep only the two features picked during exploration, then standardize them.
pipeline = Pipeline([
            ('dropper', ColumnSelector(['alcohol','proline'])),
            ('std_scaler', StandardScaler()),
        ])

y = train_set['d']

# The base estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases, while the first
# positional slot is the base estimator under both names.
clf = BaggingClassifier(SVC(), n_estimators=10, random_state=0).fit(pipeline.fit_transform(train_set), y)

# transform(), not fit_transform(): the test set must be scaled with the statistics
# learned on the train set, otherwise the classifier scores differently scaled inputs.
y_score = clf.decision_function(pipeline.transform(test_set))

skplt.metrics.plot_roc_curve(test_set['d'], y_score)
plt.show()

15. Utility functions

In [None]:
#So best params  for random forest:
depth = 4

X = train_set[features]
y = train_set['d']

# A single scaler, fitted on the train set and reused for the test set, so both are
# expressed in the same standardized units the forest learned its split thresholds in.
rf_scaler = Pipeline([('std_scaler', StandardScaler())])

rf = RandomForestClassifier(warm_start=True, oob_score=True,
                            max_depth=depth, random_state=0).fit(rf_scaler.fit_transform(X), y)

# One row per feature, most important first.
feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = X.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)

# Per-class probabilities for the test set: one row per sample, one column per class.
rf_test_probas = rf.predict_proba(rf_scaler.transform(test_set[features]))

print('Out of Bag Score:{}'.format(rf.oob_score_))

In [None]:

def precision(mtrx):
    """Precision, TP / (TP + FP), for a ``(tp, tn, fn, fp)`` count tuple."""
    tp, _tn, _fn, fp = mtrx
    predicted_positive = tp + fp
    return tp / predicted_positive


def falsepositiverate(mtrx):
    """False positive rate, FP / (FP + TN), for a ``(tp, tn, fn, fp)`` count tuple."""
    _tp, tn, _fn, fp = mtrx
    actual_negative = fp + tn
    return fp / actual_negative


def recall(mtrx):
    """Recall (true positive rate), TP / (TP + FN), for a ``(tp, tn, fn, fp)`` count tuple."""
    tp, _tn, fn, _fp = mtrx
    actual_positive = tp + fn
    return tp / actual_positive


def f1_score(mtrx):
    """F1 score (harmonic mean of precision and recall) for a ``(tp, tn, fn, fp)`` tuple.

    Computed directly from the counts as ``2*TP / (2*TP + FP + FN)``, which is
    algebraically equal to ``2 * precision * recall / (precision + recall)``.
    Returns ``0.0`` when there are no true positives, false positives or false
    negatives at all, i.e. when the score is otherwise undefined.

    NOTE: this name shadows ``sklearn.metrics.f1_score`` imported at the top of
    the notebook; code that needs the sklearn function must import it explicitly.
    """
    true_positives, _true_negatives, false_negatives, false_positives = mtrx
    denominator = 2 * true_positives + false_positives + false_negatives
    if denominator == 0:
        return 0.0
    return 2 * true_positives / denominator


def get_confusion_matrix(labels, probas, target, thre):
    """Count one-vs-rest confusion-matrix cells for class ``target``.

    A sample is an actual positive when ``labels[i] == target`` and a predicted
    positive when ``probas[i][target] > thre``.

    Returns
    -------
    tuple
        ``(true_positives, true_negatives, false_negatives, false_positives)``.
    """
    tp = tn = fp = fn = 0
    for row in range(len(labels)):
        is_target = labels[row] == target
        score = probas[row][target]

        if score > thre:
            if is_target:
                tp += 1
            else:
                fp += 1
        elif score <= thre:
            # Explicit comparison: a score that is neither above nor at/below the
            # threshold (NaN) is counted in no cell.
            if is_target:
                fn += 1
            else:
                tn += 1
    return (tp, tn, fn, fp)

def roc(lbls, probas, tgt):
    """One-vs-rest ROC points for class ``tgt`` at 20 evenly spaced thresholds in [0, 1].

    Returns
    -------
    tuple
        ``(fpr, tpr)``: two NumPy arrays with one entry per threshold.
    """
    thresholds = np.linspace(0, 1, 20)
    n_points = len(thresholds)

    fpr_values = np.zeros(n_points)
    tpr_values = np.zeros(n_points)

    for pos, cutoff in enumerate(thresholds):
        counts = get_confusion_matrix(labels=lbls, probas=probas, target=tgt, thre=cutoff)
        tpr_values[pos] = recall(counts)
        fpr_values[pos] = falsepositiverate(counts)

    return (fpr_values, tpr_values)


# One (fpr, tpr) curve per class.
# NOTE(review): roc() uses the class value both as the label to match and as the
# column index into rf_test_probas, so this assumes the labels in 'd' are 0, 1, 2 -- confirm.
tuples = []
for i in np.arange(0, 3):
    rf_fpr, rf_tpr = roc(test_set['d'].values, rf_test_probas, i)
    tuples.append((rf_fpr, rf_tpr))

f, (ax1, ax2, ax3) = plt.subplots(3, 1)
plots = [ax1, ax2, ax3]

# Pair each axis with its own class curve. Indexing `tuples` with the leftover
# loop variable `i` would draw the last class on all three panels.
for idx, (p, (fpr, tpr)) in enumerate(zip(plots, tuples)):
    p.plot(fpr, tpr)
    p.set_title('Class %d'%idx)
    p.set_xlabel('False positive rate')
    p.set_ylabel('True positive rate')

# Keep the titles and axis labels of the stacked panels from overlapping.
f.tight_layout()


