In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split

#pip3 install  scikit-plot
import scikitplot as skplt


%matplotlib inline 
# Default figure size (width, height) for every matplotlib figure created below.
plt.rcParams['figure.figsize'] = [10, 5]

1. Read data frame and rename columns

In [None]:
# Load the tab-separated file, then relabel the columns as
# i1..i13 followed by d (the column used as the class label below).
df = pd.read_csv("Wine.txt", sep='\t')
df.columns = ['i{}'.format(k) for k in range(1, 14)] + ['d']

2. Look at first rows to check dataframe is loaded

In [None]:
# Display the first rows of the loaded frame.
df.head()

3. Check that the dataframe has no NaN values and that no column has the "object" dtype

In [None]:
# Print per-column dtypes and non-null counts.
df.info()

4. Split into train set and test set

In [None]:
# Seed NumPy's global random state so the shuffle in split_train_test is repeatable.
np.random.seed(42)
def split_train_test(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a train part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; only ``len(data)`` and ``data.iloc`` are used.
    test_ratio : float
        Fraction of rows, within ``[0, 1]``, to put in the test part. The
        test size is ``int(len(data) * test_ratio)``, i.e. rounded down.
    random_state : int or None, optional
        Seed for a private generator. With the default ``None`` the shuffle
        uses NumPy's global random state, so a preceding ``np.random.seed``
        call still controls the result.

    Returns
    -------
    tuple
        ``(train, test)`` as two ``data.iloc[...]`` selections with
        disjoint rows.

    Raises
    ------
    ValueError
        If ``test_ratio`` is not within ``[0, 1]``.
    """
    # A negative ratio would turn the slices below into "from the end"
    # slices and a ratio above 1 would leave the train part empty, so
    # reject both instead of returning a misleading split.
    if not 0 <= test_ratio <= 1:
        raise ValueError(
            "test_ratio must be within [0, 1], got {!r}".format(test_ratio)
        )

    n_rows = len(data)
    if random_state is None:
        shuffled_indices = np.random.permutation(n_rows)
    else:
        shuffled_indices = np.random.RandomState(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
# Hold out 20% of the rows as the test set.
train_set, test_set = split_train_test(df, 0.2)

# Report the shape of both parts.
print("train set shape:{}\n test set shape:{}".format(
    *(part.shape for part in (train_set, test_set))))

5. Copy train set to make sure we don't mutate it by chance

In [None]:
# Independent copy of the training rows; later cells work on `wines`, not `train_set`.
wines = train_set.copy()

6. Check correlations between features visually

In [None]:
# Summary statistics of the training copy.
wines.describe()

In [None]:
# Pairwise scatter plots of all columns, coloured by the class column d.
sns.set(style="ticks")
g = sns.pairplot(wines, hue='d')

In [None]:
sns.set(style="white")
# Pairwise correlation matrix of the training columns.
corr = wines.corr()

# Generate a mask for the upper triangle.
# The builtin bool is used because the np.bool alias was removed from NumPy.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

7. Draw feature histograms to inspect the distribution and value range of each feature

In [None]:
# One 50-bin histogram per column of the training copy.
wines.hist(bins=50, figsize=(20,15)) 
plt.show()

8. Check visually whether the correlated features really separate the classes

In [None]:
# Box plot of i1 for each value of d, with a swarm plot of the same data drawn after it.
ax = sns.boxplot(x="d", y="i1", data=wines)
ax = sns.swarmplot(x="d", y="i1", data=wines, color=".25")

In [None]:
# Box plot of i13 for each value of d, with a swarm plot of the same data drawn after it.
ax = sns.boxplot(x="d", y="i13", data=wines)
ax = sns.swarmplot(x="d", y="i13", data=wines, color=".25")

In [None]:
# Box plot of i10 for each value of d, with a swarm plot of the same data drawn after it.
ax = sns.boxplot(x="d", y="i10", data=wines)
ax = sns.swarmplot(x="d", y="i10", data=wines, color=".25")

9. OK, so we use features i1 and i13 for classification

10. Prepare predictor and label frames

In [None]:
# Predictors: every column except the class label d, so the label cannot
# leak into the features if the selected columns change later.
X = wines.drop(columns=['d'])
# Labels as a 1-D Series; a one-column DataFrame makes scikit-learn warn
# that a column vector was passed where a 1-D array was expected.
y = wines['d']

11. We will need a custom transformer for dropping non-relevant features.

In [None]:
#Custom Transformer that extracts columns passed as argument to its constructor 
class ColumnSelector( BaseEstimator, TransformerMixin ):
    """Transformer that keeps only the named columns of its input.

    Parameters
    ----------
    feature_names : list of str
        Column labels to keep; used to index ``X`` in ``transform``.
    """

    #Class Constructor 
    def __init__( self, feature_names ):
        # Stored under the same name as the constructor argument:
        # BaseEstimator.get_params() (used by clone() and the estimator
        # repr) looks each parameter up as an attribute of that name.
        self.feature_names = feature_names 
    
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        """Nothing is learned from the data; return ``self``."""
        return self 
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        """Return ``X`` indexed by the configured feature names."""
        return X[ self.feature_names ] 

12. Construct pipeline using dropper and standard scaler transformers. 
We need scaling because many classifiers (SVMs in particular) are sensitive to feature scale; StandardScaler standardises each feature to zero mean and unit variance.

In [None]:
# Feature preparation: keep columns i1 and i13, then apply StandardScaler.
pipeline = Pipeline(steps=[
    ('dropper', ColumnSelector(['i1', 'i13'])),
    ('std_scaler', StandardScaler()),
])
# Fit the pipeline on the training predictors and transform them in one pass.
X_tr = pipeline.fit_transform(X)

In [None]:
# Bagging ensemble of 10 SVCs fitted on the prepared training features.
# The wrapped estimator is passed positionally because its keyword was
# renamed from base_estimator to estimator in newer scikit-learn releases;
# the positional form works with both names.
# np.ravel hands fit() a 1-D label vector whether y is a Series or a
# one-column DataFrame.
clf = BaggingClassifier(SVC(), n_estimators=10, random_state=0).fit(X_tr, np.ravel(y))

13. Pre-check whether first 5 rows of training set were classified correctly

In [None]:
# First five rows of the training copy, displayed as the cell output.
some_data = wines.iloc[:5]
some_data

In [None]:
# Labels of the same first five rows, for comparison with the predictions.
some_labels = y.iloc[:5]
print(some_labels)

In [None]:
# Apply the already-fitted pipeline (transform only, no refit) and predict.
some_data_prepared = pipeline.transform(some_data)
print("Predictions:", clf.predict(some_data_prepared))

14. Verify classifier accuracy

In [None]:
# Decision scores for the held-out test rows, prepared with the fitted pipeline.
y_score = clf.decision_function(pipeline.transform(test_set))

# NOTE(review): recent scikit-plot releases appear to deprecate plot_roc_curve
# in favour of plot_roc — confirm against the installed version.
skplt.metrics.plot_roc_curve(test_set['d'], y_score)
plt.show()