In [1]:
import os, shutil
import pandas as pd
import seaborn as sns
from modules import WineDb
from modules.PreprocessingFunctions import Functions as prep

# Remove the cached bytecode directory of the local `modules` package
# (presumably so that edits to those modules are recompiled on the next
# import -- confirm the intent).
#
# The path is built with os.path.join instead of a hard-coded backslash:
# a literal backslash separator only names the intended directory on Windows,
# and '\_' is an invalid escape sequence that newer Python versions warn about.
PYCACHE_DIR = os.path.join('modules', '__pycache__')
if os.path.exists(PYCACHE_DIR):
    shutil.rmtree(PYCACHE_DIR)

In [2]:
# Build the project's WineDb object. Later cells call the instance itself
# (`winedb()`) to obtain the data they pass to the preprocessing helpers.
# NOTE(review): presumably each call returns the full wine DataFrame -- confirm
# whether it is a fresh copy or a shared object.
winedb = WineDb.WineDb()

# Outlier Detection

In [3]:
# Columns handed to every preprocessing helper as the `excluded` argument.
# This list is reused by the normalization and PCA cells further down.
# NOTE(review): presumably these label columns are skipped by the outlier
# detectors -- confirm against modules.PreprocessingFunctions.
excluded = ['type', 'quality']

# Run four outlier-removal strategies, each on its own `winedb()` call, and keep
# the results under separate names (nothing is displayed by this cell).
# NOTE(review): the literal 4 is presumably the z-score cut-off -- confirm and
# consider naming it as a constant.
winedb_noout_zscore = prep.zscore_outliers(winedb(), 4, excluded)
winedb_noout_iqr = prep.iqr_outliers(winedb(), excluded)
# NOTE(review): if this helper wraps a randomized estimator (isolation forest),
# results may differ between runs unless a seed is fixed -- confirm.
winedb_noout_iso = prep.isoforest_outliers(winedb(), excluded)
winedb_noout_elliptic = prep.ellipticenvelope_outliers(winedb(), excluded)

# Normalization

In [4]:
# Display the min-max normalized data. Relies on `excluded` defined in the
# outlier-detection cell; in the rendered output the excluded columns
# (`type`, `quality`) keep their original values.
prep.minmax_norm(winedb(), excluded)

Unnamed: 0,alcohol,chlorides,citric acid,density,fixed acidity,free sulfur dioxide,pH,residual sugar,sulphates,total sulfur dioxide,volatile acidity,type,quality
0,0.202899,0.111296,0.000000,0.206092,0.297521,0.034722,0.612403,0.019939,0.191011,0.064516,0.413333,1,5
1,0.260870,0.147841,0.000000,0.186813,0.330579,0.083333,0.372093,0.030675,0.258427,0.140553,0.533333,1,5
2,0.260870,0.137874,0.024096,0.190669,0.330579,0.048611,0.418605,0.026074,0.241573,0.110599,0.453333,1,5
3,0.260870,0.109635,0.337349,0.209948,0.611570,0.055556,0.341085,0.019939,0.202247,0.124424,0.133333,1,6
4,0.202899,0.111296,0.000000,0.206092,0.297521,0.034722,0.612403,0.019939,0.191011,0.064516,0.413333,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,0.463768,0.049834,0.174699,0.077694,0.198347,0.079861,0.426357,0.015337,0.157303,0.198157,0.086667,-1,6
6493,0.231884,0.063123,0.216867,0.150183,0.231405,0.194444,0.333333,0.113497,0.134831,0.373272,0.160000,-1,5
6494,0.202899,0.053156,0.114458,0.104685,0.223140,0.100694,0.209302,0.009202,0.134831,0.241935,0.106667,-1,6
6495,0.695652,0.021595,0.180723,0.030461,0.140496,0.065972,0.480620,0.007669,0.089888,0.239631,0.140000,-1,7


# One-Hot Encoding

In [5]:
# Display the data with the `quality` column one-hot encoded.
# NOTE(review): the rendered output names the new columns `target_col__3` ...
# `target_col__9` rather than after `quality` -- looks like the helper uses a
# fixed prefix; confirm whether that is intended.
prep.one_hot_encode(winedb(), 'quality')

Unnamed: 0,alcohol,chlorides,citric acid,density,fixed acidity,free sulfur dioxide,pH,residual sugar,sulphates,target_col__3,target_col__4,target_col__5,target_col__6,target_col__7,target_col__8,target_col__9,total sulfur dioxide,type,volatile acidity
0,9.4,0.076,0.00,0.99780,7.4,11.0,3.51,1.9,0.56,0,0,1,0,0,0,0,34.0,1,0.70
1,9.8,0.098,0.00,0.99680,7.8,25.0,3.20,2.6,0.68,0,0,1,0,0,0,0,67.0,1,0.88
2,9.8,0.092,0.04,0.99700,7.8,15.0,3.26,2.3,0.65,0,0,1,0,0,0,0,54.0,1,0.76
3,9.8,0.075,0.56,0.99800,11.2,17.0,3.16,1.9,0.58,0,0,0,1,0,0,0,60.0,1,0.28
4,9.4,0.076,0.00,0.99780,7.4,11.0,3.51,1.9,0.56,0,0,1,0,0,0,0,34.0,1,0.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,11.2,0.039,0.29,0.99114,6.2,24.0,3.27,1.6,0.50,0,0,0,1,0,0,0,92.0,-1,0.21
6493,9.6,0.047,0.36,0.99490,6.6,57.0,3.15,8.0,0.46,0,0,1,0,0,0,0,168.0,-1,0.32
6494,9.4,0.041,0.19,0.99254,6.5,30.0,2.99,1.2,0.46,0,0,0,1,0,0,0,111.0,-1,0.24
6495,12.8,0.022,0.30,0.98869,5.5,20.0,3.34,1.1,0.38,0,0,0,0,1,0,0,110.0,-1,0.29


# Dimensionality reduction

In [6]:
# Display a 5-component PCA projection; `type` and `quality` (the `excluded`
# list from the outlier-detection cell) are carried through unchanged in the output.
# NOTE(review): component 0 spans a much wider range than the others in the
# rendered output, which suggests the features are not scaled before PCA --
# confirm whether normalization should be applied first.
prep.pca(winedb(), excluded=excluded, n_components=5)

Unnamed: 0,0,1,2,3,4,type,quality
0,-84.111149,-0.145117,0.025666,0.399654,-1.581266,1,5
1,-48.779427,5.847319,-0.862358,0.775057,-0.780022,1,5
2,-63.734575,-0.876416,-0.423401,0.649272,-0.880127,1,5
3,-57.481266,-0.345098,-1.008598,3.645054,0.892979,1,6
4,-84.111149,-0.145117,0.025666,0.399654,-1.581266,1,5
...,...,...,...,...,...,...,...
6492,-24.741512,-0.916209,-2.879699,-1.154000,-0.281481,-1,6
6493,57.017361,13.706750,0.011390,0.216101,-0.593701,-1,5
6494,-4.896142,0.508568,-3.964622,0.178853,-1.575022,-1,6
6495,-8.188532,-8.973281,-4.061992,-2.494224,0.807627,-1,7


# Random undersampling (imbalanced data)

In [7]:
# Display a randomly undersampled version of the data.
# NOTE(review): presumably balances the classes of `type` -- confirm which column
# the helper balances on.
# NOTE(review): no seed is passed here, so the sampled rows may change between
# runs unless the helper fixes one internally -- confirm.
prep.random_undersample(winedb())

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,6.5,0.320,0.23,1.2,0.054,39.0,208.0,0.99272,3.18,0.46,9.9,6,-1
1,6.1,0.240,0.26,1.7,0.033,61.0,134.0,0.99030,3.19,0.81,11.9,7,-1
2,6.9,0.230,0.38,8.3,0.047,47.0,162.0,0.99540,3.34,0.52,10.5,7,-1
3,5.4,0.290,0.38,1.2,0.029,31.0,132.0,0.98895,3.28,0.36,12.4,6,-1
4,6.1,0.300,0.56,2.7,0.046,46.0,184.0,0.99240,3.31,0.57,10.9,6,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3193,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,1
3194,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1
3195,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1
3196,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,1
