# Wine Type Analysis 

In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Datasets

In [112]:
### Datasets ###
# Both files use ';' as the field separator.
# The directory is kept in one constant so the notebook can be pointed at a
# different location with a single edit.
# TODO: replace this machine-specific absolute path with a project-relative one.
DATA_DIR = '/Users/tanchaud/CAB_Berlin/Project_2/Sprint_1'

red_wine = pd.read_csv(f'{DATA_DIR}/winequality-red.csv', sep=';')
white_wine = pd.read_csv(f'{DATA_DIR}/winequality-white.csv', sep=';')

## Data Wrangling ##

In [113]:
##### Data Enriching ##### ... creating new columns that will help define the targets for each model.

# Order of the quality categories, from worst to best.
QUALITY_LEVELS = ['low', 'medium', 'high']


def add_quality_and_type(frame, wine_type):
    """Return a copy of ``frame`` with ``quality_label`` and ``wine_type`` columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Wine samples; must contain a ``quality`` column.
    wine_type : str
        Constant value written to the new ``wine_type`` column.

    Returns
    -------
    pandas.DataFrame
        ``frame`` plus two columns: ``quality_label`` (pandas categorical with
        the categories in ``QUALITY_LEVELS``) and ``wine_type``.
    """
    # quality <= 5 -> 'low', quality <= 7 -> 'medium', anything else -> 'high'
    labels = frame['quality'].apply(
        lambda value: 'low' if value <= 5 else 'medium' if value <= 7 else 'high'
    )
    return frame.assign(
        # categorical dtype (specific to pandas) instead of plain strings
        quality_label=pd.Categorical(labels, categories=QUALITY_LEVELS),
        wine_type=wine_type,
    )


# The same enrichment is applied to both datasets; only the type tag differs.
red_wine = add_quality_and_type(red_wine, 'red')
white_wine = add_quality_and_type(white_wine, 'white')

# Data Cleaning #
# Remove rows that are exact duplicates of an earlier row.
red_wine = red_wine.drop_duplicates()
white_wine = white_wine.drop_duplicates()

## Exploratory Data Analysis ## 

In [None]:
# Relative number of white and red samples.
plt.pie(x=[white_wine.shape[0], red_wine.shape[0]], labels=['White Wine', 'Red Wine'], autopct='%0.0f%%')
# The title must be set before saving, otherwise it is missing from the PNG.
plt.title('Wines')
plt.savefig('wines_count.png', facecolor='w')
plt.show()

### Univariate Analysis ### 

In [None]:
# # Data Structuring #
# Stack both datasets, shuffle the rows reproducibly (fixed seed), renumber
# them, and drop the numeric `quality` column.
wines = pd.concat([red_wine, white_wine])
wines = wines.sample(frac=1, random_state=42).reset_index(drop=True)
wines = wines.drop(['quality'], axis=1)

# A bare expression in the middle of a cell is not rendered, so the preview
# has to be displayed explicitly.
display(wines.head(3))
print(wines.shape)

# The float64 columns are the features plotted below; computed once and
# shared by both loops.
features = wines.select_dtypes(include=['float64']).columns

##### ------------------ HISTOGRAMS ---------------------- #####

for feature in features:
    fig, ax = plt.subplots(figsize=(10, 7))
    wines[feature].plot.hist(bins=15, color='blue', edgecolor='black', linewidth=1.0, ax=ax)
    ax.set_xlabel(feature)
    plt.show()

#### ----------------- BOXPLOTS ----------------------- #####

# Seaborn Aesthetics Settings
# Applied once, before the figures are created, so every box plot (including
# the first one) is drawn with the same theme and font scale.
sns.set_theme(font_scale=1.5)

for feature in features:
    fig, ax = plt.subplots(figsize=(10, 7))
    # One box per wine type for the current feature.
    sns.boxplot(x=wines['wine_type'], y=wines[feature], ax=ax)
    plt.show()
        

#### Observations from Univariate Analysis: 

#### Wine Type: 
#### Red and white wines differ on every feature. Outliers are present. The feature distributions are positively skewed and the dataset is highly imbalanced.

#

#### Outlier Removal #####

In [None]:
df = red_wine
feature = 'alcohol'

# Work on a deep copy so the source frame is left untouched.
# NOTE(review): the cells that follow rebuild `wines` from `red_wine` and
# `white_wine`, so this cleaned copy is not what the later analysis uses.
df_no_outliers = df.copy(deep=True)

# Figure Settings
sns.set_theme(font_scale=1.5)


def plot_feature_by_quality(frame, column, title):
    """Box-plot ``column`` per ``quality_label`` and print the 'medium' count.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``quality_label`` and ``column``.
    column : str
        Name of the feature to plot and count.
    title : str
        Used as the plot title and printed as a heading above the count.
    """
    # A fresh figure per call; sizing a figure left over from an earlier cell
    # would have no effect on this plot.
    fig, ax = plt.subplots(figsize=(10, 7))
    # Plot Feature
    sns.boxplot(x=frame['quality_label'], y=frame[column], ax=ax)
    ax.set_title(title)
    plt.show()
    # Check Count
    print(title)
    print(frame.loc[frame['quality_label'] == 'medium', column].describe().loc['count'])


plot_feature_by_quality(df_no_outliers, feature, 'Before Outlier Removal')

# Outlier Removal
# Tukey fences: values at or beyond 1.5 * IQR outside the quartiles are
# treated as outliers.
q1 = df_no_outliers[feature].quantile(0.25)
q3 = df_no_outliers[feature].quantile(0.75)
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr
median = df_no_outliers[feature].median()

# Rows labelled 'high' are left untouched; for every other row an outlying
# value is replaced by the median (the number of rows does not change).
is_outlier = (df_no_outliers[feature] <= lower) | (df_no_outliers[feature] >= upper)
not_high = df_no_outliers['quality_label'] != 'high'
df_no_outliers.loc[is_outlier & not_high, feature] = median

plot_feature_by_quality(df_no_outliers, feature, 'After Outlier Removal')

In [None]:
# Data Structuring #
# Combine both datasets, shuffle the rows with a fixed seed, renumber them
# from zero and discard the numeric `quality` column.
wines = (
    pd.concat([red_wine, white_wine])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .drop(columns=['quality'])
)
wines

#### We chose alcohol as the feature for outlier removal because it has the highest correlation with quality. Quality is what we ultimately want a model to predict from the physicochemical properties of the wine.

In [None]:
 # --- Encoding Categorical Variables ---#

# Imported in this cell because it runs before the notebook's scikit-learn
# import cell; without it a fresh "Restart & Run All" fails with a NameError.
from sklearn.preprocessing import LabelEncoder

# Replace the text labels in `wine_type` with integer codes.
# `encoder.classes_` keeps the original labels for mapping codes back.
encoder = LabelEncoder()
wines['wine_type'] = encoder.fit_transform(wines['wine_type'])
wines['wine_type']

In [None]:
# Show the current contents of the combined `wines` frame.
wines

In [None]:
# (number of rows, number of columns) of the combined frame.
wines.shape

### Multivariate Analysis ### 

In [None]:
import matplotlib
# Reset matplotlib to its default style for this figure.
matplotlib.style.use('default')

fig, ax = plt.subplots(figsize=(10, 5))

# Correlate the numeric columns only: `quality_label` is categorical text, and
# recent pandas versions raise instead of silently skipping such columns.
wines_corrmat = wines.select_dtypes(include='number').corr()
# Getting the Upper Triangle of the co-relation matrix
# (used as a mask so each pair of variables is shown only once)
matrix = np.triu(wines_corrmat)
hm = sns.heatmap(wines_corrmat, annot=True, mask=matrix, ax=ax)
hm.set(xlabel='wine composition ', ylabel='wine composition', title="Correlation matrix of wine data\n")
plt.show()



In [None]:
from scipy import stats
import scipy.stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

features = wines.select_dtypes(include=['float64']).columns
# Groups compared by the one-way ANOVA.
quality_levels = ['low', 'medium', 'high']

for feature in features:

    # F and p values
    # One sample of the current feature per quality level.
    samples = [wines.loc[wines['quality_label'] == level, feature] for level in quality_levels]
    f_value, p_value = stats.f_oneway(*samples)
    print(feature, ' F Statistic:', f_value, '\tp-value:', p_value)

    # anova table as output
    # Ordinary Least Squares (OLS) model
    # The formula API expects a formula string plus the data frame. The feature
    # is the response and the quality label the categorical factor; Q("...")
    # quotes column names that contain spaces.
    model = ols(f'Q("{feature}") ~ C(quality_label)', data=wines).fit()
    anova_table = sm.stats.anova_lm(model, typ=1)
    # Displayed explicitly: a bare expression inside a loop renders nothing.
    display(anova_table)

# TODO: compare each F statistic with the critical value. For an upper-tail
# test at alpha = 0.05 the quantile is 0.95, with dfn = groups - 1 and
# dfd = samples - groups:
#print('Critical Value: ', scipy.stats.f.ppf(q=0.95, dfn=2, dfd=len(wines) - 3))

# ## Post-Hoc Test (Tukey's HSD) to see which labels differ #

# from bioinfokit.analys import stat

# res = stat()
# res.tukey_hsd()



# Red and White Wine Classification using Machine Learning #

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler # data normalisation with sklearn
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [None]:
# Features & Target
# Target: the `wine_type` column. Predictors: every float64 column of `wines`.
y = wines['wine_type']  # --> what you're trying to predict
X = wines.select_dtypes(include='float64')

In [None]:
# --- Data Split --- #
# 30% of the rows are held out for testing; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# --- Feature Scaling ---- #
# The scaler is fitted on the training rows only and then applied to both
# partitions.
norm = MinMaxScaler()
norm.fit(X_train)

X_train_norm, X_test_norm = (norm.transform(part) for part in (X_train, X_test))

# From here on X_train / X_test refer to the scaled arrays.
X_train, X_test = X_train_norm, X_test_norm


#### Comparing different models 

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# prepare configuration for cross validation test harness
seed = 7
scoring = 'accuracy'
# The same shuffled 5-fold split is used for every model, so it is built once.
kfold = model_selection.KFold(n_splits=5, random_state=seed, shuffle=True)

# prepare models
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    # Seeded so the tree, and therefore its CV scores, are reproducible.
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# evaluate each model in turn
results = []
names = []

for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)

    results.append(cv_results)
    names.append(name)
    # model name: mean accuracy (standard deviation across folds)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# boxplot algorithm comparison
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison using cross validation results')
ax.boxplot(results)
ax.set_xticklabels(names)
ax.set_ylabel(scoring)
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import classification_report

# Fit the chosen classifier on the training partition and predict the
# held-out rows.
model = LinearDiscriminantAnalysis().fit(X_train, y_train)
y_pred = model.predict(X_test)

#### Evaluation metrics ####

# Confusion Matrix, labelled with the classes the model saw during fitting
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()
plt.show()

# Accuracy and Balanced Accuracy
for caption, metric in (
    ('Accuracy Score: ', accuracy_score),
    ('Balanced Accuracy Score: ', balanced_accuracy_score),
):
    print(caption, metric(y_test, y_pred))

# Classification Report
report = classification_report(y_test, y_pred)
print(report)

# Kappa score
kappa = cohen_kappa_score(y_test, y_pred)
print("Cohen-Kappa score:", kappa)