In [104]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

# Question 2
# Abalone Datasets

In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

## Loading the dataset and normalizing it

In [78]:
# Load raw abalone dataset
abalone_df = pd.read_csv(r"C:\Users\15485\Desktop\UWaterloo_Academics\ECE657A\Assignments\Assignment2\Assignment2_Submission\abalone.csv", names = ['Sex', 'Length', 'Diameter', 'Height', 'Whole_weight', 
                      'Sucked_weight', 'Viscera_weight', 'Shell_weight', 'Rings'], sep = ',')

# Separate indep and dep features
X_abalone = abalone_df.iloc[:, 1:-1] # Removed the sex feature
y_abalone = abalone_df.iloc[:, -1]

# Normalize dataset 
sc1 = MinMaxScaler()
X_abalone = sc1.fit_transform(X_abalone)
X_abalone

array([[0.51351351, 0.5210084 , 0.0840708 , ..., 0.15030262, 0.1323239 ,
        0.14798206],
       [0.37162162, 0.35294118, 0.07964602, ..., 0.06624075, 0.06319947,
        0.06826109],
       [0.61486486, 0.61344538, 0.11946903, ..., 0.17182246, 0.18564845,
        0.2077728 ],
       ...,
       [0.70945946, 0.70588235, 0.18141593, ..., 0.3527236 , 0.37788018,
        0.30543099],
       [0.74324324, 0.72268908, 0.13274336, ..., 0.35642233, 0.34298881,
        0.29347285],
       [0.85810811, 0.84033613, 0.17256637, ..., 0.63517149, 0.49506254,
        0.49177877]])

## Applying PCA pre-processing on Abalone dataset and select first 3 principal components

In [79]:
# Apply PCA on abalone dataset
pca = PCA(n_components=3)
abalone_pca = pca.fit_transform(X_abalone)
abalone_pca_df = pd.DataFrame(data=abalone_pca, columns=['PC1', 'PC2', 'PC3'])
abalone_pca_df

Unnamed: 0,PC1,PC2,PC3
0,-0.230816,-0.026563,-0.006786
1,-0.497671,0.043791,0.003049
2,-0.068857,-0.081454,0.011720
3,-0.230997,-0.012962,0.004214
4,-0.532797,0.057362,-0.000513
...,...,...,...
4172,0.100632,-0.034549,-0.011468
4173,0.128141,-0.023082,-0.028686
4174,0.273938,0.019037,-0.025086
4175,0.262282,-0.027659,-0.045737


## Applying LDA pre-processing on Abalone dataset and select 3 linear descriptors

In [80]:
# Apply LDA on raw abalone dataset
X_abalone_lda = X_abalone
y_abalone_lda = y_abalone
lda = LinearDiscriminantAnalysis(n_components=3)
X_abalone_lda = lda.fit(X_abalone_lda, y_abalone_lda).transform(X_abalone_lda)
abalone_lda_df = pd.DataFrame(X_abalone_lda, y_abalone_lda)
abalone_lda_df

Unnamed: 0_level_0,0,1,2
Rings,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15,-0.791003,-0.235208,0.359351
7,-2.355522,0.336978,0.214024
9,0.766719,-0.246564,1.129422
10,-0.611434,0.098075,0.230542
7,-2.674301,0.527509,0.102575
...,...,...,...
11,0.921330,-0.612381,-0.272399
10,0.425796,-0.894428,-0.034727
9,1.064523,-0.385654,-0.787231
10,0.840757,-1.513723,-0.864217


# Wine Datasets

In [81]:
wine_r = pd.read_csv(r"C:\Users\15485\Desktop\UWaterloo_Academics\ECE657A\Assignments\Assignment2\Assignment2_Submission\winequality-red.csv", sep=';')
wine_r["colour"]=1
wine_w = pd.read_csv(r"C:\Users\15485\Desktop\UWaterloo_Academics\ECE657A\Assignments\Assignment2\Assignment2_Submission\winequality-white.csv", sep=';')
wine_w["colour"]=0
wine_raw = pd.concat([wine_w,wine_r], ignore_index=True)
wine_raw.head(100)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,colour
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.0010,3.00,0.45,8.8,6,0
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.9940,3.30,0.49,9.5,6,0
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.9951,3.26,0.44,10.1,6,0
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40,9.9,6,0
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40,9.9,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,7.1,0.260,0.29,12.4,0.044,62.0,240.0,0.9969,3.04,0.42,9.2,6,0
96,6.0,0.340,0.66,15.9,0.046,26.0,164.0,0.9979,3.14,0.50,8.8,6,0
97,8.6,0.265,0.36,1.2,0.034,15.0,80.0,0.9913,2.95,0.36,11.4,7,0
98,9.8,0.360,0.46,10.5,0.038,4.0,83.0,0.9956,2.89,0.30,10.1,4,0


In [82]:
# Separate indep and dep features
X_wine1 = wine_raw.iloc[:, :-2]
y_wine = wine_raw.iloc[:, -2]
X_wine = pd.concat([X_wine1, wine_raw.iloc[:, -1]], axis=1)

# Normalize dataset 
sc2 = MinMaxScaler()
X_wine = sc2.fit_transform(X_wine)
X_wine.shape, y_wine.shape

((6497, 12), (6497,))

## Applying PCA pre-processing on Wine dataset and select first 3 principal components

In [83]:
# Apply PCA on wine dataset for dimensionality reduction
pca = PCA(n_components=2)
wine_pca = pca.fit_transform(X_wine)
wine_pca_df = pd.DataFrame(data=wine_pca, columns=['PC1', 'PC2'])
wine_pca_df

Unnamed: 0,PC1,PC2
0,-0.298897,-0.337622
1,-0.241913,-0.084556
2,-0.225052,-0.036821
3,-0.290807,-0.123910
4,-0.290807,-0.123910
...,...,...
6492,0.793106,0.054298
6493,0.789921,0.151012
6494,0.790787,0.111938
6495,0.808678,0.031407


## Applying LDA pre-processing on Abalone dataset and select first 3 linear descriptors

In [84]:
# Apply LDA on raw wine dataset
X_wine_lda = X_wine
y_wine_lda = y_wine
lda = LinearDiscriminantAnalysis(n_components=2)
X_wine_lda = lda.fit(X_wine_lda, y_wine_lda).transform(X_wine_lda)
wine_lda_df = pd.DataFrame(X_wine_lda, y_wine_lda)
print(wine_lda_df.shape)
wine_lda_df

(6497, 2)


Unnamed: 0_level_0,0,1
quality,Unnamed: 1_level_1,Unnamed: 2_level_1
6,0.752078,-1.466209
6,1.445150,0.392049
6,-0.123015,0.911451
6,0.288961,-0.721769
6,0.288961,-0.721769
...,...,...
5,0.512278,-0.224430
6,-0.514707,-0.597340
6,-0.231160,-0.831907
5,0.630811,0.158871


# Abalone - raw dataset - Multimonial naive bayes:

If we apply Standardisation to the Abalone dataset, values become negative and that is not acceptable as a values to Naive Bayes classifiers. Hence, we need to use MinMaxScaler (Normalization) to scale down values only within 0 and 1. However, this will decrease the accuracy of the model.

In [85]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB

kf = StratifiedKFold(n_splits=5)
acc_list = []
acc_all = []
for train_index, test_index in kf.split(X_abalone, y_abalone):
    X_train, X_test = X_abalone[train_index], X_abalone[test_index]
    y_train, y_test = y_abalone[train_index], y_abalone[test_index]
    # Create model for every fold
    # Multinimial NB
    multi = MultinomialNB()
    multi.fit(X_train, y_train)
    y_pred = multi.predict(X_test)
    acc_list.append(accuracy_score(y_test, y_pred))

avg_acc = sum(acc_list)/5
acc_all.append(avg_acc)

The accuracy of a model on the Raw abalone dataset has significantly reduced from 26% to 16.5% with Naive Bayes compared to KNN using 10 neighbors measured in the previous assignment. While it’s likely that neither algorithm is adequate for predicting the abalone age, the KNN model is more accurate so far 

# Wine - Raw dataset - Multinomial Naive Bayes: Mean accuracy

In [86]:
# Running KNN on the Wine Raw dataset

# Separate indep and dep features
X_wine1 = wine_raw.iloc[:, :-2]
y_wine = wine_raw.iloc[:, -2]
X_wine = pd.concat([X_wine1, wine_raw.iloc[:, -1]], axis=1)

# Normalize dataset 
sc2 = MinMaxScaler(feature_range=(0, 1))
X_wine = sc2.fit_transform(X_wine)
X_wine.shape, y_wine.shape

kf = StratifiedKFold(n_splits=5)
acc_list = []

for train_index, test_index in kf.split(X_wine, y_wine):
    X_train, X_test = X_wine[train_index], X_wine[test_index]
    y_train, y_test = y_wine[train_index], y_wine[test_index]
    # Create model for every fold
    # Multinimial NB
    multi = KNeighborsClassifier(n_neighbors=10)
    multi.fit(X_train, y_train)
    y_pred = multi.predict(X_test)
    acc_list.append(accuracy_score(y_test, y_pred))

avg_acc = sum(acc_list)/5
avg_acc

0.46159223071001365

In [87]:
kf = StratifiedKFold(n_splits=5)
acc_list = []

for train_index, test_index in kf.split(X_wine, y_wine):
    X_train, X_test = X_wine[train_index], X_wine[test_index]
    y_train, y_test = y_wine[train_index], y_wine[test_index]
    # Create model for every fold
    # Multinimial NB
    multi = MultinomialNB()
    multi.fit(X_train, y_train)
    y_pred = multi.predict(X_test)
    acc_list.append(accuracy_score(y_test, y_pred))

avg_acc = sum(acc_list)/5
acc_all.append(avg_acc)

In [88]:
acc_all

[0.16495086382259405, 0.414954106709303]

KNN Algorithm has worked slightly better on the Wine (Raw) dataset compared to Multinomial Naive Bayes as the accuracy has gone down from 46.15% to an average of 41.5% accross 5-folds. A combination of Standardisation and then KNN has no significant effect on the accuracy improvement. 

# Abalone - Raw - Complement NB

In [89]:
kf = StratifiedKFold(n_splits=5)
acc_list = []
for train_index, test_index in kf.split(X_abalone, y_abalone):
    X_train, X_test = X_abalone[train_index], X_abalone[test_index]
    y_train, y_test = y_abalone[train_index], y_abalone[test_index]
    # Create model for every fold
    comp = ComplementNB()
    comp.fit(X_train, y_train)
    y_pred = comp.predict(X_test)
    acc_list.append(accuracy_score(y_test, y_pred))

avg_acc = sum(acc_list)/5
avg_acc

0.17500329484571525

# Wine dataset - raw - complement NB

In [90]:


X_wine1 = wine_raw.iloc[:, :-2]
y_wine = wine_raw.iloc[:, -2]
X_wine = pd.concat([X_wine1, wine_raw.iloc[:, -1]], axis=1)

# Normalize dataset 
sc2 = MinMaxScaler()
X_wine = sc2.fit_transform(X_wine)
X_wine.shape, y_wine.shape

kf = StratifiedKFold(n_splits=5)
acc_list = []

for train_index, test_index in kf.split(X_wine, y_wine):
    X_train, X_test = X_wine[train_index], X_wine[test_index]
    y_train, y_test = y_wine[train_index], y_wine[test_index]
    # Create model for every fold
    multi = ComplementNB()
    multi.fit(X_train, y_train)
    y_pred = multi.predict(X_test)
    acc_list.append(accuracy_score(y_test, y_pred))

avg_acc = sum(acc_list)/5
avg_acc

0.38971090187718366

## Test accuracy of Raw abalone

In [103]:
#X_abalone = abalone_df.iloc[:, 1:-1] # Removed the sex feature
#y_abalone = abalone_df.iloc[:, -1]
#abalone_pca = pca.fit_transform(X_abalone)

abalone_df = pd.read_csv(r"C:\Users\15485\Desktop\UWaterloo_Academics\ECE657A\Assignments\Assignment2\Assignment2_Submission\abalone.csv", names = ['Sex', 'Length', 'Diameter', 'Height', 'Whole_weight', 
                      'Sucked_weight', 'Viscera_weight', 'Shell_weight', 'Rings'], sep = ',')

# Separate indep and dep features
X_abalone = abalone_df.iloc[:, 1:-1] # Removed the sex feature
y_abalone = abalone_df.iloc[:, -1]
# print(y_abalone)

# Normalize dataset 
sc1 = MinMaxScaler(feature_range=(0, 1))
X_abalone = sc1.fit_transform(X_abalone)
X_abalone

# Split data into features and target variable
X = X_abalone
y = y_abalone

# Apply PCA with 3 components
pca = PCA(n_components=3)
X_pca_fit = pca.fit_transform(X)

# Normalize the values between 0 and 1 using MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
X_pca = scaler.fit_transform(X_pca_fit)

# Apply LDA with 3 components
lda = LDA(n_components=3)
X_lda_fit = lda.fit_transform(X, y)

scaler = MinMaxScaler(feature_range=(0, 1))
X_lda = scaler.fit_transform(X_lda_fit)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

mnb_clf = MultinomialNB()

cnb_clf = ComplementNB()

mnb_cv_scores = cross_val_score(mnb_clf, X_train, y_train, cv=5)
print("Cross-validation accuracy of Raw abalone using Multinomial Naive Bayes classifier: {:.2f}%".format(mnb_cv_scores.mean() * 100))

cnb_cv_scores = cross_val_score(cnb_clf, X_train, y_train, cv=5)
print("Cross-validation accuracy of Raw abalone using Complement Naive Bayes classifier: {:.2f}%".format(cnb_cv_scores.mean() * 100))

mnb_clf.fit(X_train, y_train)
mnb_test_acc = mnb_clf.score(X_test, y_test)
print("Test accuracy of Raw abalone using Multinomial Naive Bayes classifier: {:.2f}%".format(mnb_test_acc * 100))

cnb_clf.fit(X_train, y_train)
cnb_test_acc = cnb_clf.score(X_test, y_test)
print("Test accuracy of Raw abalone using Complement Naive Bayes classifier: {:.2f}%".format(cnb_test_acc * 100))


Cross-validation accuracy of Raw abalone using Multinomial Naive Bayes classifier: 16.37%
Cross-validation accuracy of Raw abalone using Complement Naive Bayes classifier: 18.14%
Test accuracy of Raw abalone using Multinomial Naive Bayes classifier: 16.99%
Test accuracy of Raw abalone using Complement Naive Bayes classifier: 19.14%


## Test accuracy of PCA processed abalone

In [92]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

mnb_clf = MultinomialNB()

cnb_clf = ComplementNB()

pca_mnb_cv_scores = cross_val_score(mnb_clf, X_train, y_train, cv=5)
print("Cross-validation accuracy of processed abalone using Multinomial Naive Bayes classifier with PCA: {:.2f}%".format(pca_mnb_cv_scores.mean() * 100))

pca_cnb_cv_scores = cross_val_score(cnb_clf, X_train, y_train, cv=5)
print("Cross-validation accuracy of processed abalone using Complement Naive Bayes classifier with PCA: {:.2f}%".format(pca_cnb_cv_scores.mean() * 100))

mnb_clf.fit(X_train, y_train)
pca_mnb_test_acc = mnb_clf.score(X_test, y_test)
print("Test accuracy of processed abalone using Multinomial Naive Bayes classifier with PCA: {:.2f}%".format(pca_mnb_test_acc * 100))

cnb_clf.fit(X_train, y_train)
pca_cnb_test_acc = cnb_clf.score(X_test, y_test)
print("Test accuracy of processed abalone using Complement Naive Bayes classifier with PCA: {:.2f}%".format(pca_cnb_test_acc * 100))


Cross-validation accuracy of processed abalone using Multinomial Naive Bayes classifier with PCA: 16.37%
Cross-validation accuracy of processed abalone using Complement Naive Bayes classifier with PCA: 18.26%
Test accuracy of processed abalone using Multinomial Naive Bayes classifier with PCA: 16.99%
Test accuracy of processed abalone using Complement Naive Bayes classifier with PCA: 17.22%


## Test accuracy of LDA processed abalone

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X_lda, y, test_size=0.2, random_state=42)

lda_mnb_cv_scores = cross_val_score(mnb_clf, X_train, y_train, cv=5)
print("Cross-validation accuracy of processed abalone using Multinomial Naive Bayes classifier with LDA: {:.2f}%".format(lda_mnb_cv_scores.mean() * 100))

lda_cnb_cv_scores = cross_val_score(cnb_clf, X_train, y_train, cv=5)
print("Cross-validation accuracy of processed abalone using Complement Naive Bayes classifier with LDA: {:.2f}%".format(lda_cnb_cv_scores.mean() * 100))

mnb_clf.fit(X_train, y_train)
lda_mnb_test_acc = mnb_clf.score(X_test, y_test)
print("Test accuracy of processed abalone using Multinomial Naive Bayes classifier LDA: {:.2f}%".format(lda_mnb_test_acc * 100))

cnb_clf.fit(X_train, y_train)
lda_cnb_test_acc = cnb_clf.score(X_test, y_test)
print("Test accuracy of processed abalone using Complement Naive Bayes classifier LDA: {:.2f}%".format(lda_cnb_test_acc * 100))

Cross-validation accuracy of processed abalone using Multinomial Naive Bayes classifier with LDA: 16.37%
Cross-validation accuracy of processed abalone using Complement Naive Bayes classifier with LDA: 23.97%
Test accuracy of processed abalone using Multinomial Naive Bayes classifier LDA: 16.99%
Test accuracy of processed abalone using Complement Naive Bayes classifier LDA: 21.53%


# Wine Dataset implementation

In [94]:
wine_r = pd.read_csv(r"C:\Users\15485\Desktop\UWaterloo_Academics\ECE657A\Assignments\Assignment2\Assignment2_Submission\winequality-red.csv", sep=';')
wine_r["colour"]=1
wine_w = pd.read_csv(r"C:\Users\15485\Desktop\UWaterloo_Academics\ECE657A\Assignments\Assignment2\Assignment2_Submission\winequality-white.csv", sep=';')
wine_w["colour"]=0
wine_raw = pd.concat([wine_w,wine_r], ignore_index=True)
wine_raw.head(100)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,colour
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.0010,3.00,0.45,8.8,6,0
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.9940,3.30,0.49,9.5,6,0
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.9951,3.26,0.44,10.1,6,0
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40,9.9,6,0
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40,9.9,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,7.1,0.260,0.29,12.4,0.044,62.0,240.0,0.9969,3.04,0.42,9.2,6,0
96,6.0,0.340,0.66,15.9,0.046,26.0,164.0,0.9979,3.14,0.50,8.8,6,0
97,8.6,0.265,0.36,1.2,0.034,15.0,80.0,0.9913,2.95,0.36,11.4,7,0
98,9.8,0.360,0.46,10.5,0.038,4.0,83.0,0.9956,2.89,0.30,10.1,4,0


In [95]:
#Split dataset into X and Y
X_wine = wine_raw.iloc[:, :-1]
y_wine = wine_raw.iloc[:, -2]
print(y)
sc_wine = MinMaxScaler(feature_range=(0, 1))
X_wine = sc_wine.fit_transform(X_wine)
print("The normalized dataset is: \n", X)

0       15
1        7
2        9
3       10
4        7
        ..
4172    11
4173    10
4174     9
4175    10
4176    12
Name: Rings, Length: 4177, dtype: int64
The normalized dataset is: 
 [[0.51351351 0.5210084  0.0840708  ... 0.15030262 0.1323239  0.14798206]
 [0.37162162 0.35294118 0.07964602 ... 0.06624075 0.06319947 0.06826109]
 [0.61486486 0.61344538 0.11946903 ... 0.17182246 0.18564845 0.2077728 ]
 ...
 [0.70945946 0.70588235 0.18141593 ... 0.3527236  0.37788018 0.30543099]
 [0.74324324 0.72268908 0.13274336 ... 0.35642233 0.34298881 0.29347285]
 [0.85810811 0.84033613 0.17256637 ... 0.63517149 0.49506254 0.49177877]]


In [96]:
# Apply PCA with 3 components
pca = PCA(n_components=3)
X_wine_pca_fit = pca.fit_transform(X_wine)

# Normalize the values between 0 and 1 using MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
X_wine_pca = scaler.fit_transform(X_wine_pca_fit)

# Apply LDA with 3 components
lda = LDA(n_components=3)
X_wine_lda_fit = lda.fit_transform(X_wine, y_wine)

# Normalize the values between 0 and 1 using MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
X_wine_lda = scaler.fit_transform(X_wine_lda_fit)

# Test Accuracy of raw wine

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X_wine, y_wine, test_size=0.2, random_state=42)

mnb_clf = MultinomialNB()

cnb_clf = ComplementNB()

mnb_cv_scores = cross_val_score(mnb_clf, X_train, y_train, cv=5)
print("Cross-validation accuracy of raw wine using Multinomial Naive Bayes classifier: {:.2f}%".format(mnb_cv_scores.mean() * 100))

cnb_cv_scores = cross_val_score(cnb_clf, X_train, y_train, cv=5)
print("Cross-validation accuracy of raw wine using Complement Naive Bayes classifier: {:.2f}%".format(cnb_cv_scores.mean() * 100))

mnb_clf.fit(X_train, y_train)
mnb_test_acc = mnb_clf.score(X_test, y_test)
print("Test accuracy of raw wine using Multinomial Naive Bayes classifier: {:.2f}%".format(mnb_test_acc * 100))

cnb_clf.fit(X_train, y_train)
cnb_test_acc = cnb_clf.score(X_test, y_test)
print("Test accuracy of raw wine using Complement Naive Bayes classifier: {:.2f}%".format(cnb_test_acc * 100))

Cross-validation accuracy of raw wine using Multinomial Naive Bayes classifier: 43.54%
Cross-validation accuracy of raw wine using Complement Naive Bayes classifier: 47.51%
Test accuracy of raw wine using Multinomial Naive Bayes classifier: 44.62%
Test accuracy of raw wine using Complement Naive Bayes classifier: 48.54%


## Test accuracy of PCA processed wine

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X_wine_pca, y_wine, test_size=0.2, random_state=42)

mnb_clf = MultinomialNB()

cnb_clf = ComplementNB()

pca_mnb_cv_scores = cross_val_score(mnb_clf, X_train, y_train, cv=5)
print("Cross-validation accuracy of processed wine using Multinomial Naive Bayes classifier with PCA: {:.2f}%".format(pca_mnb_cv_scores.mean() * 100))

pca_cnb_cv_scores = cross_val_score(cnb_clf, X_train, y_train, cv=5)
print("Cross-validation accuracy of processed wine using Complement Naive Bayes classifier with PCA: {:.2f}%".format(pca_cnb_cv_scores.mean() * 100))

mnb_clf.fit(X_train, y_train)
pca_mnb_test_acc = mnb_clf.score(X_test, y_test)
print("Test accuracy of processed wine using Multinomial Naive Bayes classifier with PCA: {:.2f}%".format(pca_mnb_test_acc * 100))

cnb_clf.fit(X_train, y_train)
pca_cnb_test_acc = cnb_clf.score(X_test, y_test)
print("Test accuracy of processed wine using Complement Naive Bayes classifier with PCA: {:.2f}%".format(pca_cnb_test_acc * 100))

Cross-validation accuracy of processed wine using Multinomial Naive Bayes classifier with PCA: 43.43%
Cross-validation accuracy of processed wine using Complement Naive Bayes classifier with PCA: 45.41%
Test accuracy of processed wine using Multinomial Naive Bayes classifier with PCA: 44.54%
Test accuracy of processed wine using Complement Naive Bayes classifier with PCA: 46.92%


## Test accuracy of LDA processed wine

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X_wine_lda, y_wine, test_size=0.2, random_state=42)

lda_mnb_cv_scores = cross_val_score(mnb_clf, X_train, y_train, cv=5)
print("Cross-validation accuracy of processed wine using Multinomial Naive Bayes classifier with LDA: {:.2f}%".format(lda_mnb_cv_scores.mean() * 100))

lda_cnb_cv_scores = cross_val_score(cnb_clf, X_train, y_train, cv=5)
print("Cross-validation accuracy of processed wine using Complement Naive Bayes classifier with LDA: {:.2f}%".format(lda_cnb_cv_scores.mean() * 100))

mnb_clf.fit(X_train, y_train)
lda_mnb_test_acc = mnb_clf.score(X_test, y_test)
print("Test accuracy of processed wine using Multinomial Naive Bayes classifier with LDA: {:.2f}%".format(lda_mnb_test_acc * 100))

cnb_clf.fit(X_train, y_train)
lda_cnb_test_acc = cnb_clf.score(X_test, y_test)
print("Test accuracy of processed wine using Complement Naive Bayes classifier with LDA: {:.2f}%".format(lda_cnb_test_acc * 100))

Cross-validation accuracy of processed wine using Multinomial Naive Bayes classifier with LDA: 43.43%
Cross-validation accuracy of processed wine using Complement Naive Bayes classifier with LDA: 0.54%
Test accuracy of processed wine using Multinomial Naive Bayes classifier with LDA: 44.54%
Test accuracy of processed wine using Complement Naive Bayes classifier with LDA: 0.15%


### Conclusion for Abalone Dataset

The above accuracies summarizes the test accuracy of two algorithms (Multinomial Naive Bayes and Complement Naive Bayes) on three versions of the abalone dataset: raw, PCA-preprocessed (with 3 principal components), and LDA-preprocessed (with 3 linear discriminants).

For Multinomial Naive Bayes, the test accuracy remains the same (16.99%) across all three versions of the dataset.

For Complement Naive Bayes, the test accuracy is highest on the LDA-preprocessed dataset (21.53%), followed by the raw dataset (19.14%), and lowest on the PCA-preprocessed dataset (17.22%). This suggests that LDA pre-processing is more effective for improving the performance of Complement Naive Bayes on the abalone dataset compared to PCA pre-processing.

### Conclusion for Wine Dataset

For the Multinomial Naive Bayes algorithm, there is not much difference in performance between the raw wine dataset and the dataset preprocessed with PCA or LDA. The accuracy remains around 44-45% for all three settings.

For the Complement Naive Bayes algorithm, the performance is significantly better on the raw wine dataset compared to the preprocessed datasets. The accuracy is around 48.5% for the raw dataset, but drops to around 47% for the dataset preprocessed with PCA, and drops even further to 0.15% for the dataset preprocessed with LDA.