In [4]:
import pandas as pd

df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', 
    header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
       'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids',
       'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue',
       'OD280/OD315 of diluted wines','Proline']

In [5]:
df_wine.head(2)

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050


In [8]:
import numpy as np
print('Class labels:', np.unique(df_wine['Class label']))

Class labels: [1 2 3]


In [9]:
# data preparation
from sklearn.model_selection import train_test_split
X = df_wine.iloc[:, 1:].values
y= df_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.3, random_state=0)

In [10]:
# apply decision tree on the original dataset
from sklearn.metrics import accuracy_score,confusion_matrix,\
 classification_report
from sklearn.tree import DecisionTreeClassifier
acc = [] ### Blank vector
for i in range(1000):
  X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)# split dataset
  model_dt=DecisionTreeClassifier() # model
  dt = model_dt.fit(X_train,y_train) # fit the model
  YPred = dt.predict(X_test) # predict
  a = accuracy_score(y_test,YPred) # compute accuracy
  acc.append(a) # append accuracy
print("Accuracy using 1000 MC run:", np.mean(acc))  # average of accuracy to stablize the result


Accuracy using 1000 MC run: 0.9067222222222222


In [11]:
# implement LDA for dimension reduction and the perform dt classifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda_model = LinearDiscriminantAnalysis()
X_lda = lda_model.fit_transform(X, y)

In [12]:
X_lda.shape

(178, 2)

In [None]:
LDA=pd.DataFrame(X_lda)
LDA.columns=['lda1','lda2']

In [13]:
lda_model.explained_variance_ratio_

array([0.68747889, 0.31252111])

In [14]:
# apply dt classifier on the transformed data using LDA
from sklearn.model_selection import train_test_split 
acc = [] ### Blank vector
for i in range(1000):
  X_train,X_test,y_train,y_test=train_test_split(X_lda,y,test_size=0.3)# split dataset
  model_dt=DecisionTreeClassifier() # model
  dt = model_dt.fit(X_train,y_train) # fit the model
  YPred = dt.predict(X_test) # predict
  a = accuracy_score(y_test,YPred) # compute accuracy
  acc.append(a) # append accuracy
print("Accuracy using 1000 MC run:", np.mean(acc))  # average of accuracy to stablize the result


Accuracy using 1000 MC run: 0.9913888888888888


In [15]:
# Principal Component Analysis
from numpy import array
from sklearn.decomposition import PCA

In [16]:
# define a matrix
 
# create the PCA instance
pca = PCA(4)
# fit on data
pca.fit(X)
# access values and vectors
PCAs=pca.components_
df=pd.DataFrame(PCAs)
df_pca=df.T  # transpose the dataframe 
df_pca.columns=['pca1','pca2','pca3','pca4']
df_pca.head()

Unnamed: 0,pca1,pca2,pca3,pca4
0,0.001659,0.001203,-0.016874,-0.141447
1,-0.000681,0.002155,-0.122003,-0.16039
2,0.000195,0.004594,-0.051987,0.009773
3,-0.004671,0.02645,-0.938593,0.330965
4,0.017868,0.999344,0.02978,0.005394


In [17]:
#Variance explained by each component
print(pca.explained_variance_)

[9.92017895e+04 1.72535266e+02 9.43811370e+00 4.99117861e+00]


In [18]:
ratio_var=pca.explained_variance_/sum(pca.explained_variance_)

In [19]:
df_new=np.dot(X, df_pca)

In [20]:
# Apply dt classifier on the output of pca and evaluate the accuracy in
# 1000 monte carlo runs

from sklearn.model_selection import train_test_split
acc = [] ### Blank vector
for i in range(1000):
  X_train,X_test,y_train,y_test=train_test_split(df_new,y,test_size=0.3)# split dataset
  model_dt=DecisionTreeClassifier() # model
  dt = model_dt.fit(X_train,y_train) # fit the model
  YPred = dt.predict(X_test) # predict
  a = accuracy_score(y_test,YPred) # compute accuracy
  acc.append(a) # append accuracy
print("Accuracy using 1000 MC run:", np.mean(acc))  # average of accuracy to stablize the result


Accuracy using 1000 MC run: 0.8805
