In [225]:
import numpy as np
from numpy.linalg import inv
from numpy.linalg import det
import pandas as pd
from sklearn.model_selection import train_test_split

## Step 1: Load Data

In [226]:
df = pd.read_table('wine.data',sep = ",",header = None, names = ['Class','Alcohol','Malic acid','Ash','Alcalinity of ash','Magnesium','Total phenols','Flavanoids','Nonflavanoid phenols','Proanthocyanins','Color intensity','Hue','OD','Proline'])
df.head()

Unnamed: 0,Class,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


## Step 2: Split data
### 50% training, 50% testing

In [227]:
data = df.iloc[:,1:14]
label = df.iloc[:,0]
data_train, data_test, label_train, label_test = train_test_split(data, label, test_size = 0.5, random_state = 1)
data_train

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD,Proline
88,11.64,2.06,2.46,21.6,84,1.95,1.69,0.48,1.35,2.80,1.00,2.75,680
102,12.34,2.45,2.46,21.0,98,2.56,2.11,0.34,1.31,2.80,0.80,3.38,438
46,14.38,3.59,2.28,16.0,102,3.25,3.17,0.27,2.19,4.90,1.04,3.44,1065
62,13.67,1.25,1.92,18.0,94,2.10,1.79,0.32,0.73,3.80,1.23,2.46,630
74,11.96,1.09,2.30,21.0,101,3.38,2.14,0.13,1.65,3.21,0.99,3.13,886
...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,12.70,3.55,2.36,21.5,106,1.70,1.20,0.17,0.84,5.00,0.78,1.29,600
137,12.53,5.51,2.64,25.0,96,1.79,0.60,0.63,1.10,5.00,0.82,1.69,515
72,13.49,1.66,2.24,24.0,87,1.88,1.84,0.27,1.03,3.74,0.98,2.78,472
140,12.93,2.81,2.70,21.0,96,1.54,0.50,0.53,0.75,4.60,0.77,2.31,600


## Step 3: Collect data 
### 將training set中相同類別的資料放在一起

In [228]:
train_all = pd.concat([label_train, data_train], axis=1)
train_set = train_all.groupby("Class")
train_set.groups

{1: Int64Index([46, 10, 34, 32, 38, 27, 23,  9, 15, 41, 52, 26, 43, 24,  3, 49, 30,
              8,  1, 57, 22,  7, 50, 20, 25, 37],
            dtype='int64'),
 2: Int64Index([ 88, 102,  62,  74,  92, 116, 119, 125, 110,  83, 100, 103,  67,
             104, 122,  87,  98, 111,  64,  82, 109,  80,  76, 121, 115,  60,
             128,  61,  63,  86,  96,  68, 101,  71, 129,  79,  72],
            dtype='int64'),
 3: Int64Index([176, 145, 154, 177, 150, 132, 153, 167, 158, 143, 149, 131, 169,
             155, 174, 141, 142, 157, 156, 139, 146, 134, 144, 133, 137, 140],
            dtype='int64')}

## Step 4: Calculate parameters
### 計算每個label的參數
### (probability, mean, covariance matrix, inverse of covariance matrix, determinant of covariance matrix)

In [229]:
# calculate probability
probability = train_set.size() / 89

# calculate mean
mean = train_set.mean()

# calculate the covariance matrix
cov1 = train_set.get_group(1).iloc[:,1:].cov()
cov2 = train_set.get_group(2).iloc[:,1:].cov()
cov3 = train_set.get_group(3).iloc[:,1:].cov()

# calculate the inverse of covariance matrix
inv_cov1 = pd.DataFrame(np.linalg.pinv(cov1.values), cov1.columns, cov1.index)
inv_cov2 = pd.DataFrame(np.linalg.pinv(cov1.values), cov1.columns, cov1.index)
inv_cov3 = pd.DataFrame(np.linalg.pinv(cov1.values), cov1.columns, cov1.index)

# calculate the determinant of covariance matrix
det_cov1 = np.linalg.det(inv_cov1)
det_cov2 = np.linalg.det(inv_cov2)
det_cov3 = np.linalg.det(inv_cov3)


## Step 5: Predict test data
### 代入test data，計算每個label可能的機率，並預測結果

In [230]:
lnP1 = np.log(probability.iloc[0])
lnP2 = np.log(probability.iloc[1])
lnP3 = np.log(probability.iloc[2])

ln_det1 = np.log(det_cov1)
ln_det2 = np.log(det_cov2)
ln_det3 = np.log(det_cov3)

#建立result陣列存放預測結果

predict = np.zeros(89, dtype=int)
i = 0

for index, row in data_test.iterrows():
    label1 = -lnP1 + np.transpose(row-mean.iloc[0]).dot(inv_cov1).dot(row-mean.iloc[0])/2 + ln_det1 / 2
    label2 = -lnP2 + np.transpose(row-mean.iloc[1]).dot(inv_cov2).dot(row-mean.iloc[1])/2 + ln_det2 / 2
    label3 = -lnP3 + np.transpose(row-mean.iloc[2]).dot(inv_cov3).dot(row-mean.iloc[2])/2 + ln_det3 / 2
    item = min(label1, label2, label3)
    if item == label1:
        predict[i] = 1
        i += 1
    if item == label2:
        predict[i] = 2
        i += 1
    if item == label3:
        predict[i] = 3
        i += 1
print("Predict:", predict)

Predict: [3 2 1 2 1 3 2 1 3 2 1 1 2 1 1 2 3 1 2 1 1 2 1 2 1 3 1 1 1 3 2 3 3 1 2 2 2
 1 2 1 1 2 3 1 1 1 2 1 1 1 2 3 3 1 2 1 1 1 3 2 1 1 3 2 3 1 2 1 2 1 3 3 3 3
 2 2 1 3 1 2 3 1 2 1 3 2 1 1 2]


## Step 6: Calculate accuracy

In [231]:
answer = label_test.to_numpy()
result = np.array (predict == answer)

correct = 0
for i in range(len(result)):
    if result[i] == True:
        correct += 1

accuracy = correct / len(result)

print("Result:",result)
print(f"predict accuracy: {round(accuracy*100,3)}%")

Result: [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True False  True  True  True  True  True  True  True False  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True False  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True False  True False  True  True
 False  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True False  True  True]
predict accuracy: 92.135%
