# LDA

### Load the Iris flower dataset
[Iris Wikipedia](https://en.wikipedia.org/wiki/Iris_flower_data_set)



In [None]:
import pandas as pd

# Column position -> readable name of each of the four measurements.
feature_dict = dict(enumerate((
    'sepal length in cm',
    'sepal width in cm',
    'petal length in cm',
    'petal width in cm',
)))
# Numeric class label -> species name (used for the plot legend).
label_dict = {1: 'Iris-Setosa', 2: 'Iris-Versicolor', 3: 'Iris-Virginica'}

# header=None: the first line of the file is read as data, not column names.
df = pd.read_csv(
    '/home/if/ChallengeAll/machine_learning/LDA_python/datasets.csv',
    header=None,
    sep=',',
)
# Name the feature columns in position order and call the last column 'label'.
df.columns = [feature_dict[idx] for idx in sorted(feature_dict)] + ['label']
# Discard rows in which every field is missing.
df.dropna(how='all', inplace=True)
df.tail(0)


### Step 1: compute the d-dimensional mean vector of every class
### $\pmb \mu_i$, where $i = 1,2,3$

$$ \pmb \mu_i = \begin{bmatrix}
\mu_{\omega_i (\text{sepal length})}\\
\mu_{\omega_i (\text{sepal width})}\\
\mu_{\omega_i (\text{petal length})}\\
\mu_{\omega_i (\text{petal width})}\\
\end{bmatrix} \; , \quad \text{with} \quad i = 1,2,3 $$



In [None]:
import numpy as np
# Feature matrix: the first four columns, selected by POSITION. The columns
# were given string names when the data was loaded, so the integer labels
# 0..3 are not valid column keys and df[[0, 1, 2, 3]] raises KeyError on
# current pandas.
x = df.iloc[:, :4].values
y = df['label'].values

np.set_printoptions(precision=4)

# One mean vector per class; the loop assumes class labels 1, 2 and 3.
mean_vectors = []
for cl in range(1, 4):
    mean_vectors.append(np.mean(x[y == cl], axis=0))
    print('mean vector class %s: %s\n' % (cl, mean_vectors[cl - 1]))

### Step 2: compute the scatter matrices
#### Follow the LDA formulas:



#### 1. Within-class scatter $ S_w $
$$ S_W = \sum\limits_{i=1}^{c}S_i  \quad \text{with} \quad i = 1,2,3
\\ \text{where} \quad S_i = \sum\limits_{\pmb x \in D_i}^n (\pmb x - \pmb \mu_i)\;(\pmb x - \pmb \mu_i)^T
$$

#### 2. Between-class scatter $S_b$
First, we compute the overall mean vector $\pmb \mu$ over all $N$ samples:
$$ \pmb \mu = \frac{1}{N} \sum\limits_{k=1}^N \; \pmb x_k
$$

The between-class scatter matrix $S_B$ is computed as follows, where $N_i$ is the number of samples in class $i$:
$$ S_B =  \sum\limits_{i=1}^{c} N_i (\pmb \mu_i - \pmb \mu) (\pmb \mu_i - \pmb \mu)^T
$$

In [None]:
# Within-class scatter matrix: the sum, over classes, of
# sum_x (x - mu_i)(x - mu_i)^T. Sized from the data instead of a literal 4.
S_W = np.zeros((x.shape[1], x.shape[1]))
for cl, mv in zip(range(1, 4), mean_vectors):
    # Each row of `centered` is (x - mu_i) for one sample of class `cl`.
    # centered^T . centered equals the sum of the per-row outer products,
    # so one matrix product replaces the per-sample Python loop.
    centered = x[y == cl] - mv
    S_W += centered.T.dot(centered)

print('S_W:')
print(S_W)

In [None]:
# Mean of all samples, stored as a column vector for the outer products below.
overall_mean = np.mean(x, axis=0).reshape(4, 1)

# Between-class scatter: each class contributes the outer product of its
# mean's offset from the overall mean, weighted by the class size.
S_B = np.zeros((4, 4))
for cl, class_mean in enumerate(mean_vectors, start=1):
    class_size = x[y == cl, :].shape[0]
    offset = class_mean.reshape(4, 1) - overall_mean
    S_B += class_size * offset.dot(offset.T)

print('S_B:')
print(S_B)

### Step 3: compute the eigenvalues (特征值) and eigenvectors (特征向量) of the matrix $S_W^{-1}S_B$
We solve the generalized eigenvalue problem:
$$ \pmb S_b\pmb{W} =  \lambda \pmb S_w\pmb{W} \\
\pmb S_{W}^{-1}S_B\pmb{W} = \lambda\pmb{W}$$
$$ $$
$\pmb W$ is an eigenvector of $\pmb S_{W}^{-1}\pmb S_B$ with the eigenvalue $\lambda$.

In [None]:
# Eigen-decomposition of S_W^{-1} S_B; each COLUMN of eig_vecs is one
# eigenvector, paired with the eigenvalue at the same index.
eig_vals, eig_vecs = np.linalg.eig(np.dot(np.linalg.inv(S_W), S_B))

# Iterate over the transposed matrix so every step yields one eigenvector.
for rank, (val, vec) in enumerate(zip(eig_vals, eig_vecs.T), start=1):
    eigvec_sc = vec.reshape(4, 1)
    print('eigenvector: {}: {} '.format(rank, eigvec_sc.real))
    print('eigenvalue: {:}: {:.2e}'.format(rank, val.real))


### Step 4: choose the linear discriminants for the new feature space
We are interested not only in projecting the data into a subspace that improves
the class separability, but also in reducing the dimensionality of our feature space.

In [None]:
# Pair every eigenvalue magnitude with its eigenvector (a column of eig_vecs).
eig_pairs = [(np.abs(val), eig_vecs[:, idx]) for idx, val in enumerate(eig_vals)]

# Rank by magnitude only; the key keeps the sort from comparing the arrays.
eig_pairs = sorted(eig_pairs, key=lambda pair: pair[0], reverse=True)
print('Eigenvalues in decreasing order:')
for magnitude, _ in eig_pairs:
    print(magnitude)


print('Variance:')
# Normalise by the sum of the same magnitudes that are being reported.
# Dividing by the sum of the raw eigenvalues (which may be slightly negative
# or complex) would give percentages that do not add up to 100%.
eigv_sum = sum(magnitude for magnitude, _ in eig_pairs)
for rank, (magnitude, _) in enumerate(eig_pairs, start=1):
    print('eigenvalue {0:}: {1:.2%}'.format(rank, magnitude / eigv_sum))

# Projection matrix: the two leading eigenvectors placed side by side as columns.
W = np.hstack((eig_pairs[0][1].reshape(-1, 1), eig_pairs[1][1].reshape(-1, 1)))
print('W: ', W.real)

### Step5:  Transform X samples to new space
$$ \pmb X_{new} = \pmb X \times \pmb W $$



In [None]:
# Project every sample onto the columns of W.
x_lda = np.dot(x, W)
assert x_lda.shape == (150, 2)

# Put the overall mean in row form and project it with the same W.
overall_mean = np.reshape(overall_mean, (1, 4))
all_mean = np.dot(overall_mean, W)

from matplotlib import pyplot as plt

def plot_step_lda():
    """Scatter-plot the LDA-projected samples, one colour per class.

    Reads the module-level ``x_lda``, ``y``, ``W``, ``mean_vectors``,
    ``all_mean`` and ``label_dict``. Draws the projected overall mean as a
    cyan triangle, each projected class mean as a black cross, and the
    samples of classes 1-3 in red, green and blue, then shows the figure.
    """
    ax = plt.subplot(111)

    # Projected overall mean.
    ax.scatter(x=all_mean[:, 0].real, y=all_mean[:, 1].real,
               lw=2, s=40, marker='^', color='c', alpha=1)

    # Projected class means.
    for center in mean_vectors:
        lda_center = center.dot(W)
        ax.scatter(x=lda_center[0].real, y=lda_center[1].real,
                   lw=2, s=40, marker='x', color='k', alpha=1)

    for label, color in zip(range(1, 4), ('red', 'green', 'blue')):
        in_class = y == label
        # First discriminant on the x axis, SECOND on the y axis. Using
        # column 0 for both would put every point on the diagonal.
        ax.scatter(x=x_lda[in_class, 0].real,
                   y=x_lda[in_class, 1].real,
                   marker='o',
                   lw=1,
                   color=color,
                   alpha=0.5,
                   label=label_dict[label]
                   )
    leg = ax.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    ax.set_title('LDA')

    # Hide the tick marks but keep the bottom/left tick labels. Booleans are
    # used instead of the legacy "on"/"off" strings.
    ax.tick_params(axis="both", which="both", bottom=False, top=False, labelbottom=True,
                   left=False, right=False, labelleft=True)
    ax.grid()
    plt.show()
    

In [None]:
# Render the scatter plot of the projected samples.
plot_step_lda()