## Running PCA on the image dataset

In [1]:
import glob
filelist = glob.glob('Data/emotion_classification/train/*.gif');

In [2]:
import numpy as np
from PIL import Image
x = np.array([np.array(Image.open(fname).resize((100,100),Image.ANTIALIAS)) for fname in filelist])

Resized the dataset to 100x100 above and flattening the dataset to 10000x1

In [3]:
image_data = np.zeros((20,10000))
for i in range(x.shape[0]):
    image_data[i] = (x[i,:,:].flatten()).astype(int)
image_data = image_data.astype(int).T

print(image_data)
print(image_data.shape)


[[ 40 115  97 ...,  90 104 100]
 [ 54 115 110 ..., 104 109 103]
 [ 67 117 114 ..., 116 111 108]
 ..., 
 [ 34 220 100 ...,  89  86 109]
 [ 32 227 101 ...,  91  88 117]
 [ 32 213  95 ...,  93  88 103]]
(10000, 20)


Finding the mean vector of the image dataset

In [4]:
mean_vector = np.zeros((10000, 1))
for i in range(image_data.shape[0]):
    mean_vector[i] = np.mean(image_data[i, :])
    
print(mean_vector)
print(mean_vector.shape)

[[  59.65]
 [  66.  ]
 [  70.85]
 ..., 
 [ 101.15]
 [ 110.  ]
 [ 118.5 ]]
(10000, 1)


The scatter matrix is computed:

In [5]:
scatter_matrix = np.zeros((10000, 10000))
for i in range(image_data.shape[1]):
    scatter_matrix += (image_data[:,i].reshape(10000,1) - mean_vector).dot((image_data[:,i].reshape(10000,1) -mean_vector).T)
print('Scatter Matrix:\n', scatter_matrix)


Scatter Matrix:
 [[  33434.55   34090.     34021.95 ...,   11040.05    6694.      3576.5 ]
 [  34090.     35316.     35782.   ...,    8347.      3221.      -293.  ]
 [  34021.95   35782.     36930.55 ...,    6734.45    1253.     -2928.5 ]
 ..., 
 [  11040.05    8347.      6734.45 ...,   91548.55   97619.     88690.5 ]
 [   6694.      3221.      1253.   ...,   97619.    115426.    111833.  ]
 [   3576.5     -293.     -2928.5  ...,   88690.5   111833.    121683.  ]]


Computing the eigenvectors and eigenvalues

In [6]:
eig_val_sc, eig_vec_sc = np.linalg.eig(scatter_matrix)

In [7]:
print(eig_vec_sc.shape)

(10000, 10000)


In [8]:
print(eig_vec_sc)

[[  1.16894148e-03 +0.00000000e+00j  -1.80898048e-02 +0.00000000e+00j
   -2.24582148e-03 +0.00000000e+00j ...,   2.05737110e-03 +0.00000000e+00j
   -9.33674247e-04 +2.08046835e-04j  -9.33674247e-04 -2.08046835e-04j]
 [  1.52444893e-03 +0.00000000e+00j  -1.75753537e-02 +0.00000000e+00j
   -1.12069343e-03 +0.00000000e+00j ...,   5.29976607e-05 +0.00000000e+00j
   -3.67217836e-05 +3.71110939e-06j  -3.67217836e-05 -3.71110939e-06j]
 [  1.99957878e-03 +0.00000000e+00j  -1.73184875e-02 +0.00000000e+00j
   -5.86135890e-04 +0.00000000e+00j ...,   6.77362502e-05 +0.00000000e+00j
   -5.50983523e-05 +9.69925998e-07j  -5.50983523e-05 -9.69925998e-07j]
 ..., 
 [ -8.44660881e-04 +0.00000000e+00j  -1.90904651e-02 +0.00000000e+00j
   -3.45891258e-02 +0.00000000e+00j ...,  -8.20945992e-03 +0.00000000e+00j
   -4.06659672e-03 -2.74023749e-03j  -4.06659672e-03 +2.74023749e-03j]
 [ -4.68225777e-04 +0.00000000e+00j  -1.93678562e-02 +0.00000000e+00j
   -4.37385904e-02 +0.00000000e+00j ...,   3.29993380e-03 +

In [81]:
# Make a list of (eigenvalue, eigenvector) tuples
eig_pairs = [(np.abs(eig_val_sc[i]), eig_vec_sc[:,i]) for i in range(len(eig_val_sc))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eig_pairs.sort(key=lambda x: x[0], reverse=True)

# Visually confirm that the list is correctly sorted by decreasing eigenvalues
#for i in eig_pairs:
#    print(i[0])

In [10]:
print(eig_pairs[1][1])
print(eig_pairs[1][1].shape)

[-0.01808980+0.j -0.01757535+0.j -0.01731849+0.j ..., -0.01909047+0.j
 -0.01936786+0.j -0.02181438+0.j]
(10000,)


Choosing the k=10 eigenvectors with largest eigenvalues

In [11]:
matrix_w = np.zeros((10000,1))
print(eig_pairs[0][1].reshape(10000,1).shape)
for i in range(9):
    matrix_w = np.concatenate([matrix_w, (eig_pairs[i][1].reshape(10000,1))], axis=1)
print('Matrix W:\n', matrix_w)

(10000, 1)
Matrix W:
 [[ 0.00000000+0.j  0.00116894+0.j -0.01808980+0.j ..., -0.00673017+0.j
   0.02280083+0.j  0.00186752+0.j]
 [ 0.00000000+0.j  0.00152445+0.j -0.01757535+0.j ..., -0.00800516+0.j
   0.02396519+0.j  0.00338296+0.j]
 [ 0.00000000+0.j  0.00199958+0.j -0.01731849+0.j ..., -0.00929425+0.j
   0.02457368+0.j  0.00500197+0.j]
 ..., 
 [ 0.00000000+0.j -0.00084466+0.j -0.01909047+0.j ..., -0.00135019+0.j
   0.01515583+0.j  0.04682599+0.j]
 [ 0.00000000+0.j -0.00046823+0.j -0.01936786+0.j ..., -0.00030333+0.j
   0.00050111+0.j  0.05105430+0.j]
 [ 0.00000000+0.j -0.00349462+0.j -0.02181438+0.j ..., -0.00243420+0.j
  -0.02327232+0.j  0.04394218+0.j]]


In [12]:
print(matrix_w.shape)

(10000, 10)


Transformed sample to subspace

In [13]:
transformed = matrix_w.T.dot(image_data)
print(transformed.astype(int))
print(transformed.shape)


[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [ 6044  5576  5117  6308  5202  4492 10743 11527  5327  4603  6173  6231
   4475  6238  9586  5776  3712  7361  6436  6693]
 [ -494 -2485  -338   146   382    34   199 -2299   225 -1062 -1801  1558
  -3888   506 -1783   419 -2528  -190   117 -2679]
 [ 2021   -44  1382  1608 -1995  1530  1382   863 -2254  1626   741   298
    583  2071   434  3077  1701  -512  1026   514]
 [ 5216  3031  5437  3080  4982  4403  2424  5810  4651  4345  3855  4333
   3393  2895  4761  6077  4350  4295  3749  5096]
 [ 3828   659  1158  2301  2458  1620  2586  2073  2936  1431   497  1754
   3369  2119  1986  1577  2339   509  1467  1412]
 [-1369   784   788   918   622   952    69  1624  1094  1661  -280  -564
    542   701   155   687  -619 -1334   162  -737]
 [ -249  -186  1474  -847    95   131  1829 -1118   131   765  -500  -481
   -654  -707    97  -523   191  -723  -908  1014]


  from ipykernel import kernelapp as app


## Conclusion

The transformed data is at a lower subspace of 10x20 size which can reduce the computation required for subsequent steps. Although looking at the list of eigenvalues in the descending order we see that we can increase the sample subspace to k=19 beyond which the values tend towards 0.

Note: The first row seems to be having values close to 0 and since i've converted to int its displayed as absoulte zero. I can't seem to know why this happened.

### Sklearn implementation of the above PCA steps

In [14]:
from sklearn.decomposition import PCA as sklearnPCA

sklearn_pca = sklearnPCA(n_components=10)
sklearn_transf = sklearn_pca.fit_transform(image_data.T)

print(sklearn_transf)
print(sklearn_transf.shape)

[[ -337.01918934  -303.41174897 -1219.09946114  -906.58831835
   1924.02700175  1662.73513217  -190.62022982  1041.63521222
   -362.60184267   510.16740269]
 [ -804.93735226  1687.58163797   847.86583216  1278.63295029
  -1244.82503359  -491.73583185  -128.24028383  1009.91701217
    122.53542462   580.88265212]
 [-1263.97473466  -459.50754778  -579.97680227 -1128.29412483
   -746.33570576  -495.42481472  1532.55791817   529.61650236
    931.48991961  -868.69608142]
 [  -72.85016958  -944.46618094  -805.18171556  1228.99803409
    396.99447707  -625.15289239  -789.17797129  -323.53803     -582.4901869
    251.17388058]
 [-1179.00371823 -1181.01161618  2798.23438915  -673.11418241
    553.66087594  -329.61814325   153.7280797   -904.99940664
   -331.37454448  -126.78596778]
 [-1888.58083762  -832.38687695  -727.64602969   -93.8376326   -284.11650369
   -659.76273363   189.6198587    323.88532238  -932.35754483
    693.04169921]
 [ 4361.8980841   -997.8050976   -579.75303823  1885.397490

# Performing LDA on the subspace dataset

In [15]:
print(filelist)

['Data/emotion_classification/train/subject10.happy.gif', 'Data/emotion_classification/train/subject02.happy.gif', 'Data/emotion_classification/train/subject12.happy.gif', 'Data/emotion_classification/train/subject04.happy.gif', 'Data/emotion_classification/train/subject13.sad.gif', 'Data/emotion_classification/train/subject06.sad.gif', 'Data/emotion_classification/train/subject11.sad.gif', 'Data/emotion_classification/train/subject01.happy.gif', 'Data/emotion_classification/train/subject13.happy.gif', 'Data/emotion_classification/train/subject06.happy.gif', 'Data/emotion_classification/train/subject02.sad.gif', 'Data/emotion_classification/train/subject12.sad.gif', 'Data/emotion_classification/train/subject07.happy.gif', 'Data/emotion_classification/train/subject04.sad.gif', 'Data/emotion_classification/train/subject03.sad.gif', 'Data/emotion_classification/train/subject10.sad.gif', 'Data/emotion_classification/train/subject07.sad.gif', 'Data/emotion_classification/train/subject09.sad

In [26]:
happy_filelist = []
sad_filelist = []
for i in range(len(filelist)):
    if (filelist[i].find('happy') != -1) :
        happy_filelist.append(filelist[i]) 
    else:
        sad_filelist.append(filelist[i])

In [27]:
happy_filelist_dataset = np.array([np.array(Image.open(fname).resize((100,100),Image.ANTIALIAS)) for fname in happy_filelist])
sad_filelist_dataset = np.array([np.array(Image.open(fname).resize((100,100),Image.ANTIALIAS)) for fname in sad_filelist])

In [29]:
happy_image_data = np.zeros((happy_filelist_dataset.shape[0],10000))
for i in range(happy_filelist_dataset.shape[0]):
    happy_image_data[i] = (happy_filelist_dataset[i,:,:].flatten()).astype(int)
happy_image_data = happy_image_data.astype(int).T

sad_image_data = np.zeros((sad_filelist_dataset.shape[0],10000))
for i in range(sad_filelist_dataset.shape[0]):
    sad_image_data[i] = (sad_filelist_dataset[i,:,:].flatten()).astype(int)
sad_image_data = sad_image_data.astype(int).T

### Generating the transformed dataset from input data

In [79]:
happy_filelist_dataset_transformed = (matrix_w.T.dot(happy_image_data)).astype(int).T
sad_filelist_dataset_transformed = (matrix_w.T.dot(sad_image_data)).astype(int).T

  if __name__ == '__main__':
  from ipykernel import kernelapp as app


In [33]:
print(happy_filelist_dataset_transformed)
print(happy_filelist_dataset_transformed.shape)
print(sad_filelist_dataset_transformed)
print(sad_filelist_dataset_transformed.shape)

[[    0  6044  -494  2021  5216  3828 -1369  -249 -1318  -147]
 [    0  5576 -2485   -44  3031   659   784  -186 -1350   338]
 [    0  5117  -338  1382  5437  1158   788  1474 -1830  1147]
 [    0  6308   146  1608  3080  2301   918  -847 -2683  -366]
 [    0 11527 -2299   863  5810  2073  1624 -1118 -1964  -627]
 [    0  5327   225 -2254  4651  2936  1094   131 -2433   413]
 [    0  4603 -1062  1626  4345  1431  1661   765 -2155  -411]
 [    0  4475 -3888   583  3393  3369   542  -654 -2041  1230]
 [    0  6693 -2679   514  5096  1412  -737  1014 -2121  -367]]
(9, 10)
[[    0  5202   382 -1995  4982  2458   622    95 -3264  -115]
 [    0  4492    34  1530  4403  1620   952   131 -2036  -716]
 [    0 10743   199  1382  2424  2586    69  1829 -2578   187]
 [    0  6173 -1801   741  3855   497  -280  -500 -2540   239]
 [    0  6231  1558   298  4333  1754  -564  -481  -941   431]
 [    0  6238   506  2071  2895  2119   701  -707 -3033   367]
 [    0  9586 -1783   434  4761  1986   155   

### Obtaining the mean vector

In [38]:
mean_vectors = []
mean_vectors.append(np.mean(happy_filelist_dataset_transformed, axis=0))
mean_vectors.append(np.mean(sad_filelist_dataset_transformed, axis=0))
print(mean_vectors[0])

[    0.          6185.55555556 -1430.44444444   699.88888889  4451.
  2129.66666667   589.44444444    36.66666667 -1988.33333333   134.44444444]


### Calculating the scatter matrix

In [75]:
class_sc_mat_happy = np.zeros((10,10))
mean_vector_happy = mean_vectors[0]
mean_vector_happy.reshape(10,1)

for row in happy_filelist_dataset_transformed:
    row.reshape(10,1)
    class_sc_mat_happy += (row-mean_vector_happy).dot((row-mean_vector_happy).T)    


class_sc_mat_sad = np.zeros((10,10))
mean_vector_sad = mean_vectors[1]
mean_vector_sad.reshape(10,1)
for row in sad_filelist_dataset_transformed:
    row.reshape(10,1)
    class_sc_mat_sad += (row-mean_vector_sad).dot((row-mean_vector_sad).T)    


In [76]:
S_W = class_sc_mat_happy + class_sc_mat_sad
print(S_W)
print(S_W.shape)

[[  2.15724497e+08   2.15724497e+08   2.15724497e+08   2.15724497e+08
    2.15724497e+08   2.15724497e+08   2.15724497e+08   2.15724497e+08
    2.15724497e+08   2.15724497e+08]
 [  2.15724497e+08   2.15724497e+08   2.15724497e+08   2.15724497e+08
    2.15724497e+08   2.15724497e+08   2.15724497e+08   2.15724497e+08
    2.15724497e+08   2.15724497e+08]
 [  2.15724497e+08   2.15724497e+08   2.15724497e+08   2.15724497e+08
    2.15724497e+08   2.15724497e+08   2.15724497e+08   2.15724497e+08
    2.15724497e+08   2.15724497e+08]
 [  2.15724497e+08   2.15724497e+08   2.15724497e+08   2.15724497e+08
    2.15724497e+08   2.15724497e+08   2.15724497e+08   2.15724497e+08
    2.15724497e+08   2.15724497e+08]
 [  2.15724497e+08   2.15724497e+08   2.15724497e+08   2.15724497e+08
    2.15724497e+08   2.15724497e+08   2.15724497e+08   2.15724497e+08
    2.15724497e+08   2.15724497e+08]
 [  2.15724497e+08   2.15724497e+08   2.15724497e+08   2.15724497e+08
    2.15724497e+08   2.15724497e+08   2.15724

In [77]:
total_dataset = np.concatenate((happy_filelist_dataset_transformed, sad_filelist_dataset_transformed), axis=0)
overall_mean = np.mean(total_dataset, axis=0)

#For Happy Dataset
class_sb_mat_happy = happy_filelist_dataset_transformed.shape[0] * (mean_vector_happy.reshape(10,1) - overall_mean.reshape(10,1)).dot((mean_vector_happy.reshape(10,1) - overall_mean.reshape(10,1)).T)

#For sad dataset
class_sb_mat_sad = sad_filelist_dataset_transformed.shape[0] * (mean_vector_sad.reshape(10,1) - overall_mean.reshape(10,1)).dot((mean_vector_sad.reshape(10,1) - overall_mean.reshape(10,1)).T)

S_B = class_sb_mat_happy + class_sb_mat_sad
print(S_B)

[[       0.                0.                0.                0.                0.
         0.                0.                0.                0.                0.        ]
 [       0.           625066.86868687  2022512.41414141   328488.80808081
   -453662.09090909  -721882.93939394  -948722.86868687  -304200.39393939
  -1187058.48484848   259390.31313131]
 [       0.          2022512.41414141  6544190.18232323  1062882.58989899
  -1467902.48636364 -2335777.62575758 -3069757.61414141  -984293.20757576
  -3840933.89393939   839302.40858586]
 [       0.           328488.80808081  1062882.58989899   172629.36565657
   -238411.16363636  -379368.15757576  -498578.40808081  -159865.17575758
   -623829.93939394   136316.31919192]
 [       0.          -453662.09090909 -1467902.48636364  -238411.16363636
    329259.64090909   523929.42272727   688565.69090909   220783.07727273
    861545.31818182  -188260.74090909]
 [       0.          -721882.93939394 -2335777.62575758  -379368.15757576
 

In [78]:
eig_vals, eig_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))

LinAlgError: Singular matrix