# Principal Component Analysis 

### Data import and preparation

In [4]:
# numpy for the numerical helpers (cumulative sums, absolute values)
import numpy as np
# pandas to load the csv files
import pandas as pandas
import sklearn
# imported explicitly: a bare `import sklearn` does not guarantee that the
# preprocessing submodule (used for sklearn.preprocessing.scale) is loaded
import sklearn.preprocessing
from sklearn.decomposition import PCA

# Load the red and the white wine tables (semicolon-separated csv files).
redwinedata = pandas.read_csv('data/winequality-red.csv', sep=';')
whitewinedata = pandas.read_csv('data/winequality-white.csv', sep=';')

# Simplified data for testing: use these two loads instead of the ones above.
#redwinedata = pandas.read_csv('data/red_onlysugar.csv', sep =';')
#whitewinedata = pandas.read_csv('data/white_onlysugar.csv', sep =';')

# Stack both tables into one frame; ignore_index=True re-indexes the rows.
# pandas.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0.
concat_data = pandas.concat([redwinedata, whitewinedata], ignore_index=True)

# Drop the quality label so it does not enter the PCA.
concat_data = concat_data.drop(columns='quality')

# Normalize the data column-wise with sklearn's scale() before fitting the PCA.
winearray = concat_data.values
winearray_norm = sklearn.preprocessing.scale(winearray)

### How many of the Principal Components should be used?

In [57]:
#https://stackoverflow.com/questions/23294616/how-to-use-scikit-learn-pca-for-features-reduction-and-know-which-features-are-d
# Fit a PCA without limiting n_components to see how much variance each
# principal component explains.
pca = PCA()
pca.fit(winearray_norm)
print("Principal Components and their explained variance ratio:")
print(pca.explained_variance_ratio_)

import plotly.plotly as py
import plotly.graph_objs as go

# Running total of the explained variance ratio over the components.
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)

trace1 = go.Scatter(
    # One x position per fitted component (1-based). Derived from the data
    # instead of a hard-coded list so x and y always have the same length.
    x=list(range(1, len(cumulative_ratio) + 1)),
    y=cumulative_ratio,
    fill='tozeroy'
)
# Layout that is actually passed to the figure: plot title, global font size
# and labelled axes.
layout = go.Layout(
    title='Cumultative explained variance ratio',
    font=dict(size=16),
    xaxis=dict(
        title='Number of principal components',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Cumulative explained variance ratio',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)

data = [trace1]
py.iplot(go.Figure(data=data, layout=layout), filename='basic-area')


Principal Components and their explained variance ratio:
[0.2754426  0.22671146 0.14148609 0.08823201 0.06544317 0.05521016
 0.04755989 0.04559184 0.03063855 0.02069961 0.00298462]


Based on this graph, my decision would be to include 4 PCs, because they explain 73% of the variance while drastically reducing the number of components.

### Feature-Composition of the most important PC
We now look at the correlation between the features and the principal components to see which of the features play into the principal components. This gives us a hint, which features we could drop for our analysis.

In [66]:
# see https://stackoverflow.com/questions/23294616/how-to-use-scikit-learn-pca-for-features-reduction-and-know-which-features-are-d
# Refit the PCA on the normalized data, this time keeping only the number of
# principal components chosen above.
N_COMPONENTS = 4
pca = PCA(n_components=N_COMPONENTS)
pca.fit(winearray_norm)

PCA(copy=True, iterated_power='auto', n_components=4, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [67]:
# For each of the first four components: take the absolute values of its
# loadings and normalize that vector with sklearn's scale() to get a better
# overview.
scaled_loadings = [
    sklearn.preprocessing.scale(np.absolute(loading))
    for loading in pca.components_[:4]
]
# Keep the individual names; the weighting cell below refers to them.
comp1, comp2, comp3, comp4 = scaled_loadings

# One row per component, one column per feature index.
df = pandas.DataFrame(scaled_loadings)
summary = df.describe().drop(['count', '25%', '50%', '75%']).round(2)
print(summary)

        0     1     2     3     4     5     6     7     8     9     10
mean  0.18 -0.01  0.16 -0.01 -0.23  0.00 -0.15 -0.32  0.29  0.30 -0.21
std   0.76  0.76  1.28  0.61  0.76  1.12  1.24  1.70  0.95  1.53  1.18
min  -0.62 -0.90 -0.92 -0.60 -1.33 -1.19 -1.09 -1.74 -0.65 -1.02 -1.27
max   1.08  0.84  1.99  0.57  0.37  1.22  1.65  2.09  1.20  2.48  1.33


Looking at the table above, the features (columns) with the highest mean over all 4 components have the biggest impact.
In this case the ranking would be: 9, 8, 0, 2, 5, 1, 3, 6, 10, 4, 7
Additionally, one could weight each PC by its explained variance ratio, because the PCs which explain more of the variance should have a bigger influence in deciding which features are important.
So let's do the same as before with the variance ratio.


In [81]:
# Weight every scaled component array by the explained variance ratio of the
# principal component it belongs to.
variance_ratios = pca.explained_variance_ratio_
weighted_loadings = [
    variance_ratios[idx] * comp
    for idx, comp in enumerate((comp1, comp2, comp3, comp4))
]
comp1_weighted, comp2_weighted, comp3_weighted, comp4_weighted = weighted_loadings

# Same summary as for the unweighted loadings, with more decimals because the
# weighted values are smaller.
df_weighted = pandas.DataFrame(weighted_loadings)
weighted_summary = df_weighted.describe().drop(['count', '25%', '50%', '75%']).round(4)
print(weighted_summary)

          0       1       2       3       4       5       6       7       8   \
mean  0.0357  0.0121 -0.0193  0.0352 -0.0188  0.0074  0.0159 -0.0428 -0.0003   
std   0.1144  0.1800  0.2259  0.1122  0.1192  0.2566  0.3065  0.3924  0.1546   
min  -0.0699 -0.2042 -0.2524 -0.0674 -0.1885 -0.2705 -0.2482 -0.4794 -0.1485   
max   0.1533  0.2300  0.2812  0.1565  0.0832  0.3360  0.4554  0.4738  0.1705   

          9       10  
mean  0.0062 -0.0312  
std   0.1635  0.2687  
min  -0.1448 -0.3495  
max   0.2189  0.3009  


Although this ranking is quite different, I will compare those two to make a decision:    
0, 3, 6, 1, 5, 9, 8, 4, 2, 10, 7 - weighted     
9, 8, 0, 2, 5, 1, 3, 6, 10, 4, 7 - unweighted    

It can therefore be argued that the features 4, 7 and 10 can be dropped (index begins at 0).
A look at the head of the data shows us which feature those represent:

In [84]:
# Show the first row together with the column names, to map the feature
# indices used above to feature names.
concat_data.head(n=1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


Therefore chlorides, density, alcohol can be dropped!
Compared with exercise a), this only partly reflects the difference between white and red wines; the PCA must also have found other "underlying" factors.