In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [14]:
# Load the country indicator table from the CSV file (path is relative to the working directory).
csv_path = 'Country-data.csv'
df = pd.read_csv(csv_path)

In [15]:
# Preview the first five rows to confirm the columns were parsed as expected.
df.head(n=5)

Unnamed: 0,country,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
0,Afghanistan,90.2,10.0,7.58,44.9,1610,9.44,56.2,5.82,553
1,Albania,16.6,28.0,6.55,48.6,9930,4.49,76.3,1.65,4090
2,Algeria,27.3,38.4,4.17,31.4,12900,16.1,76.5,2.89,4460
3,Angola,119.0,62.3,2.85,42.9,5900,22.4,60.1,6.16,3530
4,Antigua and Barbuda,10.3,45.5,6.03,58.9,19100,1.44,76.8,2.13,12200


### Define X as the numerical features in the Data. Then scale the data

In [16]:
# Select the numeric indicator columns by dtype instead of by position, so the
# text column `country` is excluded wherever it sits in the file. The selected
# columns keep their original order.
X = df.select_dtypes(include='number')

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
# Standardize the features: learn each column's mean and standard deviation
# from X, then apply that scaling to X.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

### Import PCA from sklearn.decomposition and apply it to reduce the data to 2 dimensions.

In [19]:
from sklearn.decomposition import PCA

In [20]:
# Project the standardized features onto their first two principal components.
n_dims = 2
pca = PCA(n_components=n_dims)
X_pca = pca.fit_transform(X_scaled)

### Create a new Dataframe with columns Country and the new components (z1 and z2)

In [21]:
# Split the projected array into its two component columns.
z1, z2 = X_pca[:, 0], X_pca[:, 1]

In [22]:
# Assemble the 2-D representation: the country label next to its two component scores.
df_2d = pd.DataFrame({
    'Country': df['country'].values,
    'z1': z1,
    'z2': z2,
})

### We were able to convert 9 dimensions to 2 dimensions while hopefully retaining most of the information. In the next task we will check what percentage of the variance is explained by the new components z1 and z2.

### What % of the variance in the data is explained by the new dimensions z1 and z2?

In [26]:
# Share of the total variance captured by z1 and z2 together, as a whole-number percentage.
explained_pct = pca.explained_variance_ratio_.sum() * 100
round(explained_pct)

63

### Recall that z1 and z2 are linear combinations of the original dimensions, e.g. $z = \alpha_1 x_1 + \alpha_2 x_2 + \dots + \alpha_9 x_9$. Let's try to get the values of these weights $\alpha_1, \alpha_2, \dots, \alpha_9$.

In [31]:
# Component weights: one row per principal component (z1, z2), one column per input feature.
pca.components_

array([[-0.41951945,  0.28389698,  0.15083782,  0.16148244,  0.39844111,
        -0.19317293,  0.42583938, -0.40372896,  0.39264482],
       [ 0.19288394,  0.61316349, -0.24308678,  0.67182064,  0.02253553,
        -0.00840447, -0.22270674,  0.15523311, -0.0460224 ]])

In [32]:
# Take the feature names from X itself so the loading labels always follow the
# column order the scaler and PCA were fitted on. A hand-typed list could
# drift from that order and silently mislabel the weights.
features = list(X.columns)

In [33]:
# Tabulate the loadings: one row per feature, one column per principal component.
component_weights = pca.components_.T
loadings = pd.DataFrame(
    data=component_weights,
    index=features,
    columns=['z1', 'z2'],
)

In [36]:
# Rank the features from the most positive to the most negative z1 weight.
loadings.loc[:, 'z1'].sort_values(ascending=False)

life_expec    0.425839
income        0.398441
gdpp          0.392645
exports       0.283897
imports       0.161482
health        0.150838
inflation    -0.193173
total_fer    -0.403729
child_mort   -0.419519
Name: z1, dtype: float64

In [38]:
# Rank the features from the most positive to the most negative z2 weight.
loadings.loc[:, 'z2'].sort_values(ascending=False)

imports       0.671821
exports       0.613163
child_mort    0.192884
total_fer     0.155233
income        0.022536
inflation    -0.008404
gdpp         -0.046022
life_expec   -0.222707
health       -0.243087
Name: z2, dtype: float64

### z1
**Higher values of z1 represent countries with higher life expectancy and income, and with lower child mortality (`child_mort`) and total fertility (`total_fer`).**


### Hence z1 might be thought of as a variable representing a country's "Economic and health prosperity"

### z2
**Higher values of z2 represent countries with higher imports and exports, and with lower health and life expectancy.**

### Hence z2 might be thought of as a variable representing country's "Trade Activity and Overall Health"
