In [1]:
import pandas as pd
import re
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the 2013 Eurostat extract into a DataFrame.
eurostat_csv_path = './data/eurostat/eurostat-2013.csv'
eurostat = pd.read_csv(eurostat_csv_path)

In [3]:
# Rename the columns so that only the attribute identifiers remain.
# The columns at positions 4 and 5 are renamed explicitly by position.
eurostat = eurostat.rename(
    columns={eurostat.columns[4]: 'teilmF', eurostat.columns[5]: 'teilmM'}
)
# For every column: keep the text before the first space, then strip any
# parenthesised part. The pattern is a raw string so that `\(` and `\)` reach
# the regex engine as written instead of being invalid string escapes.
eurostat = eurostat.rename(
    columns=lambda s: re.sub(r'\(.*\)', '', s.split(' ', 1)[0])
)

eurostat.head()

Unnamed: 0,Nom,Code,tps00001,tec00115,teilmF,teilmM,tec00118,teimf050,tsdsc260,tet00002,tsc00001,tsc00004
0,Autriche,AT,8451860,0.3,5.3,4.9,2.1,2.17,4.1,-5683,2.84,38637
1,Belgique,BE,11161642,0.2,8.1,8.8,1.2,2.43,3.9,14145,2.24,44052
2,Bulgarie,BG,7284552,0.9,11.8,13.7,0.4,3.43,6.6,-3610,0.64,11295
3,Suisse,CH,8039060,1.9,4.6,4.1,0.1,0.9,4.4,18780,2.87,25142
4,Chypre,CY,865878,-5.4,15.5,17.5,0.4,6.0,4.9,-3229,0.46,895


In [4]:
def divide_by_population(row,
                         columns=('teilmF', 'teilmM', 'tsdsc260', 'tsc00004'),
                         population_column='tps00001'):
    """Return a copy of ``row`` with selected values divided by its population.

    Parameters
    ----------
    row : pandas.Series
        One record; it must contain ``population_column`` and every label
        listed in ``columns``.
    columns : iterable of str, optional
        Labels of the values to divide. Defaults to the four indicators that
        were originally hard-coded in this function.
    population_column : str, optional
        Label of the value used as divisor (default ``'tps00001'``).

    Returns
    -------
    pandas.Series
        A new Series; every entry not listed in ``columns`` (including the
        population itself) is left unchanged.

    Raises
    ------
    KeyError
        If ``population_column`` or one of ``columns`` is missing from ``row``.
    """
    population = row[population_column]
    # Work on a copy so that the caller's row is never modified in place.
    scaled = row.copy()
    for column in columns:
        scaled[column] = row[column] / population

    return scaled

In [6]:
# Divide the selected indicator columns by each row's population value,
# then remove the population column, which is no longer needed.
eurostat = (
    eurostat
    .apply(divide_by_population, axis=1)
    .drop(columns=['tps00001'])
)
eurostat.head()

Unnamed: 0,Nom,Code,tec00115,teilmF,teilmM,tec00118,teimf050,tsdsc260,tet00002,tsc00001,tsc00004
0,Autriche,AT,0.3,6.270809e-07,5.79754e-07,2.1,2.17,4.851003e-07,-5683,2.84,0.004571
1,Belgique,BE,0.2,7.256997e-07,7.884145e-07,1.2,2.43,3.49411e-07,14145,2.24,0.003947
2,Bulgarie,BG,0.9,1.619866e-06,1.880692e-06,0.4,3.43,9.060269e-07,-3610,0.64,0.001551
3,Suisse,CH,1.9,5.722062e-07,5.100099e-07,0.1,0.9,5.473277e-07,18780,2.87,0.003127
4,Chypre,CY,-5.4,1.790091e-05,2.02107e-05,0.4,6.0,5.658996e-06,-3229,0.46,0.001034


In [5]:
# Normalise the numerical indicators with scikit-learn's StandardScaler.
from sklearn.preprocessing import StandardScaler

numerical_columns = ['tec00115', 'teilmF', 'teilmM', 'tec00118', 'teimf050', 'tsdsc260', 'tet00002', 'tsc00001', 'tsc00004']

scaler = StandardScaler()
scaled_values = scaler.fit_transform(eurostat[numerical_columns])
X_norm = pd.DataFrame(scaled_values, columns=numerical_columns)

# Keep the country name and code aside; they are used to label the plots.
y = eurostat[['Nom', 'Code']]
X_norm.head()

Unnamed: 0,tec00115,teilmF,teilmM,tec00118,teimf050,tsdsc260,tet00002,tsc00001,tsc00004
0,-0.054714,-0.888476,-1.095353,0.882521,-0.663659,-0.605038,0.016166,1.108564,-0.391993
1,-0.104869,-0.399232,-0.317362,-0.097691,-0.51572,-0.712096,0.205697,0.462228,-0.378735
2,0.246214,0.24727,0.660113,-0.968991,0.053279,0.733183,0.035982,-1.261334,-0.458939
3,0.74776,-1.010787,-1.25494,-1.295728,-1.386287,-0.444452,0.250002,1.14088,-0.425035
4,-2.913529,0.893771,1.418155,-0.968991,1.515605,-0.176807,0.039624,-1.455235,-0.484403


In [7]:
# Principal component analysis (PCA, "ACP" in French) of the normalised data.
from sklearn.decomposition import PCA
acp = PCA(svd_solver='full')
coord = acp.fit_transform(X_norm)

# n: number of rows, p: number of columns of the normalised table
n, p = X_norm.shape

In [8]:
def plot_instances(coord, labels, x_axis, y_axis, filename):
    """Save a labelled map of the instances on one factorial plane.

    Parameters
    ----------
    coord : 2-D array
        Factor coordinates, one row per instance and one column per factor
        (here: the result of ``acp.fit_transform``).
    labels : sequence of str
        One text label per row of ``coord``.
    x_axis, y_axis : int
        Zero-based indices of the factors drawn horizontally / vertically.
    filename : str
        Path handed to ``savefig``.
    """
    plane = coord[:, [x_axis, y_axis]]

    # Symmetric limits derived from the data (10 % margin). With fixed
    # [-1, 1] limits, every instance whose coordinates fall outside that
    # square disappears, because matplotlib does not draw an annotation
    # whose anchor point lies outside the axes.
    bound = 1.1 * float(np.abs(plane).max())
    if bound == 0:
        bound = 1.0  # degenerate case: all coordinates are zero

    fig, axes = plt.subplots(figsize=(12, 12))
    axes.set_xlim(-bound, bound)
    axes.set_ylim(-bound, bound)
    for label, (x, y_value) in zip(labels, plane):
        axes.annotate(label, (x, y_value))
    # Horizontal and vertical reference lines through the origin.
    axes.plot([-bound, bound], [0, 0], color='silver', linestyle='-', linewidth=1)
    axes.plot([0, 0], [-bound, bound], color='silver', linestyle='-', linewidth=1)
    axes.set_xlabel('CP{}'.format(x_axis + 1))
    axes.set_ylabel('CP{}'.format(y_axis + 1))
    axes.set_title('Instances on CP{} / CP{}'.format(x_axis + 1, y_axis + 1))
    fig.savefig(filename)
    plt.close(fig)


# Second column of y, i.e. the 'Code' column, is used as point label.
instance_labels = y.values[:, 1]

# plot instances on the first plane (factors 1 and 2)
plot_instances(coord, instance_labels, 0, 1, 'fig/acp_instances_1st_plan_CP1_CP2')

# plot instances on the plane of factors 3 and 4
plot_instances(coord, instance_labels, 2, 3, 'fig/acp_instances_1st_plan_CP3_CP4')

In [9]:
# Scree plot: eigenvalues obtained by rescaling explained_variance_ by (n - 1) / n.
eigval = acp.explained_variance_ * (float(n - 1) / n)
factor_numbers = np.arange(1, p + 1)

fig, ax = plt.subplots()
ax.plot(factor_numbers, eigval)
ax.set_title("Scree plot")
ax.set_ylabel("Eigen values")
ax.set_xlabel("Factor number")
fig.savefig('fig/acp_eigen_values')
plt.close(fig)

# Correlations between factors and original variables: the loadings of
# factor k (row k of components_) are scaled by sqrt(eigval[k]) and stored
# as column k, which is what broadcasting over the transposed matrix does.
sqrt_eigval = np.sqrt(eigval)
corvar = acp.components_.T * sqrt_eigval
print(corvar)
# rows: variables
# columns: factors

[[-6.20323156e-01  1.78332749e-01  6.11060162e-01  2.30380573e-01
   2.67562266e-01  1.45327180e-01 -2.53379003e-01 -4.02993004e-03
   4.08245781e-04]
 [ 9.39304386e-01  6.99487007e-02 -1.12643954e-01 -9.63445355e-02
   2.08174037e-02  2.43669696e-01 -1.21394398e-01 -2.69623139e-02
  -1.24786494e-01]
 [ 9.32647352e-01  6.79436997e-02 -3.05931057e-02 -1.06016012e-01
   6.29553948e-02  3.01617795e-01 -5.11387366e-02 -2.73279584e-02
   1.22794844e-01]
 [-4.13280268e-01 -1.62359232e-01  3.05026865e-01 -8.29350697e-01
  -1.14375210e-01  6.53403702e-02 -2.18133569e-02  6.43188412e-02
  -3.33827011e-03]
 [ 8.39120540e-01  1.43911876e-01  1.53021455e-01 -7.65743220e-02
  -5.98816183e-02 -4.30830392e-01 -2.13507458e-01  1.04069598e-01
   1.64422459e-02]
 [ 3.43189486e-02  9.35032992e-01  7.71336198e-02  4.78088048e-02
  -5.75038622e-05  9.48058128e-02  1.28364444e-01  3.01289028e-01
  -7.71888598e-03]
 [ 1.00870624e-01 -8.40786525e-01 -1.52256847e-01  6.00657618e-03
   4.48520915e-01  4.4580567

In [11]:
# Draw the correlation circles with the helper from tp2_1_1.
from tp2_1_1 import correlation_circle

# Arguments shared by both circles.
circle_kwargs = dict(df=X_norm, nb_var=p, corvar=corvar, plt=plt)

# CP1 & CP2
correlation_circle(x_axis=0, y_axis=1, **circle_kwargs)
# CP3 & CP4
correlation_circle(x_axis=2, y_axis=3, **circle_kwargs)