In [153]:
import re

import pandas as pd
from sklearn.preprocessing import StandardScaler

In [154]:
# Load the Eurostat CSV extract into a DataFrame; the path is relative to the notebook's working directory.
eurostat_csv_path = './data/eurostat/eurostat-2013.csv'
eurostat = pd.read_csv(eurostat_csv_path)

In [155]:
# Rename columns so that each header is just its attribute identifier.
# Step 1: the columns at positions 4 and 5 are given explicit names.
# Step 2: every header is cut at its first space and any parenthesised
#         part is removed (raw-string pattern: '\(' is not a valid escape
#         in a normal string literal).
eurostat = (
    eurostat
    .rename(columns={eurostat.columns[4]: 'teilmF', eurostat.columns[5]: 'teilmM'})
    .rename(columns=lambda s: re.sub(r'\(.*\)', '', s.split(' ', 1)[0]))
)

eurostat.head()

Unnamed: 0,Nom,Code,tps00001,tec00115,teilmF,teilmM,tec00118,teimf050,tsdsc260,tet00002,tsc00001,tsc00004
0,Autriche,AT,8451860,0.3,5.3,4.9,2.1,2.17,4.1,-5683,2.84,38637
1,Belgique,BE,11161642,0.2,8.1,8.8,1.2,2.43,3.9,14145,2.24,44052
2,Bulgarie,BG,7284552,0.9,11.8,13.7,0.4,3.43,6.6,-3610,0.64,11295
3,Suisse,CH,8039060,1.9,4.6,4.1,0.1,0.9,4.4,18780,2.87,25142
4,Chypre,CY,865878,-5.4,15.5,17.5,0.4,6.0,4.9,-3229,0.46,895


In [156]:
def divide_by_population(row, population_column='tps00001',
                         columns=('teilmF', 'teilmM', 'tsdsc260', 'tsc00004')):
    """Return a copy of ``row`` with selected values divided by its population.

    Written to be used with ``DataFrame.apply(divide_by_population, axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        One record; must contain ``population_column`` and every label
        listed in ``columns``.
    population_column : str, optional
        Label of the entry holding the divisor. Defaults to ``'tps00001'``.
    columns : iterable of str, optional
        Labels of the entries to divide by the population value. Defaults
        to ``('teilmF', 'teilmM', 'tsdsc260', 'tsc00004')``.

    Returns
    -------
    pandas.Series
        A new Series; the listed entries are divided by the population
        value, all other entries (including the population itself) are
        unchanged. The input ``row`` is not modified.

    Raises
    ------
    KeyError
        If ``population_column`` or one of ``columns`` is missing from ``row``.
    """
    # pandas does not support functions that mutate the object handed to
    # ``apply``, so all changes are made on a copy.
    scaled = row.copy()
    population = scaled[population_column]

    for column in columns:
        scaled[column] = scaled[column] / population

    return scaled

In [157]:
# Run divide_by_population on every row (it divides some columns by that row's
# population value), then remove the population column from the result.
eurostat = eurostat.apply(divide_by_population, axis=1).drop(columns=['tps00001'])
eurostat.head()

Unnamed: 0,Nom,Code,tec00115,teilmF,teilmM,tec00118,teimf050,tsdsc260,tet00002,tsc00001,tsc00004
0,Autriche,AT,0.3,6.270809e-07,5.79754e-07,2.1,2.17,4.851003e-07,-5683,2.84,0.004571
1,Belgique,BE,0.2,7.256997e-07,7.884145e-07,1.2,2.43,3.49411e-07,14145,2.24,0.003947
2,Bulgarie,BG,0.9,1.619866e-06,1.880692e-06,0.4,3.43,9.060269e-07,-3610,0.64,0.001551
3,Suisse,CH,1.9,5.722062e-07,5.100099e-07,0.1,0.9,5.473277e-07,18780,2.87,0.003127
4,Chypre,CY,-5.4,1.790091e-05,2.02107e-05,0.4,6.0,5.658996e-06,-3229,0.46,0.001034


In [158]:
# Apply a normalization filter: StandardScaler.
# Each listed column is centred on its mean and scaled to unit variance;
# the statistics are fitted on this same data (fit_transform).
scaler = StandardScaler()

# Columns to standardize; all other columns are carried over unchanged.
numerical_columns = ['tec00115', 'teilmF', 'teilmM', 'tec00118', 'teimf050', 'tsdsc260', 'tet00002', 'tsc00001', 'tsc00004']

# Work on a copy so the unscaled frame stays available as `eurostat`.
normalized_eurostat = eurostat.copy()
normalized_eurostat[numerical_columns] = scaler.fit_transform(eurostat[numerical_columns])

normalized_eurostat.head()

Unnamed: 0,Nom,Code,tec00115,teilmF,teilmM,tec00118,teimf050,tsdsc260,tet00002,tsc00001,tsc00004
0,Autriche,AT,-0.054714,-0.487096,-0.488163,0.882521,-0.663659,-0.38177,0.016166,1.108564,0.722569
1,Belgique,BE,-0.104869,-0.463321,-0.442513,-0.097691,-0.51572,-0.44214,0.205697,0.462228,0.322147
2,Bulgarie,BG,0.246214,-0.247752,-0.203546,-0.968991,0.053279,-0.194493,0.035982,-1.261334,-1.213797
3,Suisse,CH,0.74776,-0.500326,-0.503422,-1.295728,-1.386287,-0.354084,0.250002,1.14088,-0.202988
4,Chypre,CY,-2.913529,3.677346,3.806666,-0.968991,1.515605,1.920175,0.039624,-1.455235,-1.545133
