Let's use a *toy example* from the Eurobarometer to study the left-right position of citizens based on various background items.


In [23]:
import csv

## Build one numeric row per survey respondent from the Eurobarometer CSV.
##
## we need something called labels (what we try to explain) and
## something we call the features (with what we try to explain).
## Here both end up in the same row: the left-right position is stored
## as the first column, followed by the background items.

## d25: type of community. Any answer not listed here is coded as 0.
AREA_CODES = {
    'Rural area or village' : 1,
    'Small or middle sized town' : 2,
    'Large town' : 3
}

## d70: life satisfaction. An answer missing from this table raises
## KeyError, so unexpected categories are noticed instead of silently coded.
SATISFACTION_CODES = {
    'Very satisfied' : 1,
    'Fairly satisfied' : 2,
    'Not very satisfied' : 3,
    'Not at all satisfied' : 4,
    'DK' : 2.5 ## midpoint of the 1-4 scale, used as a stand-in for "don't know"
}

features = []

## `with` closes the file as soon as all rows have been read
with open('eurobaro.csv') as csv_file:

    data = csv.DictReader( csv_file )

    for row in data:

        leftright = row['d1'] ## this is TEXT
        ## strip the textual decoration around the scale value and map the
        ## non-answers (DK / Refusal) to the sentinel -1
        leftright = leftright.replace('Box', '').replace('- left', '').replace('- right', '').replace('DK', '-1').replace('Refusal', '-1')
        leftright = int( leftright.strip() )
        ## NOTE(review): -1 is stored as an ordinary number in the row below,
        ## so any numeric analysis will treat it as a real scale position.

        sex = int( row['d10'] == 'Woman' ) ## man/woman -> 0/1
        age = int( row['d11'] )

        area = AREA_CODES.get( row['d25'], 0 )

        bill_paying = int( row['d60'] == 'Most of the time' ) ## todo: add rest

        live_satisfaction = SATISFACTION_CODES[ row['d70'] ]

        features.append( [ leftright, sex, age, area, bill_paying, live_satisfaction ] )

## single-argument print(...) emits the same text under Python 2 and 3
print( 'Have %d data entries.' % len( features ) )

Have 27672 data entries.


In [28]:
import numpy
from sklearn import cluster, preprocessing

## Cluster the respondents with k-means (library default number of clusters)
## and print one line per cluster centre.

## k-means starts from randomly chosen centroids; fixing the seed makes a
## re-run of this cell reproduce the same clusters.
model = cluster.KMeans( random_state = 0 )

f = numpy.array( features )
## f = preprocessing.scale( f ) ## standardise every column to mean 0 and unit variance
## NOTE(review): scaling is disabled, so columns with a large numeric range
## weigh more in the distance computation - confirm this is intended.

model.fit( f )

## score() is the negated k-means objective, so values closer to 0 mean a tighter fit
print( model.score( f ) )

results = model.cluster_centers_

print( 'Cluster n Left-right Sex Age Area Pays bills Life satisfaction' )

## iterate over the centres that were actually fitted instead of assuming 8
for i, center in enumerate( results ):

    r = [ str( round( x, 2 ) ) for x in center ]

    print( 'Cluster %d %s' % ( i + 1, '\t'.join( r ) ) )

-551294.770595
Cluster n Left-right Sex Age Area Pays bills Life satisfaction
Cluster 1 3.83	0.59	81.54	0.98	0.07	1.93
Cluster 2 3.77	0.56	31.08	1.22	0.11	1.92
Cluster 3 3.79	0.54	56.64	1.09	0.12	2.08
Cluster 4 3.09	0.5	21.2	1.21	0.1	1.84
Cluster 5 4.0	0.56	48.45	1.09	0.12	2.04
Cluster 6 3.88	0.54	64.61	1.02	0.08	1.96
Cluster 7 3.95	0.58	39.99	1.11	0.11	1.95
Cluster 8 4.07	0.54	72.11	1.01	0.08	1.93


In [34]:
import numpy
from sklearn import cluster, preprocessing

## Repeat the clustering for k = 2 .. 9 and print, for every k, the model
## score followed by one line per cluster centre.

## the feature matrix does not depend on k, so build it once
f = numpy.array( features )
## f = preprocessing.scale( f ) ## standardise every column to mean 0 and unit variance

for k in range(2, 10):
    ## fixed seed: the same k gives the same clusters on every run
    model = cluster.KMeans( n_clusters = k, random_state = 0 )

    model.fit( f )

    ## negated k-means objective; compare across k to see how the fit improves
    print( model.score( f ) )

    results = model.cluster_centers_

    print( 'Cluster n Left-right Sex Age Area Pays bills Life satisfaction' )

    for i, center in enumerate( results ):
        r = [ str( round( x, 2 ) ) for x in center ]

        print( 'Cluster %d %s' % ( i + 1, '\t'.join( r ) ) )

-2844974.58712
Cluster n Left-right Sex Age Area Pays bills Life satisfaction
Cluster 1 3.9	0.55	65.07	1.04	0.09	1.99
Cluster 2 3.7	0.55	34.34	1.16	0.11	1.93
-1582682.79007
Cluster n Left-right Sex Age Area Pays bills Life satisfaction
Cluster 1 3.91	0.55	50.15	1.09	0.11	2.04
Cluster 2 3.55	0.54	28.76	1.2	0.11	1.9
Cluster 3 3.93	0.55	70.62	1.01	0.08	1.94
-1079988.08729
Cluster n Left-right Sex Age Area Pays bills Life satisfaction
Cluster 1 3.84	0.54	59.8	1.06	0.1	2.02
Cluster 2 3.44	0.53	25.79	1.21	0.11	1.88
Cluster 3 3.99	0.56	74.87	1.0	0.08	1.93
Cluster 4 3.94	0.57	43.02	1.11	0.11	1.99
-827918.105539
Cluster n Left-right Sex Age Area Pays bills Life satisfaction
Cluster 1 3.99	0.57	77.1	1.0	0.08	1.93
Cluster 2 3.89	0.57	37.28	1.14	0.11	1.94
Cluster 3 3.9	0.56	50.52	1.09	0.12	2.05
Cluster 4 3.33	0.52	23.6	1.23	0.11	1.87
Cluster 5 3.87	0.54	64.04	1.04	0.09	1.98
-690767.235763
Cluster n Left-right Sex Age Area Pays bills Life satisfaction
Cluster 1 3.99	0.54	68.52	1.02	0.08	1.94
Cluste