# Chapter 10: Machine Learning

In [42]:
import pandas as pd
ess = pd.read_csv('ess.csv')
print(ess.shape)

(44387, 534)


  ess = pd.read_csv('ess.csv')


In [43]:
print(ess.loc[:,'happy'].head())

0    5
1    5
2    8
3    8
4    5
Name: happy, dtype: int64


In [44]:
ess = ess.loc[ess['sclmeet'] <= 10,:].copy()
ess = ess.loc[ess['rlgdgr'] <= 10,:].copy()
ess = ess.loc[ess['hhmmb'] <= 50,:].copy()
ess = ess.loc[ess['netusoft'] <= 5,:].copy()
ess = ess.loc[ess['agea'] <= 200,:].copy()
ess = ess.loc[ess['health'] <= 5,:].copy()
ess = ess.loc[ess['happy'] <= 10,:].copy()
ess = ess.loc[ess['eduyrs'] <= 100,:].copy()

In [45]:
print(ess.loc[ess['eduyrs'] <= 100,:].copy())

            name  essround  edition    proddate  idno cntry  nwspol  netusoft  \
0      ESS8e02_1         8      2.1  01.12.2018     1    AT     120         4   
1      ESS8e02_1         8      2.1  01.12.2018     2    AT     120         5   
2      ESS8e02_1         8      2.1  01.12.2018     4    AT      30         2   
3      ESS8e02_1         8      2.1  01.12.2018     6    AT      30         5   
4      ESS8e02_1         8      2.1  01.12.2018    10    AT      30         5   
...          ...       ...      ...         ...   ...   ...     ...       ...   
44382  ESS8e02_1         8      2.1  01.12.2018  1303    SI      90         2   
44383  ESS8e02_1         8      2.1  01.12.2018  1304    SI      40         5   
44384  ESS8e02_1         8      2.1  01.12.2018  1305    SI     240         5   
44385  ESS8e02_1         8      2.1  01.12.2018  1306    SI       0         5   
44386  ESS8e02_1         8      2.1  01.12.2018  1307    SI      60         5   

       netustm  ppltrst  ..

In [46]:
# Binary split: happiness as a function of social activity
import numpy as np
social = list(ess.loc[:,'sclmeet'])
happy = list(ess.loc[:,'happy'])
low_social_happiness = [hap for soc,hap in zip(social,happy) if soc <= 5]
high_social_happiness = [hap for soc,hap in zip(social,happy) if soc > 5]

meanlower = np.mean(low_social_happiness)
meanhigher = np.mean(high_social_happiness)

In [47]:
print('The average happiness of someone with low social activity is ' + str(round(meanlower,1)))
print('The average happiness of someone with high social activity is ' + str(round(meanhigher,1)))

The average happiness of someone with low social activity is 7.2
The average happiness of someone with high social activity is 7.8


In [48]:
# Error prediction. Sum of errors approaching zero is better.
lowererrors = [abs(lowhappy - meanlower) for lowhappy in low_social_happiness]
highererrors = [abs(highhappy - meanhigher) for highhappy in low_social_happiness]

total_error = sum(lowererrors) + sum(highererrors)
print(total_error)

76229.87414197999


In [49]:
# Check every possible split point to find which leads to the lowest error.
def get_splitpoint(allvalues,predictedvalues):
    lowest_error = float('inf')
    best_split = None
    best_lowermean = np.mean(predictedvalues)
    best_highermean = np.mean(predictedvalues)
    for pctl in range(0,100):    # "pctl" = "percentile"
        split_candidate = np.percentile(allvalues,pctl)    # The pctl-th percentile of the data
        
        loweroutcomes = [outcome for value,outcome in zip(allvalues,predictedvalues) if value <= split_candidate]    # Create a list of the happiness levels of people whose sclmeet <= the split candidate
        higheroutcomes = [outcome for value, outcome in zip(allvalues,predictedvalues) if value > split_candidate]    # Create a list of the happiness levels of people whose sclmeet > the split candidate
        
        if np.min([len(loweroutcomes),len(higheroutcomes)]) > 0:
            meanlower = np.mean(loweroutcomes)
            meanhigher = np.mean(higheroutcomes)
            
            lowererrors = [abs(outcome - meanlower) for outcome in loweroutcomes]    # Check the errors from using that split candidate
            highererrors = [abs(outcome - meanhigher) for outcome in higheroutcomes]
            
            total_error = sum(lowererrors) + sum(highererrors)
            
            if total_error < lowest_error:
                best_split = split_candidate
                lowest_error = total_error
                best_lowermean = meanlower
                best_highermean = meanhigher
    return(best_split,lowest_error,best_lowermean,best_highermean)

In [50]:
# Find the best split point for happiness as a function of the number of household members
allvalues = list(ess.loc[:,'hhmmb'])
predictedvalues = list(ess.loc[:,'happy'])
print(get_splitpoint(allvalues,predictedvalues))

(1.0, 60860.029867951016, 6.839403436723225, 7.620055170794695)


## Choosing splitting variables



In [51]:
# Iterate over each available variable and check whether splitting on that variable produces the smallest error.

def getsplit(data,variables,outcome_variable):
    best_var = ''
    lowest_error = float('inf')
    best_split = None
    predictedvalues = list(data.loc[:,outcome_variable])
    best_lowermean = -1
    best_highermean = -1
    for var in variables:    # Iterates get_splitpoint() over a list of variables
        allvalues = list(data.loc[:,var])
        splitted = get_splitpoint(allvalues,predictedvalues)
        
        if(splitted[1] < lowest_error):
            best_split = splitted[0]
            lowest_error = splitted[1]
            best_var = var
            best_lowermean = splitted[2]
            best_highermean = splitted[3]
            
    generated_tree = [[best_var,float('-inf'),best_split,best_lowermean],[best_var,best_split,float('inf'),best_highermean]]
    
    return(generated_tree)

In [52]:
# Find the split points of happiness as a function of a series of variables
# Recall that getsplit() optimizes for lowest error

variables = ['rlgdgr','hhmmb','netusoft','agea','eduyrs']
outcome_variable = 'happy'
print(getsplit(ess,variables,outcome_variable))

[['netusoft', -inf, 4.0, 7.041597337770383], ['netusoft', 4.0, inf, 7.73042471042471]]


In [53]:
# getsplit() 2.0: Generates tree to a certain depth

maxdepth = 3
def getsplit(depth,data,variables,outcome_variable):
    best_var = ''
    lowest_error = float('inf')
    best_split = None
    predictedvalues = list(data.loc[:,outcome_variable])
    best_lowermean = -1
    best_highermean = -1
    for var in variables:    # Iterates get_splitpoint() over a list of variables
        allvalues = list(data.loc[:,var])
        splitted = get_splitpoint(allvalues,predictedvalues)
        
        if(splitted[1] < lowest_error):
            best_split = splitted[0]
            lowest_error = splitted[1]
            best_var = var
            best_lowermean = splitted[2]
            best_highermean = splitted[3]
            
    generated_tree = [[best_var,float('-inf'),best_split,[]],[best_var,best_split,float('inf'),[]]]    # Now adds empty lists to generated_tree rather than means. Lists will contain additional branches. 
    
    if depth < maxdepth:
        splitdata1=data.loc[data[best_var] <= best_split,:]
        splitdata2=data.loc[data[best_var] > best_split,:]
        if len(splitdata1.index) > 10 and len(splitdata2.index) > 10:
            generated_tree[0][3] = getsplit(depth + 1,splitdata1,variables,outcome_variable)
            generated_tree[1][3] = getsplit(depth + 1,splitdata2,variables,outcome_variable)
        else:
            depth = maxdepth + 1
            generated_tree[0][3] = best_lowermean
            generated_tree[1][3] = best_highermean
    else:
        generated_tree[0][3] = best_lowermean
        generated_tree[1][3] = best_highermean
    return(generated_tree)

In [54]:
# Creates a decision tree with a depth of two.
variables = ['rlgdgr','hhmmb','netusoft','agea','eduyrs']
outcome_variable = 'happy'
maxdepth = 2
print(getsplit(0,ess,variables,outcome_variable))

[['netusoft', -inf, 4.0, [['hhmmb', -inf, 4.0, [['agea', -inf, 15.0, 8.035714285714286], ['agea', 15.0, inf, 6.997666564322997]]], ['hhmmb', 4.0, inf, [['eduyrs', -inf, 11.0, 7.263969171483622], ['eduyrs', 11.0, inf, 8.0]]]]], ['netusoft', 4.0, inf, [['hhmmb', -inf, 1.0, [['agea', -inf, 66.0, 7.135361428970136], ['agea', 66.0, inf, 7.621993127147766]]], ['hhmmb', 1.0, inf, [['rlgdgr', -inf, 5.0, 7.743893678160919], ['rlgdgr', 5.0, inf, 7.9873320537428025]]]]]]


In [55]:
# Creates a decision tree with a depth of three. Note the additional variable.
variables = ['rlgdgr','hhmmb','netusoft','agea','eduyrs','health']
outcome_variable = 'happy'
maxdepth = 3
print(getsplit(0,ess,variables,outcome_variable))

[['health', -inf, 2.0, [['hhmmb', -inf, 1.0, [['health', -inf, 1.0, [['agea', -inf, 49.0, 7.782231852654387], ['agea', 49.0, inf, 7.992150706436421]]], ['health', 1.0, inf, [['agea', -inf, 66.0, 7.055858882822344], ['agea', 66.0, inf, 7.723076923076923]]]]], ['hhmmb', 1.0, inf, [['netusoft', -inf, 3.0, [['agea', -inf, 61.0, 7.4860335195530725], ['agea', 61.0, inf, 7.9896719319562575]]], ['netusoft', 3.0, inf, [['agea', -inf, 29.0, 7.944661921708185], ['agea', 29.0, inf, 7.9972535414859784]]]]]]], ['health', 2.0, inf, [['health', -inf, 3.0, [['netusoft', -inf, 1.0, [['rlgdgr', -inf, 5.0, 6.410942956926659], ['rlgdgr', 5.0, inf, 7.04384133611691]]], ['netusoft', 1.0, inf, [['agea', -inf, 78.0, 6.998687147170802], ['agea', 78.0, inf, 7.735042735042735]]]]], ['health', 3.0, inf, [['health', -inf, 4.0, [['eduyrs', -inf, 13.0, 6.002436647173489], ['eduyrs', 13.0, inf, 6.356462585034014]]], ['health', 4.0, inf, [['hhmmb', -inf, 1.0, 4.2552301255230125], ['hhmmb', 1.0, inf, 5.56824512534819]]]

## Evaluating the decision tree

In [63]:
# Determine someone's predicted level of happiness based on their ESS response

def get_prediction(observation,tree):
    j = 0
    keepgoing = True
    prediction = -1
    while(keepgoing):
        j = j + 1
        variable_tocheck = tree[0][0]
        bound1 = tree[0][1]
        bound2 = tree[0][2]
        bound3 = tree[1][2]
        if observation.loc[variable_tocheck] < bound2:
            tree = tree[0][3]
        else:
            tree = tree[1][3]
        if isinstance(tree,float):
            keepgoing = False
            prediction = tree
    return(prediction)

predictions=[]
outcome_variable = 'happy'
maxdepth = 4
thetree = getsplit(0,ess,variables,outcome_variable)
for k in range(0,20):
    observation = ess.loc[k,:]
    predictions.append(get_prediction(observation,thetree))
print(predictions)

[6.056947608200455, 6.056947608200455, 6.800675675675675, 6.9994697773064685, 7.972410568155249, 6.9994697773064685, 6.9994697773064685, 6.9994697773064685, 7.972410568155249, 6.9994697773064685, 6.9994697773064685, 6.9994697773064685, 6.00487567040468, 6.9994697773064685, 6.00487567040468, 7.972410568155249, 7.999195847649682, 6.9994697773064685, 6.9994697773064685, 6.9994697773064685]


In [65]:
predictions = []
for k in range(0,len(ess.index)):
    observation = ess.loc[k,:]
    predictions.append(get_prediction(observation,thetree))
ess.loc[:,'predicted'] = predictions
errors = abs(ess.loc[:,'predicted'] - ess.loc[:,'happy'])
print(np.mean(errors))

KeyError: 20

In [64]:
# Define test and training data

import numpy as np
np.random.seed(512)
ess_shuffled = ess.reindex(np.random.permutation(ess.index)).reset_index(drop = True)    # Use numpy to shuffle the data
training_data = ess_shuffled.loc[0:37000,:]
test_data = ess_shuffled.loc[37001:,:].reset_index(drop = True)

thetree = getsplit(0,training_data,variables,outcome_variable)

predictions = []
for k in range(0,len(test_data.index)):
    observation = test_data.loc[k,:]
    predictions.append(get_prediction(observation,thetree))
test_data.loc[:,'predicted'] = predictions
errors = abs(test_data.loc[:,'predicted'] - test_data.loc[:,'happy'])
print(np.mean(errors))

1.4209904647906406
