# Introduction to Ensemble Methods

## Load Libraries

In [1]:
%matplotlib inline

import sys
import io
import requests
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib as mpl
import matplotlib.pyplot as plt

# Fix one seed for the whole notebook so repeated runs give the same output
random_state = 1000
np.random.seed(seed=random_state)

## Load Data

### Data Description

The Bureau of Labor Statistics administers a National Longitudinal Survey of Youth that tracks individuals over the course of their lives.

The 1979 (NLSY79) cohort includes men and women born in the USA between 1957 and 1964.

In the data set, each row represents an individual who participates in the 1979 cohort.

| Attribute      | Definition                                                                                   |
| -------------- | -------------------------------------------------------------------------------------------- |
| ID             | Participant identifier assigned by BLS                                                       |
| Earnings       | Annual wage earnings in 2014                                                                 |
| WeeksWorked    | Count of weeks worked in 2014                                                                |
| CumWeeksWorked | Cumulative weeks of work experience during study                                             |
| Education      | Years of education as of 2014                                                                |
| IQ             | Percentile on IQ test taken in 1979                                                          |
| Gender         | Participant's gender                                                                         |
| MSA            | Did the participant reside within an urban cluster or urbanized area in 2014?                |
| Library        | Did the participant, or someone in the participant's household, have a library card in 1979? |
| Esteem         | Score on the Rosenberg Self-Esteem Scale in 1979                                             |

In [2]:
# Download the NLSY79 CSV and parse it into a DataFrame
url = 'https://raw.githubusercontent.com/natecraig/aiml/main/Data/nlsy.csv'
# Bound the wait so a stalled connection cannot hang the notebook
response = requests.get(url, timeout=30)
# Stop here on an HTTP error (e.g. 404) instead of handing an error page
# to the CSV parser
response.raise_for_status()
download = response.content
df = pd.read_csv(io.StringIO(download.decode('utf-8')))
df.head()

Unnamed: 0,ID,Earnings,WeeksWorked,CumWeeksWorked,Education,IQ,Gender,MSA,Library,Esteem
0,83,0,3,553.0,10,2,Male,Yes,Yes,18
1,84,0,3,107.0,12,9,Male,Yes,Yes,20
2,87,11500,5,759.0,12,5,Male,Yes,Yes,22
3,105,20501,11,979.0,12,84,Female,Yes,Yes,19
4,178,59000,46,1010.0,16,99,Male,No,Yes,24


## Build an Ensemble

In [3]:
# Return the education category label as a function of years of education
def education_category(years_education):
    """Map years of education to an education-category label.

    Parameters
    ----------
    years_education : int or float
        Years of education; may be missing (NaN / None).

    Returns
    -------
    str or float
        '1 High School' for fewer than 16 years, '2 Undergraduate' for
        exactly 16 years, '3 Graduate' for more than 16 years, and
        ``np.nan`` when the input is missing. The labels carry a 1-3
        prefix, so sorting them alphabetically keeps them in ascending order.
    """
    # A NaN fails both comparisons below and would otherwise fall through
    # to '3 Graduate'; propagate the missing value instead.
    if pd.isna(years_education):
        return np.nan
    if years_education < 16:
        return '1 High School'
    if years_education == 16:
        return '2 Undergraduate'
    return '3 Graduate'


# Label every participant with the category matching their years of education
df['EducationCategory'] = df['Education'].map(education_category)
# Share of participants that falls into each category
df['EducationCategory'].value_counts(normalize=True)

1 High School      0.830679
2 Undergraduate    0.113160
3 Graduate         0.056161
Name: EducationCategory, dtype: float64

In [4]:
# Features are Earnings and WeeksWorked; the target is EducationCategory
X = df.loc[:, ['Earnings', 'WeeksWorked']]
y = df.loc[:, 'EducationCategory']

# Hold out half of the rows for testing, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    stratify=y,
    random_state=random_state,
)

In [5]:
# Ensemble members: two decision trees (depth 1 and depth 2)
# and a 1-nearest-neighbor classifier
treea_clf = DecisionTreeClassifier(random_state=random_state, max_depth=1)
treeb_clf = DecisionTreeClassifier(random_state=random_state, max_depth=2)
knn_clf = KNeighborsClassifier(n_neighbors=1)

# Name -> estimator; insertion order sets the column order of the predictions
classifiers = dict(treea=treea_clf, treeb=treeb_clf, knn=knn_clf)

In [6]:
# Fit every classifier on the training split, keep its test-set predictions
# as a column of dfpred, and report its test accuracy
dfpred = pd.DataFrame()
for name, clf in classifiers.items():
    dfpred[name] = clf.fit(X_train, y_train).predict(X_test)
    print('{:<24} {:.3f}'.format(name, accuracy_score(y_test, dfpred[name])))


treea                    0.879
treeb                    0.874
knn                      0.874


In [7]:
# Peek at the first three rows of per-classifier predictions
dfpred.head(3)

Unnamed: 0,treea,treeb,knn
0,2 Undergraduate,3 Graduate,3 Graduate
1,1 High School,1 High School,1 High School
2,1 High School,1 High School,1 High School


In [8]:
# Voting ensemble: take the row-wise mode of the predictions.
# mode() can return several values per row when votes tie; the first
# result column is the one kept.
dfpred['Ensemble'] = dfpred.mode(axis='columns').iloc[:, 0]
dfpred.head()

Unnamed: 0,treea,treeb,knn,Ensemble
0,2 Undergraduate,3 Graduate,3 Graduate,3 Graduate
1,1 High School,1 High School,1 High School,1 High School
2,1 High School,1 High School,1 High School,1 High School
3,1 High School,1 High School,1 High School,1 High School
4,1 High School,1 High School,1 High School,1 High School


In [9]:
# Row 0 is a case where the classifiers do not all agree
dfpred.loc[0]

treea       2 Undergraduate
treeb            3 Graduate
knn              3 Graduate
Ensemble         3 Graduate
Name: 0, dtype: object

In [10]:
# True label of the first test row, for comparison with the predictions
y_test.iat[0]

'3 Graduate'

In [11]:
# Test accuracy of each individual classifier alongside the ensemble
for col in dfpred.columns:
    print('{:<24} {:.3f}'.format(col, accuracy_score(y_test, dfpred[col])))

treea                    0.879
treeb                    0.874
knn                      0.874
Ensemble                 0.891
