In [49]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

In [50]:
df = pd.read_csv((
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/ESS_practice_data/ESSdata_Thinkful.csv")).dropna()

df.head()

Unnamed: 0,cntry,idno,year,tvtot,ppltrst,pplfair,pplhlp,happy,sclmeet,sclact,gndr,agea,partner
0,CH,5.0,6,3.0,3.0,10.0,5.0,8.0,5.0,4.0,2.0,60.0,1.0
1,CH,25.0,6,6.0,5.0,7.0,5.0,9.0,3.0,2.0,2.0,59.0,1.0
2,CH,26.0,6,1.0,8.0,8.0,8.0,7.0,6.0,3.0,1.0,24.0,2.0
3,CH,28.0,6,4.0,6.0,6.0,7.0,10.0,6.0,2.0,2.0,64.0,1.0
4,CH,29.0,6,5.0,6.0,7.0,5.0,8.0,7.0,2.0,2.0,55.0,1.0


In [51]:
df.columns

Index(['cntry', 'idno', 'year', 'tvtot', 'ppltrst', 'pplfair', 'pplhlp',
       'happy', 'sclmeet', 'sclact', 'gndr', 'agea', 'partner'],
      dtype='object')

### Original Exercise

In [52]:
# Definine outcome and predictors.
# Set our outcome to 0 and 1.
y = df['partner'] - 1
X = df.loc[:, ~df.columns.isin(['partner', 'cntry', 'idno'])]

# Make the categorical variable 'country' into dummies.
X = pd.concat([X, pd.get_dummies(df['cntry'])], axis=1)

# Create training and test sets.
offset = int(X.shape[0] * 0.9)

# Put 90% of the data in the training set.
X_train, y_train = X[:offset], y[:offset]

# And put 10% in the test set.
X_test, y_test = X[offset:], y[offset:]

In [53]:
from sklearn.metrics import classification_report

# We'll make 500 iterations, use 2-deep trees, and set our loss function.
params = {'n_estimators': 500,
          'max_depth': 2,
          'loss': 'deviance'}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

predict_train = clf.predict(X_train)
predict_test = clf.predict(X_test)

# Accuracy tables.
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_test = pd.crosstab(y_test, predict_test, margins=True)

train_tI_errors = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']

print((
    'Training set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}\n\n'
    'Test set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}'
).format(train_tI_errors, train_tII_errors, test_tI_errors, test_tII_errors))

print('\nClassification Report:\n')
print(classification_report(y_test, clf.predict(X_test)))

Training set accuracy:
Percent Type I errors: 0.04650845608292417
Percent Type II errors: 0.17607746863066012

Test set accuracy:
Percent Type I errors: 0.06257668711656442
Percent Type II errors: 0.18527607361963191

Classification Report:

              precision    recall  f1-score   support

         0.0       0.75      0.90      0.82       505
         1.0       0.76      0.51      0.61       310

   micro avg       0.75      0.75      0.75       815
   macro avg       0.75      0.71      0.71       815
weighted avg       0.75      0.75      0.74       815



---

### Create New Features

In [54]:
# Behold the happiness factor:
df['avg_happiness'] = df.groupby('agea').transform('mean')['happy']
df['happiness_factor'] = df.happy - df.avg_happiness

# average of all the sentiment related features
df['avg_sentiment'] = (df['ppltrst'] + df['pplfair'] + df['pplhlp'] + df['happy'] + 
                       df['sclmeet'] + df['sclact']) / 6

# average sentiment in relation to tvtot
df['tv_by_avg_sent'] = df['tvtot']/df['avg_sentiment']

df = pd.concat([df, pd.get_dummies(df['cntry'])], axis=1)

df['partner'] = df['partner'] - 1

df.describe()

Unnamed: 0,idno,year,tvtot,ppltrst,pplfair,pplhlp,happy,sclmeet,sclact,gndr,...,avg_happiness,happiness_factor,avg_sentiment,tv_by_avg_sent,CH,CZ,DE,ES,NO,SE
count,8147.0,8147.0,8147.0,8147.0,8147.0,8147.0,8147.0,8147.0,8147.0,8147.0,...,8147.0,8147.0,8147.0,8147.0,8147.0,8147.0,8147.0,8147.0,8147.0,8147.0
mean,40226.22,6.500307,3.818584,5.574936,6.005155,5.321468,7.708482,5.215908,2.755984,1.496379,...,7.708482,0.0,5.430322,0.762486,0.181048,0.148153,0.003314,0.281331,0.174297,0.211857
std,632072.1,0.500031,2.008937,2.215745,2.120127,2.166217,1.720839,1.438792,0.901406,0.500018,...,0.214502,1.707418,1.118921,0.529882,0.385082,0.355273,0.057476,0.449676,0.379388,0.408649
min,1.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,5.0,-8.121951,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1063.0,6.0,2.0,4.0,5.0,4.0,7.0,4.0,2.0,1.0,...,7.552632,-0.783217,4.666667,0.4,0.0,0.0,0.0,0.0,0.0,0.0
50%,1749.0,7.0,4.0,6.0,6.0,5.0,8.0,6.0,3.0,1.0,...,7.673469,0.29927,5.5,0.685714,0.0,0.0,0.0,0.0,0.0,0.0
75%,2778.0,7.0,5.0,7.0,8.0,7.0,9.0,6.0,3.0,2.0,...,7.865248,1.216783,6.166667,1.034483,0.0,0.0,0.0,1.0,0.0,0.0
max,11001430.0,7.0,7.0,10.0,10.0,10.0,10.0,7.0,5.0,2.0,...,10.0,2.967742,8.666667,14.0,1.0,1.0,1.0,1.0,1.0,1.0


In [55]:
# Definine outcome and predictors.
# Set our outcome to 0 and 1.
X = df.drop(columns=['partner', 'cntry', 'idno'])
y = df['partner']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)

### Original Parameters with New Features

In [56]:
from sklearn.metrics import classification_report

# We'll make 500 iterations, use 2-deep trees, and set our loss function.
params = {'n_estimators': 500,
          'max_depth': 2,
          'loss': 'deviance'}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

predict_train = clf.predict(X_train)
predict_test = clf.predict(X_test)

# Accuracy tables.
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_test = pd.crosstab(y_test, predict_test, margins=True)

train_tI_errors = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']

print((
    'Training set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}\n\n'
    'Test set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}'
).format(train_tI_errors, train_tII_errors, test_tI_errors, test_tII_errors))

print('\nClassification Report:\n')
print(classification_report(y_test, clf.predict(X_test)))

Training set accuracy:
Percent Type I errors: 0.042504219733005985
Percent Type II errors: 0.17124443762467392

Test set accuracy:
Percent Type I errors: 0.05644171779141104
Percent Type II errors: 0.18588957055214725

Classification Report:

              precision    recall  f1-score   support

         0.0       0.75      0.91      0.82      1000
         1.0       0.78      0.52      0.62       630

   micro avg       0.76      0.76      0.76      1630
   macro avg       0.77      0.71      0.72      1630
weighted avg       0.76      0.76      0.74      1630



Adding in the new features did very little to effect the model, the training/test size is a little different then the original and that has effected the numbers in the classification report slightly but generally the new features have not helped.

---

### Try with a Max Depth of 3 Instead

In [57]:
# Definine outcome and predictors.
X = df.drop(columns=['partner', 'cntry', 'idno'])
y = df['partner']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)

In [58]:

# We'll make 500 iterations, use 2-deep trees, and set our loss function.
params = {'n_estimators': 1000,
          'max_depth': 3,
          'loss': 'deviance'}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

predict_train = clf.predict(X_train)
predict_test = clf.predict(X_test)

# Accuracy tables.
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_test = pd.crosstab(y_test, predict_test, margins=True)

train_tI_errors = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']

print((
    'Training set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}\n\n'
    'Test set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}'
).format(train_tI_errors, train_tII_errors, test_tI_errors, test_tII_errors))

print('\nClassification Report:\n')
print(classification_report(y_test, clf.predict(X_test)))

Training set accuracy:
Percent Type I errors: 0.021789166794537365
Percent Type II errors: 0.11094061684824305

Test set accuracy:
Percent Type I errors: 0.08895705521472393
Percent Type II errors: 0.17914110429447852

Classification Report:

              precision    recall  f1-score   support

         0.0       0.75      0.86      0.80      1004
         1.0       0.70      0.53      0.60       626

   micro avg       0.73      0.73      0.73      1630
   macro avg       0.72      0.69      0.70      1630
weighted avg       0.73      0.73      0.72      1630



For this version I moved the estimators up to 1000 and added a max depth of 3 to the decision trees. This lowers the errors on the training set but doesn't seem to affect the test set all that much.

___

In [59]:
# Definine outcome and predictors.
X = df.drop(columns=['partner', 'cntry', 'idno'])
y = df['partner']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)

In [60]:
# We'll make 1000 iterations, use 4-deep trees, and set our loss function.
params = {'n_estimators': 1000,
          'max_depth': 4,
          'loss': 'deviance'}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

predict_train = clf.predict(X_train)
predict_test = clf.predict(X_test)

# Accuracy tables.
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_test = pd.crosstab(y_test, predict_test, margins=True)

train_tI_errors = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']

print((
    'Training set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}\n\n'
    'Test set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}'
).format(train_tI_errors, train_tII_errors, test_tI_errors, test_tII_errors))

print('\nClassification Report:\n')
print(classification_report(y_test, clf.predict(X_test)))

Training set accuracy:
Percent Type I errors: 0.003989565751112475
Percent Type II errors: 0.042657664569587236

Test set accuracy:
Percent Type I errors: 0.08159509202453988
Percent Type II errors: 0.19141104294478528

Classification Report:

              precision    recall  f1-score   support

         0.0       0.72      0.86      0.79       954
         1.0       0.73      0.54      0.62       676

   micro avg       0.73      0.73      0.73      1630
   macro avg       0.73      0.70      0.70      1630
weighted avg       0.73      0.73      0.72      1630



Finally moving the max-depth up to 4 again seems to again consolidate the training set performance but not affect the test set much at all. It seems to be that as the trees get deeper the model overfits more and more. It is a bit unfortunate, as the error rate of the training set with the 4 deep trees is remarkably low