In [3]:
# Imports for data-preprocessing
import pandas as pd
import numpy as np
from __future__ import print_function

# Import for spliting the data set
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

# Imports for classificaiton 
from sklearn.metrics import classification_report


In [4]:
# Import data
df = pd.read_csv('../data/master.csv')

In [5]:
bins = [17, 34, 49, 64, 90]

group_names = ['17-34', '35-49', '50-64', '65+']

age_groups = pd.cut(df.age, bins, labels=group_names)
df['age_groups'] = age_groups

In [6]:
# Delete unwanted variables
del df['workclass']
del df['education']
del df['education_num']
del df['marital_status']
del df['occupation']
del df['native_country'] 
del df['Unnamed: 0']

# Evaluation Metric Selection

Our team has used the sklearns.metrics.classification_report function in most cases to return an array of evaluation metrics. We expected initally that accuracy would be the most important for us. This is due to the nature of our prediction, which is guessing the age group and general income of an individual based on census data. The cost associated with incorrect prediction is low, so we can afford to build a model that focuses on accuracy. When we have a model that has the highest out of sample accuracy, it is theoretically leveraging generalized laws hidden within the data for the period trained. This value of this valuation type is determined by possible uses.

Our prediction algorthims, which attempt to predict the age group of an individual or if an individual's income is greater or less than $50,000,  are primarly applicable to macroeconomic policy. While the abstract models are often well defined by economists, data collection is expensive. Even more so, pieccing together missing chunks is best done by extending pre-existing knowledge to fill the gaps. Prediction models like these have the primary goal of expanding a data set outwards so that it represents the reality. Accuracy is the most relevant here, with incorrect guesses  not being costly at all.

# Standard Scaler -> Stratified Shuffle Split

In [7]:
# Split the data into traning (80%) and test set (20%)
# We are using stratified cross validation here because the majority of the
#    individuals in the variable race are white

if 'income_binary' in df:
    y = df['income_binary'].values #get values we need 
    del df['income_binary']        #get rid of the class label
    X = df.values                  #use everything else to predict 
    
X = pd.get_dummies(df).values

scl = StandardScaler()
X = scl.fit_transform(X)
# Split the data into 20% Test and 80% Train
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.20, random_state=111)
sss.get_n_splits(X, y) #retreving the splits 



10

In [8]:
# Create a for loop that grabs the values for each fold for traing and test sets
for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [15163 16279 29246 ..., 21159 41049 17832] TEST: [43578  1917 17027 ..., 22163 39121  5217]
TRAIN: [12894 40784 33210 ..., 40386 25846 36294] TEST: [22078 13800    49 ..., 31283 31586  5940]
TRAIN: [16738 39693 30388 ...,   328 33912 39362] TEST: [24310 48705 25069 ..., 47258 14625 39292]
TRAIN: [ 7391 39777 43398 ...,  8978 24399 34458] TEST: [ 8836  1328 27156 ..., 47164 10476 15648]
TRAIN: [16863 33361 41054 ..., 26744 47828 11941] TEST: [18495 35842 20752 ..., 46535  4696 46808]
TRAIN: [ 5743 21257 30549 ...,  5927  7506 19162] TEST: [21737 30911  7484 ..., 19717 27662 19780]
TRAIN: [18797 40559 21393 ..., 47376 19268 42562] TEST: [13861 34766  4320 ..., 42723 30153 11994]
TRAIN: [36977 11147 24500 ..., 40130 15262 22626] TEST: [13974 35810 43678 ...,  7735 14376 40480]
TRAIN: [32752 37107 19197 ..., 26015 32870  7076] TEST: [36370 13966 29812 ..., 20543 23045 34324]
TRAIN: [22441 41471 48039 ..., 32784  4446 45905] TEST: [  934  1627 37667 ...,  4950 44845 21169]


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics as mt

# criterion : 'gini' > 'entropy'
# max_features : 0 -> 41 (default is fine)
# bootstrap : keep True
# max_depth : not material

RFC = RandomForestClassifier()

iteration = 1
scores = []
for train_indices, test_indices in sss.split(X,y): 
    X_train = X[train_indices]  # train indices for X
    y_train = y[train_indices]  # train indices for y
    
    X_test = X[test_indices]    # test indices for X
    y_test = y[test_indices]    # test indices for y
    
    # train the reusable KNN classifier on the training data
    RFC.fit(X_train,y_train)  # train object
    y_hat = RFC.predict(X_test) # get test set precitions
    
    scores.append(mt.accuracy_score(y_hat,y_test))
    print(mt.accuracy_score(y_hat,y_test))
scores

0.836933155901
0.841641928549
0.838263895998
0.838059166752
0.838878083734
0.838059166752
0.837956802129
0.839492271471
0.838775719111
0.839185177603


[0.83693315590132056,
 0.84164192854949327,
 0.83826389599754325,
 0.83805916675197056,
 0.83887808373426143,
 0.83805916675197056,
 0.83795680212918411,
 0.8394922714709796,
 0.83877571911147508,
 0.83918517760262057]

In [10]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB

#Gaussian : .76
#Bernoulli : .74
BNB = BernoulliNB()

iteration = 1

for train_indices, test_indices in sss.split(X,y): 
    X_train = X[train_indices]  # train indices for X
    y_train = y[train_indices]  # train indices for y
    
    X_test = X[test_indices]    # test indices for X
    y_test = y[test_indices]    # test indices for y
    
    # train the reusable KNN classifier on the training data
    BNB.fit(X_train,y_train)  # train object
    y_hat = BNB.predict(X_test) # get test set precitions
    


In [11]:
#trains slow and not accurate!
from sklearn.svm import SVC

SVC = SVC()

iteration = 1
scores = []
for train_indices, test_indices in sss.split(X,y): 
    X_train = X[train_indices]  # train indices for X
    y_train = y[train_indices]  # train indices for y
    
    X_test = X[test_indices]    # test indices for X
    y_test = y[test_indices]    # test indices for y
    
    # train the reusable KNN classifier on the training data
    SVC.fit(X_train,y_train)  # train object
    y_hat = SVC.predict(X_test) # get test set precitions
    
    scores.append(mt.accuracy_score(y_hat,y_test))

np.mean(scores)

0.84491759647865694

### Adjust Parameters 

We have seen above different algorithms perform with default hyperparameters.  

In [None]:
from sklearn.ensemble import GradientBoostingClassifier


# criterion : 'gini' > 'entropy'
# max_features : 0 -> 41 (default is fine)
# bootstrap : keep True
# max_depth : not material

GBC = GradientBoostingClassifier()

iteration = 1
scores = []
for train_indices, test_indices in sss.split(X,y): 
    X_train = X[train_indices]  # train indices for X
    y_train = y[train_indices]  # train indices for y
    
    X_test = X[test_indices]    # test indices for X
    y_test = y[test_indices]    # test indices for y
    
    # train the reusable KNN classifier on the training data
    GBC.fit(X_train,y_train)  # train object
    y_hat = GBC.predict(X_test) # get test set precitions
    
    scores.append(mt.accuracy_score(y_hat,y_test))

np.mean(scores)


In [None]:
from sklearn.grid_search import GridSearchCV

In [None]:
clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, scoring=score)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
params = {'learning_rate':[.25,.5,1,2]}

abc = GridSearchCV(AdaBoostClassifier(),params, cv=5 )

## Tensorflow Auto Encoder

In [None]:
import tensorflow as tf

In [None]:
#our data is already in a relatively useful shape
print(X_train.shape)
print(y_train.shape)

In [None]:
# Training Parameters
learning_rate = 0.01
num_steps = 30000
batch_size = 256

display_step = 1000
examples_to_show = 10

# Network Parameters
num_hidden_1 = 256 # 1st layer num features
num_hidden_2 = 128 # 2nd layer num features (the latent dim)
num_input = 45 # data shape [1,45]

# tf Graph input (only pictures)
X = tf.placeholder("float", [None, num_input])

weights = {
    'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1])),
    'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2])),
    'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1])),
    'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input])),
}
biases = {
    'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1])),
    'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2])),
    'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1])),
    'decoder_b2': tf.Variable(tf.random_normal([num_input])),
}

In [None]:
# Building the encoder
def encoder(x):
    # Encoder Hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']),
                                   biases['encoder_b1']))
    # Encoder Hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']),
                                   biases['encoder_b2']))
    return layer_2


# Building the decoder
def decoder(x):
    # Decoder Hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']),
                                   biases['decoder_b1']))
    # Decoder Hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']),
                                   biases['decoder_b2']))
    return layer_2

# Construct model
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

# Prediction
y_pred = decoder_op
# Targets (Labels) are the input data.
y_true = X

# Define loss and optimizer, minimize the squared error
loss = tf.reduce_mean(tf.pow(y_true - y_pred, 2))
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

In [None]:

# Start Training
# Start a new TF session
sess = tf.Session()

# Run the initializer
sess.run(init)

# Training
for i in range(1, num_steps+1):
    # Prepare Data
    # Get the next batch of MNIST data (only images are needed, not labels)
    batch_x, _ = tf.train.batch(X_train,45)

    # Run optimization op (backprop) and cost op (to get loss value)
    _, l = sess.run([optimizer, loss], feed_dict={X: batch_x})
    # Display logs per step
    if i % display_step == 0 or i == 1:
        print('Step %i: Minibatch Loss: %f' % (i, l))