In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sys

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.decomposition import PCA

In [2]:
def preprocess_data(df,use_pca=True):
    """
    Takes a dataframe, and applies PCA decomposition to categorical columns
    Adds those columns to the end, then  removes the original col
    
    args:
    df: pandas data frame of data
    pca: dtype-bool: determines whether or not to apply pca
    """
    # one hot encode categorical fields
    dummy_fields = ['X0','X1','X2','X3','X4','X5','X6','X8']
    pca = PCA(n_components=1)
    
    # grab each dummy field, one hot encode, then PCA to one dimension
    for each in dummy_fields:
        dummies = pd.get_dummies(df[each], prefix=each, drop_first=False)
        if use_pca:
            dummies = pca.fit_transform(dummies)
            df[each+'_PCA'] = dummies
        else:
            df = pd.concat([df, dummies], axis=1)
        
    # drop one hot encoded cols
    drop_list = ['X0','X1','X2','X3','X4','X5','X6','X8']

    # training data that will be fed to neural net
    processed_df = df.drop(drop_list, axis=1)
    
    return processed_df

In [3]:
train_file = 'train.csv'
test_file  = 'test.csv'

In [4]:
all_test_data = pd.read_csv(test_file)
all_train_data = pd.read_csv(train_file)
all_train_data.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# targets are a list of y
all_train_targets = []
for index,row in all_train_data.iterrows():
    all_train_targets.append(row['y'])
    
# drop 'y' and 'ID' row for preprocess function that works with test and 
all_train_data = all_train_data.drop(['y','ID'],axis=1)

In [6]:
#Grab labels of test-set for submitting
test_labels = []
for index,row in all_test_data.iterrows():
    test_labels.append(row['ID'])
    
all_test_data = all_test_data.drop(['ID'],axis=1)

In [7]:
# category differences noted for later, not an issue now due to PCA
train_cols = set(preprocess_data(all_train_data,use_pca=False))
test_cols  = set(preprocess_data(all_test_data,use_pca=False))
print('num train columns: {}\nnum test columns: {}'.format(len(train_cols),len(test_cols)))
col_difference = test_cols-train_cols
print('differences:{}'.format(col_difference))

num train columns: 563
num test columns: 569
differences:{'X2_ad', 'X5_a', 'X2_ax', 'X0_ag', 'X0_p', 'X0_ae', 'X2_aj', 'X2_u', 'X5_t', 'X0_av', 'X2_ab', 'X0_bb', 'X5_b', 'X5_z', 'X0_an', 'X2_w'}


In [8]:
feed_train_data = preprocess_data(all_train_data)
feed_test_data  = preprocess_data(all_test_data)

In [9]:
feed_train_data.head()

Unnamed: 0,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,...,X384,X385,X0_PCA,X1_PCA,X2_PCA,X3_PCA,X4_PCA,X5_PCA,X6_PCA,X8_PCA
0,0,0,0,1,0,0,0,0,1,0,...,0,0,-0.009846,-0.147453,-0.301773,0.292244,-0.001112,-0.004538,-0.706161,-0.021246
1,0,0,0,0,0,0,0,0,1,0,...,0,0,-0.009846,-0.072187,-0.301235,0.229293,-0.001112,-0.004538,-0.001886,-0.021246
2,0,0,0,0,0,0,0,1,0,0,...,0,0,-0.018615,-0.0743,-0.34169,-0.598021,-0.001112,-0.004559,-0.706161,-0.024525
3,0,0,0,0,0,0,0,0,0,0,...,0,0,-0.018615,-0.072187,-0.34169,0.790623,-0.001112,-0.004559,-0.001886,-0.070741
4,0,0,0,0,0,0,0,0,0,0,...,0,0,-0.018615,-0.147453,-0.34169,0.790623,-0.001112,-0.004538,-0.002553,-0.125958


In [10]:
feed_test_data.head()

Unnamed: 0,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,...,X384,X385,X0_PCA,X1_PCA,X2_PCA,X3_PCA,X4_PCA,X5_PCA,X6_PCA,X8_PCA
0,0,0,0,0,0,0,0,0,0,0,...,0,0,-0.061028,-0.151594,-0.332234,0.77936,-0.00174,-0.005661,-0.02968,-0.05501
1,0,0,0,0,0,0,0,0,0,1,...,0,0,-0.126258,-0.293767,-0.506215,0.289201,-0.00174,-0.005661,0.725562,-0.020338
2,0,0,0,0,1,0,0,0,0,0,...,0,0,-0.061028,-0.151594,0.637086,0.77936,-0.00174,-0.005661,-0.684456,-0.229076
3,0,0,0,0,0,0,0,0,0,0,...,0,0,-0.061028,-0.299026,-0.332234,0.77936,-0.00174,-0.005661,-0.044218,-0.109151
4,0,0,0,0,1,0,0,0,0,0,...,0,0,-0.070907,-0.304476,0.637086,-0.608285,-0.00174,-0.005661,-0.045588,-0.004985


In [11]:
# Splitting into 80/20 Train and validation sets
train_features, val_features, train_targets, val_targets = train_test_split(np.asarray(feed_train_data), all_train_targets, test_size=0.2)

In [12]:
print(feed_train_data.shape[0])
print(train_features.shape)
print(val_features.shape)
print(np.mean(train_targets))

4209
(3367, 376)
(842, 376)
100.780608851


In [13]:
def model_inputs(num_features):
    """
    Creates tensor placeholders for inputs, labels, and learning_rate
    
    args:
    num_features:dtype-int, number of features being used for inputs
    
    returns: tuple(inputs,labels,learning_rate,is_training)
    """
    # placeholders for feed_dict
    inputs = tf.placeholder(tf.float32,(None,num_features), name='inputs')
    targets = tf.placeholder(tf.float32, name='labels')
    learning_rate = tf.placeholder(tf.float32, (None), name='learning_rate')
    is_training = tf.placeholder(tf.bool,name='train_flag')
    
    return inputs,targets,learning_rate,is_training

In [14]:
def create_model(inputs, num_units, activation, is_train):
    """
    Creates a dense network with(currently) 3 hidden layers
    
    args:
    inputs: dtype-tensor of our input data
    num_units: dtype-int, number of hidden units
    activation: dtype-string, the activation function to be used
    is_train: dtype-tensor, bool flag for training, used with batch_norm and dropout
    returns: raw output of last layer
    """
    drop_rate = .5
    
    fc1 = tf.layers.dense(inputs,num_units,activation=tf.nn.relu,name='fc1')
    bn1 = tf.layers.batch_normalization(fc1,training=is_train)
    fc1_act = tf.nn.relu(bn1)
    fc1_out = tf.layers.dropout(fc1_act,rate=drop_rate,training=is_train)
    
    fc2 = tf.layers.dense(fc1_out, num_units,activation=activation,name='fc2')
    bn2 = tf.layers.batch_normalization(fc2,training=is_train)
    fc2_act = activation(bn2)
    fc2_out = tf.layers.dropout(fc2_act,rate=drop_rate,training=is_train)
    
    fc3 = tf.layers.dense(fc2_out, num_units//2,activation=activation,name='fc3')
    bn3 = tf.layers.batch_normalization(fc3,training=is_train)
    fc3_act = activation(bn3)
    fc3_out = tf.layers.dropout(fc3_act,rate=drop_rate,training=is_train)
    
    fc4 = tf.layers.dense(fc3_out,num_units//4,activation=activation,name='fc4')
    
    outputs = tf.layers.dense(fc4,1,name='outputs')
    return outputs

In [15]:
def model_loss_opt(outputs,targets,learning_rate):
    """
    Creates the loss and optimizer for network
    
    args, dtype-all tensors:
    outputs: tensor of outputs from neural net
    targets:  tensor of targets
    learning_rate: tensor for learning rate
    
    returns: tuple of (loss,optimizer)
    """
    #commented out loss of R2 as it makes model tend toward negative infinity
    #mean_of_targets = tf.reduce_mean(targets)
    #total_sum_of_squares = tf.reduce_sum(tf.squared_difference(targets,mean_of_targets))
    #res_sum_of_squares   = tf.reduce_sum(tf.squared_difference(targets,outputs))
    #loss = tf.subtract(1.0,tf.divide(res_sum_of_squares,total_sum_of_squares))
    loss = tf.losses.mean_squared_error(targets,outputs)
    opt  = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    
    return loss,opt

In [16]:
#Hyperparameters
batch_size    = 512
num_features  = feed_train_data.shape[1]
num_units     = 64
l_rate        = 0.005
epochs        = 150
activation    = tf.nn.relu

In [17]:
# Initialize network with above helper functions and hyperparameters
tf.reset_default_graph()

inputs,targets,learning_rate,is_training = model_inputs(num_features)
model = create_model(inputs,num_units,activation,is_training)
loss,opt = model_loss_opt(model,targets,learning_rate)

In [18]:
data_length = feed_train_data.shape[0]
train_loss = []
val_loss   = []
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for e in range(epochs):
    # shuffle data for each epoch
    train_features,train_targets = shuffle(train_features,train_targets) 

    # indicies for batches
    start_i = 0
    end_i   = batch_size

    #Run through data
    for batch_round in range(data_length//batch_size):
        batch_feat,batch_targets = train_features[start_i:end_i],train_targets[start_i:end_i]
        start_i = end_i
        end_i  += batch_size

        _ = sess.run(opt,feed_dict={inputs:batch_feat,
                                    targets:batch_targets,
                                    learning_rate:l_rate,
                                    is_training:True})

    # Collect loss for training and validation at end of each epoch
    t_loss = sess.run(loss,feed_dict={inputs:feed_train_data,
                                      targets:train_targets,
                                      is_training:False})
    v_loss = sess.run(loss,feed_dict={inputs:val_features,
                                      targets:val_targets,
                                      is_training:False})

    # Print current progress
    sys.stdout.write("\rProgress: " + str(100 * e/float(epochs))[:4] \
                 + "% ... Training loss: " + str(t_loss)[:5] \
                 + " ... Validation loss: " + str(v_loss)[:5])

    # save losses for later study
    train_loss.append(t_loss)
    val_loss.append(v_loss)

Progress: 99.3% ... Training loss: 162.4 ... Validation loss: 153.7

In [19]:
results = [['ID','y']]

predictions = sess.run(model, feed_dict={inputs:feed_test_data,is_training:False}).tolist()
    
for _id,y in zip(test_labels,predictions):
    results.append([_id]+y)

In [20]:
results[:10]

[['ID', 'y'],
 [1, 100.76374053955078],
 [2, 100.76374053955078],
 [3, 100.76374053955078],
 [4, 100.76374053955078],
 [5, 100.76374053955078],
 [8, 100.76374053955078],
 [10, 100.76374053955078],
 [11, 100.76374053955078],
 [12, 100.76374053955078]]