# Data

In [186]:
import os

# TF_CPP_MIN_LOG_LEVEL is consumed by TensorFlow's native logging, so it has to
# be in the environment before `import tensorflow` runs; setting it after the
# import does not reliably silence the start-up messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import tensorflow as tf

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
# NOTE: this does not seed TensorFlow's own random ops.
np.random.seed(1)

In [187]:
# Read the training and test CSV files from the working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('criminal_train.csv', 'criminal_test.csv')
)

# Preview the first rows of the training frame.
train.head()

Unnamed: 0,PERID,IFATHER,NRCH17_2,IRHHSIZ2,IIHHSIZ2,IRKI17_2,IIKI17_2,IRHH65_2,IIHH65_2,PRXRETRY,...,TOOLONG,TROUBUND,PDEN10,COUTYP2,MAIIN102,AIIND102,ANALWT_C,VESTR,VEREP,Criminal
0,25095143,4,2,4,1,3,1,1,1,99,...,1,2,1,1,2,2,3884.805998,40026,1,0
1,13005143,4,1,3,1,2,1,1,1,99,...,2,2,2,3,2,2,1627.108106,40015,2,1
2,67415143,4,1,2,1,2,1,1,1,99,...,2,2,2,3,2,2,4344.95798,40024,1,0
3,70925143,4,0,2,1,1,1,1,1,99,...,2,2,1,1,2,2,792.521931,40027,1,0
4,75235143,1,0,6,1,4,1,1,1,99,...,2,2,2,2,2,2,1518.118526,40001,2,0


In [188]:
# Print the distinct values found in each training column, skipping
# ANALWT_C and PERID.
for col in train.columns:
    if col in ('ANALWT_C', 'PERID'):
        continue
    print(col, "       ", np.unique(train[col]))

IFATHER         [-1  1  2  3  4]
NRCH17_2         [-1  0  1  2  3]
IRHHSIZ2         [-1  1  2  3  4  5  6]
IIHHSIZ2         [-1  1  3]
IRKI17_2         [-1  1  2  3  4]
IIKI17_2         [-1  1  3]
IRHH65_2         [-1  1  2  3]
IIHH65_2         [-1  1  2  3]
PRXRETRY         [-1  2 94 97 98 99]
PRXYDATA         [-1  1  2 94 97 98 99]
MEDICARE         [-1  1  2 85 94 97 98]
CAIDCHIP         [-1  1  2 85 94 97 98]
CHAMPUS         [-1  1  2 85 94 97 98]
PRVHLTIN         [-1  1  2 85 94 97 98]
GRPHLTIN         [-1  1  2 85 94 97 98 99]
HLTINNOS         [-1  1  2 94 97 98 99]
HLCNOTYR         [-1  1  2 85 94 97 98 99]
HLCNOTMO         [-1  1  2  3  4  5  6  7  8  9 10 11 12 85 94 97 98 99]
HLCLAST         [-1  1  2  3  4  5 94 97 98 99]
HLLOSRSN         [-1  1  2  3  4  5  6  7  8  9 10 11 12 85 94 97 98 99]
HLNVCOST         [-1  1  6 94 97 98 99]
HLNVOFFR         [-1  1  6 94 97 98 99]
HLNVREF         [-1  1  6 94 97 98 99]
HLNVNEED         [-1  1  6 94 97 98 99]
HLNVSOR         [-1  1  6 

In [189]:
# Report the number of rows in each frame.
print("Training samples =", len(train))
print("Testing samples =", len(test))

Training samples = 45718
Testing samples = 11430


In [190]:
# Separate the target from the features: keep the 'Criminal' column as
# `y_train` and remove it from `train` in place.
y_train = train['Criminal']
del train['Criminal']

In [191]:
# Class balance of the target: count the rows labelled 1 and 0 and print
# each count together with its share of the two combined.
c1 = int((y_train == 1).sum())
c2 = int((y_train == 0).sum())
n_labelled = c1 + c2
print("Criminal=", c1, "    ,Percentage=", c1 / n_labelled)
print("NotCriminal=", c2, ",Percentage=", c2 / n_labelled)

Criminal= 3175     ,Percentage= 0.06944748239205564
NotCriminal= 42543 ,Percentage= 0.9305525176079443


# Preprocessing

In [192]:
# Remember what is needed later: the number of training rows (to split the
# combined frame again) and the test PERID column (for the submission).
train_rows = len(train)
test_id = test['PERID']

# Stack the training rows on top of the test rows and discard PERID from the
# combined frame.
data = pd.concat([train, test]).drop('PERID', axis=1)
data.shape

(57148, 70)

In [193]:
# Treat the value -1 as missing everywhere in the combined frame.
data = data.replace(-1, np.nan)

# Number of missing entries per column.
data.isnull().sum()

IFATHER        2
NRCH17_2      92
IRHHSIZ2       2
IIHHSIZ2       2
IRKI17_2       2
IIKI17_2       2
IRHH65_2       2
IIHH65_2       2
PRXRETRY       2
PRXYDATA       2
MEDICARE       2
CAIDCHIP       2
CHAMPUS        2
PRVHLTIN       2
GRPHLTIN       2
HLTINNOS       2
HLCNOTYR       2
HLCNOTMO       2
HLCLAST        2
HLLOSRSN       2
HLNVCOST       2
HLNVOFFR       2
HLNVREF        2
HLNVNEED       2
HLNVSOR        2
IRMCDCHP       2
IIMCDCHP       2
IRMEDICR       2
IIMEDICR       2
IRCHMPUS       2
            ... 
OTHINS         2
CELLNOTCL      2
CELLWRKNG      2
IRFAMSOC       2
IIFAMSOC       2
IRFAMSSI       2
IIFAMSSI       2
IRFSTAMP       2
IIFSTAMP       2
IRFAMPMT       2
IIFAMPMT       2
IRFAMSVC       2
IIFAMSVC       2
IRWELMOS       2
IIWELMOS       2
IRPINC3        2
IRFAMIN3       2
IIPINC3        2
IIFAMIN3       2
GOVTPROG       2
POVERTY3     419
TOOLONG        2
TROUBUND       2
PDEN10         2
COUTYP2        2
MAIIN102       2
AIIND102       2
ANALWT_C      

In [194]:
# Fill every missing entry with the most frequent value of its column.
#
# This replaces `sklearn.preprocessing.Imputer`, which newer scikit-learn
# releases no longer ship, and avoids re-fitting an imputer once per column.
# `DataFrame.mode()` lists each column's modes in ascending order, so row 0
# holds the smallest mode -- the same tie-break the 'most_frequent' strategy
# uses.
column_modes = data.mode().iloc[0]

# Cast to float64 so the column dtypes match what the imputer used to return.
data = data.fillna(column_modes).astype(np.float64)

In [195]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale ANALWT_C. The scaled values are parked in `tmp` and the raw
# column is removed from the frame.
sc = MinMaxScaler()
sc.fit(data[['ANALWT_C']])
tmp = sc.transform(data[['ANALWT_C']])
data = data.drop(['ANALWT_C'], axis=1)

In [196]:
# One-hot encode every column currently in the frame: each distinct value of
# a column becomes its own indicator column.
data = pd.get_dummies(data, columns=list(data.columns))

In [197]:
# Append the scaled ANALWT_C values (flattened to one dimension) as a column.
data['ANALWT_C'] = np.ravel(tmp)

In [198]:
# Undo the earlier concatenation by position: the first `train_rows` rows are
# the training samples, everything after them is the test set.
train, test = data.iloc[:train_rows], data.iloc[train_rows:]

# The combined frame is not needed any more.
del data

In [199]:
# Display the (rows, columns) shape of the encoded training frame.
train.shape

(45718, 333)

In [200]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the training rows for validation, stratified on the target
# so both parts keep the same class proportions.
# `random_state` is pinned so the split is the same on every run instead of
# depending on how much of NumPy's global random stream was consumed before
# this cell executed.
X_train, X_val, y_train, y_val = train_test_split(
    train,
    y_train,
    stratify=y_train,
    test_size=0.3,
    random_state=1,
)

In [201]:
def data_transform(data, labels, num_classes=2):
    """Convert features to float32 and integer labels to one-hot float32 rows.

    Parameters
    ----------
    data : array-like
        Feature matrix; returned as a float32 NumPy array.
    labels : array-like of int, or None
        Class ids in ``range(num_classes)``. A pandas Series, NumPy array or
        plain list is accepted; the Series index is ignored. Pass ``None``
        when there are no labels.
    num_classes : int, optional
        Width of the one-hot encoding (default 2).

    Returns
    -------
    tuple
        ``(data, labels)`` where ``labels`` is a float32 array of shape
        ``(n_samples, num_classes)``, or ``None`` if no labels were given.
    """
    data = np.asarray(data).astype(np.float32)
    if labels is not None:
        # Work on a plain 1-D ndarray: indexing a pandas Series with
        # `[:, None]` is rejected by current pandas versions.
        label_ids = np.asarray(labels).reshape(-1)
        # Broadcasting compares every label against each class id, giving one
        # boolean row per sample with a single True at the label's position.
        labels = (np.arange(num_classes) == label_ids[:, None]).astype(np.float32)
    return data, labels

# Convert each split to float32 arrays with one-hot labels; no labels are
# passed for the test set, so the second return value is discarded.
X_train, y_train = data_transform(data=X_train.values, labels=y_train)
X_val, y_val = data_transform(data=X_val.values, labels=y_val)
X_test, _ = data_transform(data=test.values, labels=None)

In [216]:
# Show the array shapes of every split.
print("Training dataset dimensions=", X_train.shape,
      "\tTraining labels=", y_train.shape)
print("Validation dataset dimensions=", X_val.shape,
      "\tValidation labels=", y_val.shape)
print("Testing Dataset dimensions=", X_test.shape)

Training dataset dimensions= (32002, 333) 	Training labels= (32002, 2)
Validation dataset dimensions= (13716, 333) 	Validation labels= (13716, 2)
Testing Dataset dimensions= (11430, 333)


In [251]:
# Start from an empty default graph so re-running this cell does not keep
# adding duplicate ops and variables to the previous graph.
tf.reset_default_graph()

# Graph-level seed so the random weight initialisation below is repeatable.
# It has to be set after the reset because the seed belongs to the current
# default graph; NumPy's seed does not affect TensorFlow's random ops.
tf.set_random_seed(1)

# Layer widths: J input features, hidden layers of K, L, M and N units, and
# two output units. J is read from the prepared feature matrix instead of
# being hard-coded, so it always matches the encoded column count.
J, K, L, M, N = X_train.shape[1], 150, 75, 30, 10

# input
X = tf.placeholder(tf.float32, [None, J])

# Weights start as small truncated-normal values, biases as zeros.
w1 = tf.Variable(tf.truncated_normal([J, K], stddev=0.1))
b1 = tf.Variable(tf.zeros([K]))

w2 = tf.Variable(tf.truncated_normal([K, L], stddev=0.1))
b2 = tf.Variable(tf.zeros([L]))

w3 = tf.Variable(tf.truncated_normal([L, M], stddev=0.1))
b3 = tf.Variable(tf.zeros([M]))

w4 = tf.Variable(tf.truncated_normal([M, N], stddev=0.1))
b4 = tf.Variable(tf.zeros([N]))

w5 = tf.Variable(tf.truncated_normal([N, 2], stddev=0.1))
b5 = tf.Variable(tf.zeros([2]))

In [252]:
# Four fully connected layers with ReLU activations, each feeding the next.
y1 = tf.nn.relu(tf.add(tf.matmul(X, w1), b1))
y2 = tf.nn.relu(tf.add(tf.matmul(y1, w2), b2))
y3 = tf.nn.relu(tf.add(tf.matmul(y2, w3), b3))
y4 = tf.nn.relu(tf.add(tf.matmul(y3, w4), b4))

# Output layer: a plain affine map with no activation applied.
Y = tf.add(tf.matmul(y4, w5), b5)

In [253]:
# Placeholder for the target rows: two float columns, any number of rows.
Y_true = tf.placeholder(dtype=tf.float32, shape=[None, 2])

In [254]:
# Softmax cross-entropy per sample, computed from the network outputs Y, and
# its mean over the fed rows.
loss = tf.nn.softmax_cross_entropy_with_logits(labels=Y_true, logits=Y)
mean_loss = tf.reduce_mean(loss)

# A sample counts as correct when the largest output and the largest target
# entry sit at the same column index; accuracy is the fraction of such rows.
is_correct = tf.equal(tf.argmax(Y, axis=1), tf.argmax(Y_true, axis=1))
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

In [255]:
# Step size for the optimizer.
learning_rate = 0.01

# Integer counter variable, excluded from the trainable variables.
global_step = tf.Variable(initial_value=0, trainable=False)

In [256]:
# Adam update op that minimises the mean loss; `global_step` is handed to
# `minimize` alongside it.
optimize = tf.train.AdamOptimizer(learning_rate).minimize(
    loss=mean_loss,
    global_step=global_step,
)

In [None]:
# Build the variable-initialisation op, open an interactive session and run
# the initialiser in it.
initializer = tf.global_variables_initializer()
sess = tf.InteractiveSession()
sess.run(initializer)

In [None]:
batch_size = 50
n_samples = X_train.shape[0]

# Ceiling division: a final, shorter batch is counted too. The previous floor
# division silently left the trailing `n_samples % batch_size` rows out of
# every epoch.
batch_number = (n_samples + batch_size - 1) // batch_size

for epoch_counter in range(10):
    curr_epoch_loss = 0.0

    # training the network on batches
    for start in range(0, n_samples, batch_size):
        # Slicing clips at the end of the array, so the last batch may be
        # shorter than `batch_size`.
        end = start + batch_size
        batch_x = X_train[start:end]
        batch_y = y_train[start:end]

        train_data = {X: batch_x, Y_true: batch_y}
        _, batch_loss = sess.run([optimize, mean_loss], feed_dict=train_data)

        # Weight each batch mean by its row count so a short final batch does
        # not skew the epoch average.
        curr_epoch_loss += batch_loss * len(batch_x)

    # Mean training loss per sample over the whole epoch.
    curr_epoch_loss /= n_samples

    # Evaluate on the full validation set once per epoch (no update op is run).
    val_data = {X: X_val, Y_true: y_val}
    val_loss, val_accuracy = sess.run([mean_loss, accuracy], feed_dict=val_data)

    print ("Epoch %d: Train Loss=%0.4f Val Loss=%0.4f Val Acc=%0.4f eta=%0.6f global_step=%d"
          % (epoch_counter+1,
             curr_epoch_loss,
             val_loss,
             val_accuracy,
             learning_rate,
             global_step.eval(session=sess)))

In [None]:
# Predicted class for each test row: the column index of the largest softmax
# output, evaluated in the open session.
predict = tf.argmax(tf.nn.softmax(Y), axis=1)
predictions = sess.run(predict, feed_dict={X: X_test})

In [None]:
# Build the submission table with PERID as the first column and the predicted
# label as the second, then preview it.
submission = pd.DataFrame(
    {'PERID': test_id, 'Criminal': predictions},
    columns=['PERID', 'Criminal'],
)
submission.head()

In [None]:
# Write the submission table to a CSV file, leaving out the index column.
submission.to_csv(path_or_buf='predictions_nn.csv', index=False)