In [0]:
import pandas as pd
import tensorflow as tf

In [0]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? 
Nothing done.


In [0]:
df = pd.read_csv("processed_train.csv", 
                       names=["ip", "app", "device", "os", "channel", "click_time", "attributed_time", "is_attributed"], 
                      parse_dates=[5]
)

In [0]:
df['day'] = df['click_time'].dt.day.astype('uint8')
df['hour'] = df['click_time'].dt.hour.astype('uint8')
df['minute'] = df['click_time'].dt.minute.astype('uint8')
df['second'] = df['click_time'].dt.second.astype('uint8')

In [0]:
ATTRIBUTION_CATEGORIES = [        
    # V1 Features #
    ###############
    ['ip'], ['app'], ['device'], ['os'], ['channel'],
    
    # V2 Features #
    ###############
    ['app', 'channel'],
    ['app', 'os'],
    ['app', 'device'],
    
    # V3 Features #
    ###############
    ['channel', 'os'],
    ['channel', 'device'],
    ['os', 'device']
]

In [0]:
import numpy as np
freqs = {}
for cols in ATTRIBUTION_CATEGORIES:
    
    # New feature name
    new_feature = '_'.join(cols)+'_confRate'    
    
    # Perform the groupby
    group_object = df.groupby(cols)
    
    # Group sizes    
    group_sizes = group_object.size()
    
    log_group = np.log(100000) # 1000 views -> 60% confidence, 100 views -> 40% confidence 
    print(">> Calculating confidence-weighted rate for: {}.\n   Saving to: {}. Group Max /Mean / Median / Min: {} / {} / {} / {}".format(
        cols, new_feature, 
        group_sizes.max(), 
        np.round(group_sizes.mean(), 2),
        np.round(group_sizes.median(), 2),
        group_sizes.min()
    ))
    
    # Aggregation function
    def rate_calculation(x):
        """Calculate the attributed rate. Scale by confidence"""
        rate = x.sum() / float(x.count())
        conf = np.min([1, np.log(x.count()) / log_group])
        return rate * conf
    
    # Perform the merge
    df = df.merge(
        group_object['is_attributed']. \
            apply(rate_calculation). \
            reset_index(). \
            rename( 
                index=str,
                columns={'is_attributed': new_feature}
            )[cols + [new_feature]],
        on=cols, how='left'
    )

>> Calculating confidence-weighted rate for: ['ip'].
   Saving to: ip_confRate. Group Max /Mean / Median / Min: 6693 / 12.28 / 4.0 / 1
>> Calculating confidence-weighted rate for: ['app'].
   Saving to: app_confRate. Group Max /Mean / Median / Min: 183486 / 3558.72 / 8.0 / 1
>> Calculating confidence-weighted rate for: ['device'].
   Saving to: device_confRate. Group Max /Mean / Median / Min: 942816 / 2409.64 / 1.0 / 1
>> Calculating confidence-weighted rate for: ['os'].
   Saving to: os_confRate. Group Max /Mean / Median / Min: 238974 / 4926.11 / 16.0 / 1
>> Calculating confidence-weighted rate for: ['channel'].
   Saving to: channel_confRate. Group Max /Mean / Median / Min: 81188 / 5847.95 / 1609.0 / 1
>> Calculating confidence-weighted rate for: ['app', 'channel'].
   Saving to: app_channel_confRate. Group Max /Mean / Median / Min: 78159 / 1526.72 / 25.0 / 1
>> Calculating confidence-weighted rate for: ['app', 'os'].
   Saving to: app_os_confRate. Group Max /Mean / Median / Min: 445

In [0]:
df.loc[df['is_attributed'] == 0, 'Normal'] = 1
df.loc[df['is_attributed'] == 1, 'Normal'] = 0

In [0]:
df = df.drop(columns = ['click_time', 'attributed_time'])

In [0]:
fraud = df[df['is_attributed'] == 1]

In [0]:
normal = df[df['is_attributed'] == 0]

In [0]:
X_train = fraud.sample(frac=0.8)

In [0]:
count_Frauds = len(X_train)

In [0]:
X_train = pd.concat([X_train, normal.sample(frac = 0.8)], axis = 0)

In [0]:
X_test = df.loc[~df.index.isin(X_train.index)]

In [0]:
from sklearn.utils import shuffle

In [0]:
X_train = shuffle(X_train)
X_test = shuffle(X_test)


In [0]:
y_train = X_train["is_attributed"]


In [0]:
y_train = pd.concat([y_train, X_train["Normal"]], axis=1)

In [0]:
y_test = X_test["is_attributed"]

In [0]:
y_test = pd.concat([y_test, X_test.Normal], axis=1)

In [0]:
ratio = len(X_train)/count_Frauds

In [0]:
ratio

407.955124936257

In [0]:
X_train.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour,minute,second,ip_confRate,app_confRate,device_confRate,os_confRate,channel_confRate,app_channel_confRate,app_os_confRate,app_device_confRate,channel_os_confRate,channel_device_confRate,os_device_confRate,Normal
144109,5348,15,1,19,480,0,9,14,31,27,0.001829,0.000218,0.001749,0.001753,0.000234,0.000504,0.000161,0.000226,0.000191,0.000236,0.001782,1.0
729678,88680,15,1,13,265,0,8,17,34,29,0.0,0.000218,0.001749,0.00159,0.000121,0.000347,0.000226,0.000226,0.000115,0.000128,0.001639,1.0
310137,95752,12,1,19,245,0,7,7,36,7,0.0,8.4e-05,0.001749,0.001753,1.9e-05,0.0,5.7e-05,8.9e-05,0.0,2e-05,0.001782,1.0
222486,2720,11,1,19,122,0,8,14,45,54,0.0,0.001371,0.001749,0.001753,0.000482,0.002042,0.001264,0.00139,0.000955,0.0005,0.001782,1.0
960993,70556,15,1,19,245,0,7,17,49,21,0.0,0.000218,0.001749,0.001753,1.9e-05,2.9e-05,0.000161,0.000226,0.0,2e-05,0.001782,1.0


In [0]:
X_train = X_train.drop(['is_attributed','Normal'], axis = 1)
X_test = X_test.drop(['is_attributed','Normal'], axis = 1)

In [0]:
y_train['is_attributed'] *= ratio
y_test['is_attributed'] *= ratio

In [0]:
# Split the testing data into validation and testing sets
split = int(len(y_test)/2)

inputX = X_train.as_matrix()
inputY = y_train.as_matrix()
inputX_valid = X_test.as_matrix()[:split]
inputY_valid = y_test.as_matrix()[:split]
inputX_test = X_test.as_matrix()[split:]
inputY_test = y_test.as_matrix()[split:]

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  
  import sys
  


In [0]:
print(len(X_train.columns))

20


In [0]:
# Number of input nodes.
input_nodes = 20 # 48

multiplier = 1.5 

# Number of nodes in each hidden layer
hidden_nodes1 = 18
hidden_nodes2 = round(hidden_nodes1 * multiplier)
hidden_nodes3 = round(hidden_nodes2 * multiplier)

# Percent of nodes to keep during dropout.
pkeep = tf.placeholder(tf.float32)

# input
x = tf.placeholder(tf.float32, [None, input_nodes])

# layer 1
W1 = tf.Variable(tf.truncated_normal([input_nodes, hidden_nodes1], stddev = 0.15))
b1 = tf.Variable(tf.zeros([hidden_nodes1]))
y1 = tf.nn.sigmoid(tf.matmul(x, W1) + b1)

# layer 2
W2 = tf.Variable(tf.truncated_normal([hidden_nodes1, hidden_nodes2], stddev = 0.15))
b2 = tf.Variable(tf.zeros([hidden_nodes2]))
y2 = tf.nn.sigmoid(tf.matmul(y1, W2) + b2)

# layer 3
W3 = tf.Variable(tf.truncated_normal([hidden_nodes2, hidden_nodes3], stddev = 0.15)) 
b3 = tf.Variable(tf.zeros([hidden_nodes3]))
y3 = tf.nn.sigmoid(tf.matmul(y2, W3) + b3)
y3 = tf.nn.dropout(y3, rate=1-pkeep)

# layer 4
W4 = tf.Variable(tf.truncated_normal([hidden_nodes3, 2], stddev = 0.15)) 
b4 = tf.Variable(tf.zeros([2]))
y4 = tf.nn.softmax(tf.matmul(y3, W4) + b4)

# output
y = y4
y_ = tf.placeholder(tf.float32, [None, 2])


# Parameters
training_epochs = 1000
training_dropout = 0.8
display_step = 10 
n_samples = y_train.shape[0]
batch_size = 1024# 2048
learning_rate = 0.01


# Cost function: Cross Entropy
cost = -tf.reduce_sum(y_ * tf.log(y))

# We will optimize our model via AdamOptimizer
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

# Correct prediction if the most likely value (Fraud or Normal) from softmax equals the target value.
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

accuracy_summary = []
cost_summary = []
valid_accuracy_summary = [] 
valid_cost_summary = [] 
stop_early = 0 


# Save the best weights so that they can be used to make the final predictions
checkpoint = "best_model.ckpt"
saver = tf.train.Saver(max_to_keep=1)

# Initialize variables and tensorflow session
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(training_epochs): 
        for batch in range(int(n_samples/batch_size)):
            batch_x = inputX[batch*batch_size : (1+batch)*batch_size]
            batch_y = inputY[batch*batch_size : (1+batch)*batch_size]

            sess.run([optimizer], feed_dict={x: batch_x, 
                                             y_: batch_y,
                                             pkeep: training_dropout})

        # Display logs after every 10 epochs
        if (epoch) % display_step == 0:
            train_accuracy, newCost = sess.run([accuracy, cost], feed_dict={x: inputX, 
                                                                            y_: inputY,
                                                                            pkeep: training_dropout})

            valid_accuracy, valid_newCost = sess.run([accuracy, cost], feed_dict={x: inputX_valid, 
                                                                                  y_: inputY_valid,
                                                                                  pkeep: 1})

            print ("Epoch:", epoch,
                   "Acc =", "{:.5f}".format(train_accuracy), 
                   "Cost =", "{:.5f}".format(newCost),
                   "Valid_Acc =", "{:.5f}".format(valid_accuracy), 
                   "Valid_Cost = ", "{:.5f}".format(valid_newCost))
            
            # Save the weights if these conditions are met.
            if epoch > 0 and valid_accuracy > max(valid_accuracy_summary) and valid_accuracy > 0.995:
                saver.save(sess, checkpoint)
            
            # Record the results of the model
            accuracy_summary.append(train_accuracy)
            cost_summary.append(newCost)
            valid_accuracy_summary.append(valid_accuracy)
            valid_cost_summary.append(valid_newCost)
            
            # If the model does not improve after 15 logs, stop the training.
            if valid_accuracy < max(valid_accuracy_summary) and epoch > 100:
                stop_early += 1
                if stop_early == 20:
                    break
            else:
                stop_early = 0
            
    print()
    print("Optimization Finished!")
    print()

Epoch: 0 Acc = 0.00245 Cost = 1109913.00000 Valid_Acc = 0.00252 Valid_Cost =  140512.62500
Epoch: 10 Acc = 0.00245 Cost = 1108838.12500 Valid_Acc = 0.00252 Valid_Cost =  140435.26562
Epoch: 20 Acc = 0.00245 Cost = 1108821.50000 Valid_Acc = 0.00252 Valid_Cost =  140434.26562
Epoch: 30 Acc = 0.00245 Cost = 1108819.75000 Valid_Acc = 0.00252 Valid_Cost =  140433.98438
Epoch: 40 Acc = 0.00245 Cost = 1108819.25000 Valid_Acc = 0.00252 Valid_Cost =  140433.92188
Epoch: 50 Acc = 0.00245 Cost = 1108819.50000 Valid_Acc = 0.00252 Valid_Cost =  140433.93750
Epoch: 60 Acc = 0.00245 Cost = 1108819.37500 Valid_Acc = 0.00252 Valid_Cost =  140433.93750
Epoch: 70 Acc = 0.00245 Cost = 1108819.37500 Valid_Acc = 0.00252 Valid_Cost =  140433.93750
Epoch: 80 Acc = 0.00245 Cost = 1108819.37500 Valid_Acc = 0.00252 Valid_Cost =  140433.93750
Epoch: 90 Acc = 0.00245 Cost = 1108819.37500 Valid_Acc = 0.00252 Valid_Cost =  140433.93750
Epoch: 100 Acc = 0.00245 Cost = 1108819.37500 Valid_Acc = 0.00252 Valid_Cost =  1

KeyboardInterrupt: ignored