## 1. Import data

In [2]:
import gc
import numpy as np
import pandas as pd

# Read only the first 10,000 rows of the training file; click_time is parsed as datetime
X_train = pd.read_csv('train.csv', nrows=10000, parse_dates=['click_time'])

# Split the click timestamp into compact unsigned-integer calendar parts
train_click_dt = X_train['click_time'].dt
for time_part in ('day', 'hour', 'minute', 'second'):
    X_train[time_part] = getattr(train_click_dt, time_part).astype('uint8')

# Show the head of the table
X_train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second
0,83230,3,1,13,379,2017-11-06 14:32:21,,0,6,14,32,21
1,17357,3,1,19,379,2017-11-06 14:33:34,,0,6,14,33,34
2,35810,3,1,13,379,2017-11-06 14:34:12,,0,6,14,34,12
3,45745,14,1,13,478,2017-11-06 14:34:52,,0,6,14,34,52
4,161007,3,1,13,379,2017-11-06 14:35:08,,0,6,14,35,8


In [3]:
# X_train.columns
# X_train.describe()

In [4]:
# Read the first 1,000 test rows and discard the submission id column
X_test = (
    pd.read_csv('test.csv', nrows=1000, parse_dates=['click_time'])
    .drop(columns='click_id')
)

# Same calendar parts as derived for the training frame
test_click_dt = X_test['click_time'].dt
for time_part in ('day', 'hour', 'minute', 'second'):
    X_test[time_part] = getattr(test_click_dt, time_part).astype('uint8')

# X_test.head()

## 2. Feature Manipulation

In [3]:
# Column groups for the confidence-weighted rate features: every entry is the
# list of columns that are grouped on together to produce one `*_confRate` column.
ATTRIBUTION_CATEGORIES = [
    # V1 features: each categorical column on its own
    ['ip'],
    ['app'],
    ['device'],
    ['os'],
    ['channel'],

    # V2 features: app crossed with another column
    ['app', 'channel'],
    ['app', 'os'],
    ['app', 'device'],

    # V3 features: remaining pairwise crosses
    ['channel', 'os'],
    ['channel', 'device'],
    ['os', 'device'],
]

In [4]:
# Remember how many rows each split has so the stacked frame can be cut apart again
train_size = X_train.shape[0]
test_size = X_test.shape[0]

# Stack the test rows underneath the training rows (columns missing on one side
# become NaN, column order is not sorted).
# pd.concat replaces DataFrame.append, which was deprecated and then removed in pandas 2.0.
X_train = pd.concat([X_train, X_test], sort=False)

$\sum_{i=1}^{n} i = 50$

In [5]:
# Find the confidence-weighted frequency of is_attributed for each column group
# and merge it back onto X_train as a new `<cols>_confRate` column.
# NOTE(review): each rate is computed from the labels of the same rows it is
# merged onto — confirm this target leakage is acceptable for the models below.

# Log-scale reference: a group needs 100000 labelled rows for full confidence.
# 1000 views -> 60% confidence, 100 views -> 40% confidence
log_group = np.log(100000)


def rate_calculation(x):
    """Calculate the attributed rate of one group, scaled by confidence.

    Parameters
    ----------
    x : pandas.Series
        `is_attributed` values of a single group. Missing values (rows that
        carry no label) are skipped by both `sum` and `count`.

    Returns
    -------
    float
        ``mean(x) * min(1, log(n) / log_group)`` where ``n`` is the number of
        non-missing values, or NaN when the group has no labelled row at all.
    """
    n_labelled = x.count()
    if n_labelled == 0:
        # No label available: return NaN explicitly instead of relying on
        # 0/0 and log(0), which only produce NaN via runtime warnings.
        return np.nan
    rate = x.sum() / float(n_labelled)
    conf = min(1.0, np.log(n_labelled) / log_group)
    return rate * conf


for cols in ATTRIBUTION_CATEGORIES:

    # New feature name
    new_feature = '_'.join(cols) + '_confRate'

    # Perform the groupby
    group_object = X_train.groupby(cols)

    # Group sizes (counts every row of the group, labelled or not)
    group_sizes = group_object.size()

    print(">> Calculating confidence-weighted rate for: {}.\n   Saving to: {}. Group Max /Mean / Median / Min: {} / {} / {} / {}".format(
        cols, new_feature,
        group_sizes.max(),
        np.round(group_sizes.mean(), 2),
        np.round(group_sizes.median(), 2),
        group_sizes.min()
    ))

    # One row per group: the group key columns plus the new feature
    group_rates = (
        group_object['is_attributed']
        .apply(rate_calculation)
        .reset_index()
        .rename(columns={'is_attributed': new_feature})
    )

    # Perform the merge; a left join keeps every row of X_train
    X_train = X_train.merge(
        group_rates[cols + [new_feature]],
        on=cols, how='left'
    )

>> Calculating confidence-weighted rate for: ['ip'].
   Saving to: ip_confRate. Group Max /Mean / Median / Min: 73 / 2.36 / 2.0 / 1




>> Calculating confidence-weighted rate for: ['app'].
   Saving to: app_confRate. Group Max /Mean / Median / Min: 1732 / 171.88 / 8.0 / 1
>> Calculating confidence-weighted rate for: ['device'].
   Saving to: device_confRate. Group Max /Mean / Median / Min: 10449 / 687.5 / 1.5 / 1
>> Calculating confidence-weighted rate for: ['os'].
   Saving to: os_confRate. Group Max /Mean / Median / Min: 2665 / 141.03 / 15.5 / 1
>> Calculating confidence-weighted rate for: ['channel'].
   Saving to: channel_confRate. Group Max /Mean / Median / Min: 772 / 94.02 / 39.0 / 1
>> Calculating confidence-weighted rate for: ['app', 'channel'].
   Saving to: app_channel_confRate. Group Max /Mean / Median / Min: 520 / 48.25 / 17.0 / 1
>> Calculating confidence-weighted rate for: ['app', 'os'].
   Saving to: app_os_confRate. Group Max /Mean / Median / Min: 461 / 12.44 / 3.0 / 1
>> Calculating confidence-weighted rate for: ['app', 'device'].
   Saving to: app_device_confRate. Group Max /Mean / Median / Min: 1705

In [6]:
# Summary statistics of the numeric columns, including the new *_confRate features
X_train.describe()

Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour,minute,second,...,app_confRate,device_confRate,os_confRate,channel_confRate,app_channel_confRate,app_os_confRate,app_device_confRate,channel_os_confRate,channel_device_confRate,os_device_confRate
count,11000.0,11000.0,11000.0,11000.0,11000.0,10000.0,11000.0,11000.0,11000.0,11000.0,...,10996.0,10997.0,10995.0,10965.0,10928.0,10943.0,10992.0,10830.0,10956.0,10993.0
mean,86373.184,12.822545,30.902,28.753,261.855182,0.0023,6.363636,14.860818,2.032455,7.389909,...,0.000654,0.001572,0.001169,0.000654,0.000413,0.00038,0.000582,0.000325,0.00054,0.001115
std,54825.99094,17.309722,297.877138,79.32846,129.786646,0.047906,1.149971,3.442565,9.774033,7.358735,...,0.005074,0.001081,0.002794,0.005205,0.003863,0.004195,0.00467,0.003667,0.004417,0.001888
min,195.0,1.0,0.0,0.0,3.0,0.0,6.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,41966.0,3.0,1.0,13.0,140.0,0.0,6.0,16.0,0.0,3.0,...,0.0,0.001594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,81374.0,12.0,1.0,18.0,245.0,0.0,6.0,16.0,0.0,7.0,...,0.0,0.001594,0.001393,0.0,0.0,0.0,0.0,0.0,0.0,0.001419
75%,119628.0,15.0,1.0,19.0,379.0,0.0,6.0,16.0,0.0,10.0,...,0.000581,0.001594,0.002098,0.0,0.0,0.0,0.0,0.0,0.0,0.002156
max,212619.0,536.0,3032.0,607.0,498.0,1.0,10.0,16.0,59.0,59.0,...,0.148438,0.021205,0.071945,0.090309,0.111835,0.129692,0.148438,0.111835,0.090309,0.048291


## Method 1: XGBoost

In [7]:
# Rows from position train_size onwards are the stacked test rows: drop the
# target column and keep numeric columns only
X_test = (
    X_train.iloc[train_size:]
    .drop(columns='is_attributed')
    .select_dtypes(include=[np.number])
)

# The first train_size rows are the training rows (all columns kept)
X_train = X_train.iloc[:train_size]

In [8]:
import xgboost as xgb

# Target vector and numeric-only feature matrix
y = X_train['is_attributed']
X = X_train.drop(columns='is_attributed').select_dtypes(include=[np.number])

# Hyper-parameters taken from: https://www.kaggle.com/aharless/swetha-s-xgboost-revised
xgb_params = {
    'max_depth': 4,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.7,
    'scale_pos_weight': 9,
    'min_child_weight': 0,
    'reg_alpha': 4,
    'n_jobs': 2,
    'objective': 'binary:logistic',
}
clf_xgBoost = xgb.XGBClassifier(**xgb_params)

# Train on the training subset; the fitted estimator is the cell's output
clf_xgBoost.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
       colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=4, min_child_weight=0, missing=None, n_estimators=100,
       n_jobs=2, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=4, reg_lambda=1, scale_pos_weight=9, seed=None,
       silent=True, subsample=0.8)

## Method 2: Keras

In [9]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

Using TensorFlow backend.


In [10]:
def build_classifier(input_dim=None):
    """Build and compile the feed-forward binary classifier.

    The network is Dense(50) -> Dense(25) -> Dense(5) -> Dense(1, sigmoid),
    with ReLU on the hidden layers and Dropout(0.1) after each hidden layer.
    It is compiled with the Adam optimizer, binary cross-entropy loss and
    accuracy as the reported metric.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, the width of the global
        feature matrix ``X`` (``X.shape[1]``) is used, as before.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    if input_dim is None:
        # Backward-compatible default: size the input layer from the global X
        input_dim = X.shape[1]

    classifier = Sequential()
    # NOTE(review): the first layer keeps the default kernel initializer while
    # the others use "uniform" — confirm whether that difference is intended.
    classifier.add(Dense(activation='relu', input_dim=input_dim, units=50))
    classifier.add(Dropout(rate = 0.1))
    classifier.add(Dense(kernel_initializer="uniform", activation='relu', units=25))
    classifier.add(Dropout(rate = 0.1))
    classifier.add(Dense(kernel_initializer="uniform", activation='relu', units=5))
    classifier.add(Dropout(rate = 0.1))
    classifier.add(Dense(kernel_initializer='uniform',activation='sigmoid', units=1))
    classifier.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
    return classifier

In [11]:
# Instantiate the compiled network returned by build_classifier()
classifier = build_classifier()

In [12]:
# Train on the feature matrix X and target y for 10 epochs with batches of 100 rows
classifier.fit(X, y,  epochs = 10, batch_size = 100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f693073eb38>

### Save the trained Keras model

In [None]:
import h5py
# NOTE(review): load_model is imported but not called in the cells shown here
from keras.models import load_model
# Persist the trained Keras model to "talking-data.h5" (path relative to the working directory)
classifier.save("talking-data.h5")

## 3. Prediction

In [15]:
# Predict with the fitted XGBoost classifier on the numeric test features.
# NOTE(review): predict() gives class labels rather than probabilities —
# confirm that labels are what the submission should contain.
predictions = clf_xgBoost.predict(X_test)

  if diff:


In [13]:
# Keep the Keras output under its own name. Run top to bottom, the original
# assignment overwrote the XGBoost `predictions` with a 2-D (n_rows, 1) array,
# which the submission DataFrame cannot take as a column. ravel() flattens the
# single sigmoid output to one value per test row.
keras_predictions = classifier.predict(X_test).ravel()

In [16]:
# Display the predictions array that goes into the submission
predictions

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

## 4. Submission

In [22]:
# Re-read the same first 1000 test rows to recover click_id, which was dropped when X_test was built
testset = pd.read_csv('test.csv', nrows=1000, parse_dates=['click_time'])

In [23]:
# Two-column submission table: one prediction per click_id
my_submission = pd.DataFrame({
    'click_id': testset['click_id'],
    'is_attributed': predictions,
})

In [24]:
# Write the submission without the DataFrame index column.
# Any filename would work; 'submission.csv' is used here.
my_submission.to_csv('submission.csv', index=False)