Data fields
Each row of the training data contains a click record, with the following features.

ip: ip address of click.
app: app id for marketing.
device: device type id of user mobile phone (e.g., iphone 6 plus, iphone 7, huawei mate 7, etc.)
os: os version id of user mobile phone
channel: channel id of mobile ad publisher
click_time: timestamp of click (UTC)
attributed_time: if user download the app for after clicking an ad, this is the time of the app download
is_attributed: the target that is to be predicted, indicating the app was downloaded
Note that ip, app, device, os, and channel are encoded.

The test data is similar, with the following differences:

click_id: reference for making predictions
is_attributed: not included

Soruce: https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/data

In [14]:
# Python 2 & 3 Compatibility
from __future__ import print_function, division

# Necessary imports
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.cross_validation import train_test_split
from sklearn import svm


import os
import pickle
%matplotlib inline
from datetime import datetime
from dateutil.parser import parse

In [2]:
with open("all_converted_data.pkl", 'rb') as picklefile: 
    df_new = pickle.load(picklefile)
df_new.shape


(1370538, 13)

In [3]:
df_new.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,hour,date,day,month,year
0,117867,12,1,18,19,2017-11-08 23:43:20,,0,23,2017-11-08,8,11,2017
1,43715,17,1,13,280,2017-11-08 05:36:19,,0,5,2017-11-08,8,11,2017
2,183537,2,1,13,477,2017-11-06 17:21:36,,0,17,2017-11-06,6,11,2017
3,46323,3,1,13,402,2017-11-09 06:26:40,,0,6,2017-11-09,9,11,2017
4,4248,9,1,13,489,2017-11-07 10:28:31,,0,10,2017-11-07,7,11,2017


In [4]:
df_download = df_new[df_new.is_attributed == 1].sample(5000)
df_notDownload = df_new[df_new.is_attributed != 1].sample(5000*2)
df_all = df_notDownload.append(df_download, ignore_index=True)

In [7]:
df_all.shape

(15000, 13)

In [8]:
selectedFeatures=['ip', 'app', 'device', 'os', 'channel','hour']

In [9]:
df_all.dtypes

ip                          int64
app                         int64
device                      int64
os                          int64
channel                     int64
click_time         datetime64[ns]
attributed_time            object
is_attributed               int64
hour                        int64
date                       object
day                         int64
month                       int64
year                        int64
dtype: object

In [10]:
X=df_all.loc[:, selectedFeatures]
y=df_all.loc[:, ['is_attributed']]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,random_state=42)


In [17]:
def runSVC(X_train_p, y_train_p, X_test_p, y_test_p):
    from sklearn.svm import SVC
    model = SVC()
    model.fit(X_train_p, y_train_p)
    y_pred = model.predict(X_test_p)
    print(accuracy_score(y_test_p,y_pred))
    print(f1_score(y_test_p, y_pred, average='macro'))
    print(f1_score(y_test_p, y_pred, average=None))
    print(f1_score(y_test_p, y_pred, average='weighted'))


In [18]:
runSVC(X_train, y_train, X_test, y_test)

  y = column_or_1d(y, warn=True)


0.6716
0.40293879199027477
[0.80344745 0.00243013]
0.5400729566034679


In [19]:
def runSVM(X_train_p, y_train_p, X_test_p, y_test_p):
    model = svm.SVC()
    model.fit(X_train_p, y_train_p)
    y_pred = model.predict(X_test_p)
    print(accuracy_score(y_test_p,y_pred))
    print(f1_score(y_test_p, y_pred, average='macro'))
    print(f1_score(y_test_p, y_pred, average=None))
    print(f1_score(y_test_p, y_pred, average='weighted'))


In [20]:
runSVM(X_train, y_train, X_test, y_test)

  y = column_or_1d(y, warn=True)


0.6716
0.40293879199027477
[0.80344745 0.00243013]
0.5400729566034679


In [21]:
def runGaussianNB(X_train_p, y_train_p, X_test_p, y_test_p):
    from sklearn.naive_bayes import GaussianNB

    # Initialize our classifier
    model = GaussianNB()
    model.fit(X_train_p, y_train_p)
    y_pred = model.predict(X_test_p)
    print(accuracy_score(y_test_p,y_pred))
    print(f1_score(y_test_p, y_pred, average='macro'))
    print(f1_score(y_test_p, y_pred, average=None))
    print(f1_score(y_test_p, y_pred, average='weighted'))


In [22]:
runGaussianNB(X_train, y_train, X_test, y_test)

0.7653333333333333
0.690708199494336
[0.84263233 0.53878407]
0.7427270224309661


  y = column_or_1d(y, warn=True)


In [25]:
def runLog(X_train_p, y_train_p, X_test_p, y_test_p):
    from sklearn.linear_model import LogisticRegression
    # Initialize our classifier
    model = LogisticRegression()
    model.fit(X_train_p, y_train_p)
    y_pred = model.predict(X_test_p)
    print(accuracy_score(y_test_p,y_pred))
    print(f1_score(y_test_p, y_pred, average='macro'))
    print(f1_score(y_test_p, y_pred, average=None))
    print(f1_score(y_test_p, y_pred, average='weighted'))


In [26]:
runLog(X_train, y_train, X_test, y_test)

0.7676
0.7293659748158918
[0.83108828 0.62764367]
0.7641956933943912


  y = column_or_1d(y, warn=True)


In [39]:
def runRandom(X_train_p, y_train_p, X_test_p, y_test_p):
    from sklearn.ensemble import RandomForestClassifier    # Initialize our classifier
    model = RandomForestClassifier()
    model.fit(X_train_p, y_train_p)
    y_train_pred = model.predict(X_train_p)
    print(accuracy_score(y_train_p,y_train_pred))
    print("------")
    y_pred = model.predict(X_test_p)
    print(accuracy_score(y_test_p,y_pred))
    print(f1_score(y_test_p, y_pred, average='macro'))
    print(f1_score(y_test_p, y_pred, average=None))
    print(f1_score(y_test_p, y_pred, average='weighted'))
    print(confusion_matrix(y_test_p, y_pred))

In [40]:
runRandom(X_train, y_train, X_test, y_test)

0.9922666666666666
------
0.9246666666666666
0.9123954060585083
[0.94518289 0.87960793]
0.9236218389910879
[[4871  163]
 [ 402 2064]]


  after removing the cwd from sys.path.


Attemp for other dataset from kaggle

In [20]:
df = pd.read_csv('data/creditcard.csv')

In [21]:
df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [107]:
df.shape

(284807, 31)

In [108]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [109]:
df.shape

(284807, 31)

In [110]:
from sklearn.model_selection import KFold, cross_val_score

In [111]:
df.dtypes

Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class       int64
dtype: object

In [144]:
s = ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']

In [145]:
X=df.loc[:, s]
y=df.loc[:, ['Class']]

In [146]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,random_state=42)


In [160]:
# fit linear model
from sklearn import linear_model
#model_svm = linear_model.SGDClassifier()
model_svm = svm.SVC()
model_svm.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [161]:
# predict out of sample
y_pred = model_svm.predict(X_test)

In [162]:
accuracy_score(y_test,y_pred)


0.9983497654560265

In [150]:
from sklearn.naive_bayes import GaussianNB

# Initialize our classifier
gnb = GaussianNB()

# Train our classifier
model = gnb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [151]:
# Make predictions
preds = gnb.predict(X_test)
print(preds)

[1 0 0 ... 0 0 0]


In [152]:

from sklearn.metrics import accuracy_score

# Evaluate accuracy
print(accuracy_score(y_test, preds))

0.9932024381337603


In [153]:
#df_test = pd.read_csv('data/test.csv')

In [154]:
#df_test.head()

In [155]:
#df_test_new = convertDF(df_test)
#df_test_new.head()

In [156]:
# X_sub=df_test_new.loc[:, selectedFeatures]
# X_sub.head()

In [157]:
# y_pred_sub = model_svm.predict(X_sub)

In [158]:
# y_pred_sub

In [159]:
# X_sub_final = df_test

In [None]:
# X_sub_final['is_attributed'] = y_pred_sub

In [None]:
# X_sub_final.head()