**Working on smaller chunk of data**

**train_sample & test**

**Data fields** : Each row of the training data contains a click record, with the following features.

1. ip: ip address of click.
2. app: app id for marketing.
3. device: device type id of user mobile phone (e.g., iphone 6 plus, iphone 7, huawei mate 7, etc.)
4. os: os version id of user mobile phone
5. channel: channel id of mobile ad publisher
6. click_time: timestamp of click (UTC)
7. attributed_time: if the user downloaded the app after clicking an ad, this is the time of the app download
8. is_attributed: the target that is to be predicted, indicating the app was downloaded

Note that ip, app, device, os, and channel are encoded.

In [66]:
import pandas as pd
# Training sample of click records (includes the is_attributed target column).
train = pd.read_csv('train_sample.csv')
# Test clicks to be scored; identified by click_id and without the target column.
test = pd.read_csv('test.csv')

In [67]:
# Report (rows, columns) for each frame: train first, then test.
for frame in (train, test):
    print(frame.shape)

(100000, 8)
(18790469, 7)


In [68]:
# Missing values per column. The vectorised isnull().sum() gives the same
# per-column counts as apply + Python's built-in sum, without looping over
# every element in Python.
train.isnull().sum()

ip                     0
app                    0
device                 0
os                     0
channel                0
click_time             0
attributed_time    99773
is_attributed          0
dtype: int64

In [69]:
# Summary of column dtypes, non-null counts and memory usage for the training sample.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
ip                 100000 non-null int64
app                100000 non-null int64
device             100000 non-null int64
os                 100000 non-null int64
channel            100000 non-null int64
click_time         100000 non-null object
attributed_time    227 non-null object
is_attributed      100000 non-null int64
dtypes: int64(6), object(2)
memory usage: 6.1+ MB


In [70]:
# Peek at 5 random rows (no random_state is set, so the rows differ between runs).
train.sample(5)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
42614,120419,15,1,14,130,2017-11-08 16:22:37,,0
36797,24965,1,1,19,135,2017-11-07 10:05:30,,0
59388,23478,3,1,9,417,2017-11-07 23:37:22,,0
95547,88551,18,1,19,107,2017-11-07 20:55:54,,0
92690,114276,3,1,17,417,2017-11-07 12:49:01,,0


In [71]:
# Ten random rows among the clicks that have a recorded download time.
train.dropna(subset=['attributed_time']).sample(10)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
13770,177975,45,1,4,419,2017-11-07 10:14:19,2017-11-07 10:14:30,1
39198,46364,9,1,12,489,2017-11-09 03:34:38,2017-11-09 07:28:37,1
95330,260575,45,1,18,5,2017-11-08 06:21:05,2017-11-08 06:21:17,1
18529,53389,35,1,19,21,2017-11-07 03:03:16,2017-11-07 03:05:13,1
61222,218061,45,1,26,5,2017-11-08 06:19:39,2017-11-08 06:19:51,1
19190,253653,20,1,19,478,2017-11-07 23:18:18,2017-11-08 09:06:37,1
45186,295505,19,351,21,282,2017-11-09 01:03:09,2017-11-09 01:04:00,1
14419,354266,5,1,30,113,2017-11-09 04:43:03,2017-11-09 04:43:30,1
62924,224428,108,1,23,243,2017-11-08 14:19:50,2017-11-08 14:40:50,1
63685,177515,19,97,24,213,2017-11-07 02:53:43,2017-11-07 02:54:59,1


In [72]:
# Print a frequency table for every column except 'ip', separated by a rule.
# TODO: plotting these distributions would be easier to read than raw counts.
separator = '*' * 50
for column in [name for name in train.columns if name != 'ip']:
    print(train[column].value_counts())
    print(separator)

3      18279
12     13198
2      11737
9       8992
15      8595
18      8315
14      5359
1       3135
13      2422
8       2004
21      1979
11      1927
26      1633
23      1454
6       1303
64      1079
7        981
20       911
25       804
28       720
24       704
27       696
19       478
10       388
22       386
17       380
29       360
32       286
5        188
151      109
       ...  
551        1
168        1
105        1
233        1
302        1
78         1
112        1
115        1
116        1
372        1
123        1
124        1
548        1
100        1
163        1
99         1
97         1
96         1
31         1
347        1
91         1
538        1
474        1
536        1
216        1
407        1
145        1
271        1
398        1
92         1
Name: app, Length: 161, dtype: int64
**************************************************
1       94338
2        4345
0         541
3032      371
3543      151
3866       93
59         12
6           8
16     

In [73]:
# Parse the click timestamp strings into datetime values in a scratch column;
# 'XX' is only used for the weekday check and is dropped again later.
train['XX'] = pd.to_datetime(train['click_time'], format="%Y-%m-%d %H:%M:%S")

In [74]:
# Number of training clicks per weekday (pandas numbers Monday as 0).
train['XX'].dt.weekday.value_counts()

2    34035
1    32393
3    28561
0     5011
Name: XX, dtype: int64

In [75]:
# Peek at 5 random test rows (unseeded sample).
test.sample(5)

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
4162302,4162302,31382,26,1,12,477,2017-11-10 05:16:18
630012,630013,71699,3,1,13,442,2017-11-10 04:11:09
16326074,16326074,50218,9,1,13,244,2017-11-10 14:14:15
17728164,17728164,43671,18,1,27,439,2017-11-10 14:39:15
4246895,4246895,7554,14,1,13,401,2017-11-10 05:18:05


In [76]:
# Parse the test click timestamps into a scratch datetime column ('X'),
# used for the day-of-month check and dropped again later.
test['X'] = pd.to_datetime(test['click_time'], format="%Y-%m-%d %H:%M:%S")

In [77]:
# Number of test clicks per day of month.
test['X'].dt.day.value_counts()

10    18790469
Name: X, dtype: int64

In [78]:
# Every test click falls on a single day of the month (see the day counts
# above), so day-of-month and weekday features cannot help at prediction time.
# Only time-of-day features (hour, minute) are derived.
# The timestamp column is parsed once and reused for both features rather
# than being re-parsed for each one.
click_ts = pd.to_datetime(train['click_time'], format="%Y-%m-%d %H:%M:%S")
train['hour'] = click_ts.dt.hour
train['minute'] = click_ts.dt.minute
train.dtypes

ip                          int64
app                         int64
device                      int64
os                          int64
channel                     int64
click_time                 object
attributed_time            object
is_attributed               int64
XX                 datetime64[ns]
hour                        int64
minute                      int64
dtype: object

In [79]:
# Remove the raw timestamp strings and the scratch datetime column now that
# hour/minute have been extracted; only numeric columns remain afterwards.
train = train.drop(columns=['click_time', 'attributed_time', 'XX'])
train.dtypes

ip               int64
app              int64
device           int64
os               int64
channel          int64
is_attributed    int64
hour             int64
minute           int64
dtype: object

In [80]:
# Derive the same time-of-day features as for the training frame.
# The timestamps are parsed once and reused: the test frame has ~18.8M rows,
# so parsing the column separately for each feature doubles the parsing cost.
test_click_ts = pd.to_datetime(test['click_time'], format="%Y-%m-%d %H:%M:%S")
test['hour'] = test_click_ts.dt.hour
test['minute'] = test_click_ts.dt.minute
# Release the temporary so the parsed column does not linger in kernel memory.
del test_click_ts
test.dtypes

click_id               int64
ip                     int64
app                    int64
device                 int64
os                     int64
channel                int64
click_time            object
X             datetime64[ns]
hour                   int64
minute                 int64
dtype: object

In [81]:
# Remove the raw timestamp string and the scratch datetime column from the
# test frame, leaving click_id plus the numeric feature columns.
test = test.drop(columns=['click_time', 'X'])
test.dtypes

click_id    int64
ip          int64
app         int64
device      int64
os          int64
channel     int64
hour        int64
minute      int64
dtype: object

**Data fields** : Each row of the training data contains a click record, with the following features.

1. ip: ip address of click.
2. app: app id for marketing.
3. device: device type id of user mobile phone (e.g., iphone 6 plus, iphone 7, huawei mate 7, etc.)
4. os: os version id of user mobile phone
5. channel: channel id of mobile ad publisher
6. click_time: timestamp of click (UTC)
7. attributed_time: if the user downloaded the app after clicking an ad, this is the time of the app download
8. is_attributed: the target that is to be predicted, indicating the app was downloaded

'app', 'channel', 'time' are critical

In [82]:
# Correlation between the app and channel columns (Series.corr defaults to Pearson).
# NOTE(review): both columns hold encoded ids (see the data-field notes), so a
# linear correlation of the codes may say little about how they actually relate.
train['app'].corr(train['channel'])

-0.028237659717744154

In [83]:
#import seaborn as sns
#sns.pairplot(train[['channel','app']])

In [84]:
# Eyeball 10 random (channel, app) pairs.
train[['channel','app']].sample(10)

Unnamed: 0,channel,app
19356,489,14
42828,17,1
79228,134,18
27814,280,3
78325,334,9
81175,107,18
14461,469,13
54426,245,12
5993,234,9
4631,105,24


# Model

In [85]:
# Separate the target from the features. click_id is removed from the test
# features here and is written into the submission frame later.
ytrain = train['is_attributed']
Xtrain = train.drop(columns=['is_attributed'])
Xtest = test.drop(columns=['click_id'])

In [86]:
# Confirm that every remaining training column is numeric before modelling.
train.dtypes

ip               int64
app              int64
device           int64
os               int64
channel          int64
is_attributed    int64
hour             int64
minute           int64
dtype: object

In [87]:
# dtype of the target Series.
ytrain.dtypes

dtype('int64')

In [88]:
# Columns and dtypes of the test feature frame, for comparison with the training features.
Xtest.dtypes

ip         int64
app        int64
device     int64
os         int64
channel    int64
hour       int64
minute     int64
dtype: object

In [89]:
from xgboost import XGBClassifier
# XGBoost classifier with library-default hyperparameters (no arguments passed).
model = XGBClassifier()

In [90]:
# Fit on the training sample; %time reports how long the fit takes.
%time model.fit(Xtrain, ytrain)

Wall time: 5.01 s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [91]:
%time ypred = model.predict(Xtest)

  if diff:


Wall time: 34.6 s


In [92]:
#predictions = [round(value) for value in ypred]

In [93]:
# Load the submission template; it supplies the expected column names
# (click_id, is_attributed) for the output file.
output = pd.read_csv('sample_submission.csv')
output.dtypes

click_id         int64
is_attributed    int64
dtype: object

In [94]:
# Fill the submission template with the test ids and the model output.
output = output.assign(click_id=test['click_id'], is_attributed=ypred)

In [44]:
# Sanity check: the submission should have one row per test click and two columns.
output.shape

(18790469, 2)

In [95]:
# Peek at 5 random submission rows.
output.sample(5)

Unnamed: 0,click_id,is_attributed
16544739,16544739,0
8799190,8799190,0
16015726,16015726,0
3694037,3694037,0
2781670,2781670,0


In [96]:
# Distribution of the values written to is_attributed.
output['is_attributed'].value_counts()

0    18790469
Name: is_attributed, dtype: int64

In [29]:
# Write the submission file without the DataFrame index column.
# NOTE(review): the recorded run of this cell failed with PermissionError.
# Presumably 'm1s.csv' was open in another program or the working directory
# was not writable; verify and re-run so the saved notebook ends cleanly.
output.to_csv('m1s.csv', index=False)

PermissionError: [Errno 13] Permission denied