**Working on smaller chunk of data**

**train_sample & test**

**Data fields** : Each row of the training data contains a click record, with the following features.

1. ip: ip address of click.
2. app: app id for marketing.
3. device: device type id of user mobile phone (e.g., iphone 6 plus, iphone 7, huawei mate 7, etc.)
4. os: os version id of user mobile phone
5. channel: channel id of mobile ad publisher
6. click_time: timestamp of click (UTC)
7. attributed_time: if user download the app for after clicking an ad, this is the time of the app download
8. is_attributed: the target that is to be predicted, indicating the app was downloaded

Note that ip, app, device, os, and channel are encoded.

In [1]:
import pandas as pd
train = pd.read_csv('train_sample.csv')
test = pd.read_csv('test.csv')

In [2]:
print(train.shape)
print(test.shape)

(100000, 8)
(18790469, 7)


In [3]:
train.apply(lambda x: sum(x.isnull()))

ip                     0
app                    0
device                 0
os                     0
channel                0
click_time             0
attributed_time    99773
is_attributed          0
dtype: int64

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
ip                 100000 non-null int64
app                100000 non-null int64
device             100000 non-null int64
os                 100000 non-null int64
channel            100000 non-null int64
click_time         100000 non-null object
attributed_time    227 non-null object
is_attributed      100000 non-null int64
dtypes: int64(6), object(2)
memory usage: 6.1+ MB


In [5]:
train.sample(5)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
72166,158491,8,1,19,145,2017-11-07 13:04:35,,0
94043,8786,2,1,1,237,2017-11-09 05:20:19,,0
57051,73954,14,1,41,134,2017-11-08 13:56:53,,0
90156,17014,3,2,13,280,2017-11-08 06:50:59,,0
16738,282913,2,1,8,435,2017-11-08 01:41:37,,0


In [6]:
train[train['attributed_time'].notnull()].sample(10)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
94450,93542,19,0,0,347,2017-11-07 03:14:10,2017-11-07 03:17:23,1
11034,96637,19,102,24,213,2017-11-07 00:53:43,2017-11-07 08:32:53,1
62320,17460,9,1,13,489,2017-11-09 03:52:26,2017-11-09 03:52:48,1
55222,120259,3,1,15,280,2017-11-07 05:37:18,2017-11-07 07:26:43,1
22527,60666,19,0,0,347,2017-11-09 09:31:16,2017-11-09 10:13:19,1
73413,177454,3,1,13,280,2017-11-08 04:46:48,2017-11-08 08:57:28,1
38971,56317,50,0,0,213,2017-11-09 02:28:45,2017-11-09 08:56:28,1
65798,303879,10,1,27,113,2017-11-09 08:58:59,2017-11-09 08:59:35,1
15821,5348,19,0,29,213,2017-11-07 14:17:49,2017-11-07 14:17:52,1
39198,46364,9,1,12,489,2017-11-09 03:34:38,2017-11-09 07:28:37,1


In [7]:
for c in train.columns:
    if c != 'ip':
        print(train[c].value_counts())
        print('*'*50)
#better way is to plot it

3      18279
12     13198
2      11737
9       8992
15      8595
18      8315
14      5359
1       3135
13      2422
8       2004
21      1979
11      1927
26      1633
23      1454
6       1303
64      1079
7        981
20       911
25       804
28       720
24       704
27       696
19       478
10       388
22       386
17       380
29       360
32       286
5        188
151      109
       ...  
551        1
168        1
105        1
233        1
302        1
78         1
112        1
115        1
116        1
372        1
123        1
124        1
548        1
100        1
163        1
99         1
97         1
96         1
31         1
347        1
91         1
538        1
474        1
536        1
216        1
407        1
145        1
271        1
398        1
92         1
Name: app, Length: 161, dtype: int64
**************************************************
1       94338
2        4345
0         541
3032      371
3543      151
3866       93
59         12
6           8
16     

In [8]:
train['XX'] = pd.to_datetime(train['click_time'], format="%Y-%m-%d %H:%M:%S")

In [9]:
train['XX'].dt.weekday.value_counts()

2    34035
1    32393
3    28561
0     5011
Name: XX, dtype: int64

In [10]:
test.sample(5)

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
13725990,13725990,97773,15,1,19,3,2017-11-10 13:27:17
12649354,12649353,95006,21,2,9,232,2017-11-10 13:06:26
11333506,11333506,93235,9,1,9,258,2017-11-10 10:41:09
10928719,10928719,17130,9,1,22,334,2017-11-10 10:33:32
9578157,9578157,80398,13,1,41,477,2017-11-10 10:07:32


In [11]:
test['X'] = pd.to_datetime(test['click_time'], format="%Y-%m-%d %H:%M:%S")

In [12]:
test['X'].dt.day.value_counts()

10    18790469
Name: X, dtype: int64

In [13]:
##since test dataset doesn't samples from different date that is not there in train dataset, weekday and date won't help
#train['date'] = pd.to_datetime(train['click_time'], format="%Y-%m-%d %H:%M:%S").dt.day
train['hour'] = pd.to_datetime(train['click_time'], format="%Y-%m-%d %H:%M:%S").dt.hour
train['minute'] = pd.to_datetime(train['click_time'], format="%Y-%m-%d %H:%M:%S").dt.minute
#train['weekDay'] = pd.to_datetime(train['click_time'], format="%Y-%m-%d %H:%M:%S").dt.weekday
train.dtypes

ip                          int64
app                         int64
device                      int64
os                          int64
channel                     int64
click_time                 object
attributed_time            object
is_attributed               int64
XX                 datetime64[ns]
hour                        int64
minute                      int64
dtype: object

In [14]:
train.drop(['click_time','attributed_time','XX'], axis=1, inplace=True)
train.dtypes

ip               int64
app              int64
device           int64
os               int64
channel          int64
is_attributed    int64
hour             int64
minute           int64
dtype: object

In [15]:
test['hour'] = pd.to_datetime(test['click_time'], format="%Y-%m-%d %H:%M:%S").dt.hour
test['minute'] = pd.to_datetime(test['click_time'], format="%Y-%m-%d %H:%M:%S").dt.minute
test.dtypes

click_id               int64
ip                     int64
app                    int64
device                 int64
os                     int64
channel                int64
click_time            object
X             datetime64[ns]
hour                   int64
minute                 int64
dtype: object

In [16]:
test.drop(['click_time','X'], axis=1, inplace=True)
test.dtypes

click_id    int64
ip          int64
app         int64
device      int64
os          int64
channel     int64
hour        int64
minute      int64
dtype: object

**Data fields** : Each row of the training data contains a click record, with the following features.

1. ip: ip address of click.
2. app: app id for marketing.
3. device: device type id of user mobile phone (e.g., iphone 6 plus, iphone 7, huawei mate 7, etc.)
4. os: os version id of user mobile phone
5. channel: channel id of mobile ad publisher
6. click_time: timestamp of click (UTC)
7. attributed_time: if user download the app for after clicking an ad, this is the time of the app download
8. is_attributed: the target that is to be predicted, indicating the app was downloaded

'app', 'channel', 'time' are critical

In [17]:
train['app'].corr(train['channel'])

-0.028237659717744154

In [18]:
#import seaborn as sns
#sns.pairplot(train[['channel','app']])

In [19]:
train[['channel','app']].sample(10)

Unnamed: 0,channel,app
7915,280,3
38211,439,14
35885,107,18
69827,259,12
59086,237,2
94960,480,15
66548,121,18
835,463,14
31015,463,14
13389,101,7


# Model

In [20]:
Xtrain = train.drop(['is_attributed'], axis=1)
ytrain = train['is_attributed']
Xtest = test.drop(['click_id'], axis=1)

In [21]:
train.dtypes

ip               int64
app              int64
device           int64
os               int64
channel          int64
is_attributed    int64
hour             int64
minute           int64
dtype: object

In [22]:
ytrain.dtypes

dtype('int64')

In [23]:
Xtest.dtypes

ip         int64
app        int64
device     int64
os         int64
channel    int64
hour       int64
minute     int64
dtype: object

In [24]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(n_estimators=10)

In [25]:
%time model.fit(Xtrain, ytrain)

Wall time: 716 ms


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [26]:
%time ypred = model.predict(Xtest)

Wall time: 3.22 s


In [27]:
#predictions = [round(value) for value in ypred]

In [28]:
output = pd.read_csv('sample_submission.csv')
output.dtypes

click_id         int64
is_attributed    int64
dtype: object

In [29]:
output['click_id'] = test['click_id']
output['is_attributed'] = ypred

In [30]:
output.shape

(18790469, 2)

In [31]:
output.sample(5)

Unnamed: 0,click_id,is_attributed
9742112,9742112,0
15564805,15564805,0
2124429,2124431,0
6034516,6034517,0
18143158,18143158,0


In [32]:
output['is_attributed'].value_counts()

0    18788870
1        1599
Name: is_attributed, dtype: int64

In [33]:
output.to_csv('m4s.csv', index=False)