In [26]:
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [27]:
# Load only the first million rows of the training file and parse
# `click_time` into datetimes at read time.
X_train = pd.read_csv(
    '../input/train.csv',
    nrows=1000000,
    parse_dates=['click_time'],
)

In [28]:
# Show the first rows of the freshly loaded frame.
X_train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,,0
1,17357,3,1,19,379,2017-11-06 14:33:34,,0
2,35810,3,1,13,379,2017-11-06 14:34:12,,0
3,45745,14,1,13,478,2017-11-06 14:34:52,,0
4,161007,3,1,13,379,2017-11-06 14:35:08,,0


In [29]:
# Split the click timestamp into day / hour / minute / second columns.
# Each component is stored as uint8 to keep the frame small.
click_parts = X_train['click_time'].dt
for part in ('day', 'hour', 'minute', 'second'):
    X_train[part] = getattr(click_parts, part).astype('uint8')
X_train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second
0,83230,3,1,13,379,2017-11-06 14:32:21,,0,6,14,32,21
1,17357,3,1,19,379,2017-11-06 14:33:34,,0,6,14,33,34
2,35810,3,1,13,379,2017-11-06 14:34:12,,0,6,14,34,12
3,45745,14,1,13,478,2017-11-06 14:34:52,,0,6,14,34,52
4,161007,3,1,13,379,2017-11-06 14:35:08,,0,6,14,35,8


In [30]:
# Column groupings for which a confidence-weighted attribution rate is added
# to X_train as a new `<cols joined by _>_confRate` column.
ATTRIBUTION_CATEGORIES = [
    # V1
    ['ip'], ['app'], ['device'], ['os'], ['channel'],

    # V2
    ['app', 'channel'],
    ['app', 'os'],
    ['app', 'device'],

    # V3
    ['channel', 'os'],
    ['channel', 'device'],
    ['os', 'device']
]

# Group size at which the confidence weight reaches its cap of 1. Smaller
# groups are down-weighted by log(size) / log(CONFIDENCE_FULL_AT).
CONFIDENCE_FULL_AT = 100000
log_group = np.log(CONFIDENCE_FULL_AT)  # loop-invariant, so computed once

freqs = {}  # not populated in this cell; name kept in case other cells refer to it

# NOTE(review): each rate is computed from the same rows it is merged back
# onto, so every row's feature includes that row's own `is_attributed` label.
# Consider out-of-fold or time-based rates before using these for training.
for cols in ATTRIBUTION_CATEGORIES:
    new_feature = '_'.join(cols) + '_confRate'

    # Keep the cell re-runnable: without this, a second run makes merge()
    # produce `<new_feature>_x` / `<new_feature>_y` instead of replacing it.
    if new_feature in X_train.columns:
        X_train = X_train.drop(columns=[new_feature])

    group_object = X_train.groupby(cols)
    group_sizes = group_object.size()

    print(">> Calculating confidence-weighted rate for: {}.\n Saving to: {}. Group Max /Mean / Median / Min: {} / {} / {} / {}".format(
        cols, new_feature,
        group_sizes.max(),
        np.round(group_sizes.mean(), 2),
        np.round(group_sizes.median(), 2),
        group_sizes.min()
    ))

    # Vectorised equivalent of applying, per group,
    #     rate = sum / count;  conf = min(1, log(count) / log_group);  rate * conf
    # using the built-in aggregations instead of a Python-level apply.
    totals = group_object['is_attributed'].agg(['sum', 'count'])
    rate = totals['sum'] / totals['count']
    conf = np.minimum(1, np.log(totals['count']) / log_group)
    group_rates = (rate * conf).rename(new_feature).reset_index()

    # Left merge so every row of X_train is kept and picks up its group's value.
    X_train = X_train.merge(group_rates, on=cols, how='left')

X_train.head()

>> Calculating confidence-weighted rate for: ['ip'].
 Saving to: ip_confRate. Group Max /Mean / Median / Min: 6437 / 25.25 / 10.0 / 1
>> Calculating confidence-weighted rate for: ['app'].
 Saving to: app_confRate. Group Max /Mean / Median / Min: 141851 / 4672.9 / 12.0 / 1
>> Calculating confidence-weighted rate for: ['device'].
 Saving to: device_confRate. Group Max /Mean / Median / Min: 938053 / 3355.7 / 1.0 / 1
>> Calculating confidence-weighted rate for: ['os'].
 Saving to: os_confRate. Group Max /Mean / Median / Min: 239642 / 6250.0 / 38.0 / 1
>> Calculating confidence-weighted rate for: ['channel'].
 Saving to: channel_confRate. Group Max /Mean / Median / Min: 92534 / 6493.51 / 1200.0 / 1
>> Calculating confidence-weighted rate for: ['app', 'channel'].
 Saving to: app_channel_confRate. Group Max /Mean / Median / Min: 61413 / 1915.71 / 18.0 / 1
>> Calculating confidence-weighted rate for: ['app', 'os'].
 Saving to: app_os_confRate. Group Max /Mean / Median / Min: 34853 / 352.24 / 8

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,...,app_confRate,device_confRate,os_confRate,channel_confRate,app_channel_confRate,app_os_confRate,app_device_confRate,channel_os_confRate,channel_device_confRate,os_device_confRate
0,83230,3,1,13,379,2017-11-06 14:32:21,,0,6,14,...,0.000446,0.001193,0.001146,0.000497,0.00036,0.000366,0.000444,0.0,0.000499,0.001182
1,17357,3,1,19,379,2017-11-06 14:33:34,,0,6,14,...,0.000446,0.001193,0.001394,0.000497,0.00036,0.000275,0.000444,0.000235,0.000499,0.001428
2,35810,3,1,13,379,2017-11-06 14:34:12,,0,6,14,...,0.000446,0.001193,0.001146,0.000497,0.00036,0.000366,0.000444,0.0,0.000499,0.001182
3,45745,14,1,13,478,2017-11-06 14:34:52,,0,6,14,...,0.000431,0.001193,0.001146,0.004071,0.004164,0.000201,0.000445,0.001835,0.004109,0.001182
4,161007,3,1,13,379,2017-11-06 14:35:08,,0,6,14,...,0.000446,0.001193,0.001146,0.000497,0.00036,0.000366,0.000444,0.0,0.000499,0.001182


In [31]:
# Spot-check the first values of the per-IP confidence-weighted rate column.
X_train['ip_confRate'].head()

0    0.000000
1    0.000000
2    0.000000
3    0.000775
4    0.000000
Name: ip_confRate, dtype: float64

In [46]:
# Display the first rows of the frame, including the added *_confRate columns.
X_train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,...,app_confRate,device_confRate,os_confRate,channel_confRate,app_channel_confRate,app_os_confRate,app_device_confRate,channel_os_confRate,channel_device_confRate,os_device_confRate
0,83230,3,1,13,379,2017-11-06 14:32:21,,0,6,14,...,0.000446,0.001193,0.001146,0.000497,0.00036,0.000366,0.000444,0.0,0.000499,0.001182
1,17357,3,1,19,379,2017-11-06 14:33:34,,0,6,14,...,0.000446,0.001193,0.001394,0.000497,0.00036,0.000275,0.000444,0.000235,0.000499,0.001428
2,35810,3,1,13,379,2017-11-06 14:34:12,,0,6,14,...,0.000446,0.001193,0.001146,0.000497,0.00036,0.000366,0.000444,0.0,0.000499,0.001182
3,45745,14,1,13,478,2017-11-06 14:34:52,,0,6,14,...,0.000431,0.001193,0.001146,0.004071,0.004164,0.000201,0.000445,0.001835,0.004109,0.001182
4,161007,3,1,13,379,2017-11-06 14:35:08,,0,6,14,...,0.000446,0.001193,0.001146,0.000497,0.00036,0.000366,0.000444,0.0,0.000499,0.001182


In [90]:
# Group the frame by `app`; the grouped `is_attributed` values are used
# further down in this cell.
a =  X_train.groupby(['app'])

# NOTE(review): the block below is a commented-out experiment and is never executed.
# a['is_attributed'].rename(
#     index=str,
#     columns={'is_attributed': 'aaaaaaaa'}
# )

def rate_calculation(x):
    """Return the raw attributed rate of one group.

    The rate is ``x.sum()`` divided by ``x.count()`` (the number of non-null
    entries). No confidence scaling is applied here: this version is used to
    inspect the unscaled per-group rate.

    Parameters
    ----------
    x : object exposing ``sum()`` and ``count()`` (e.g. a pandas Series of
        the group's ``is_attributed`` values).

    Returns
    -------
    float
        ``x.sum() / x.count()``.
    """
    # The confidence term used to be computed here and then discarded. It
    # read the global `log_group`, so the function failed on a fresh kernel
    # unless another cell had been run first.
    return x.sum() / float(x.count())

# Apply rate_calculation to each app's `is_attributed` values and display the resulting Series.
a['is_attributed'].apply(rate_calculation)


app
0      0.461538
1      0.000343
2      0.000262
3      0.000446
4      0.000000
5      0.010891
6      0.000098
8      0.001288
9      0.000848
10     0.010519
11     0.001633
12     0.000134
13     0.000068
14     0.000473
15     0.000210
16     0.473684
17     0.000973
18     0.000376
19     0.184633
20     0.001628
21     0.000148
22     0.000718
23     0.000000
24     0.000374
25     0.000120
26     0.000720
27     0.000389
28     0.000195
29     0.122836
32     0.003595
         ...   
286    0.000000
292    0.000000
294    0.000000
299    0.000000
302    0.000000
303    0.000000
305    0.000000
315    0.000000
322    0.000000
326    0.000000
347    0.000000
361    1.000000
363    0.000000
372    0.000000
381    0.000000
407    0.000000
425    0.000000
469    0.000000
481    0.058824
502    0.000000
536    0.000000
538    0.000000
541    0.000000
549    0.000000
551    0.000000
556    0.000000
561    0.000000
563    0.000000
610    0.000000
645    0.000000
Name: is_attributed,