In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

## Model

In [2]:
def clicksPerAd(clicks, group_by=['ad_id', 'adsPerDisplay'], alpha=10, beta=0.1):
    """Compute a smoothed click rate for every `group_by` combination.

    The first entry of `group_by` is the primary key (the ad); any remaining
    entries are context columns. For each combination the raw rate
    clicked / viewed is shrunk towards prior rates:

    * with context columns:
      (beta * global_mean + alpha * context_mean + clicked)
      / (beta + alpha + viewed)
    * without context columns:
      (alpha * global_mean + clicked) / (alpha + viewed)

    Parameters
    ----------
    clicks : pandas.DataFrame
        One row per shown ad. Must contain the `group_by` columns and a
        'clicked' column (summed as the click count, averaged as the rate).
    group_by : sequence of str
        Column names to aggregate on; lists and tuples are both accepted.
    alpha : number
        Weight of the context mean (or of the global mean when there are no
        context columns).
    beta : number
        Weight of the global mean when context columns are present.

    Returns
    -------
    pandas.DataFrame
        With context columns: the `group_by` columns plus 'clicksPerShows'.
        Without context columns: the key column, 'clicked', 'viewed' and
        'clicksPerShows' (the count columns are kept in this branch, as in
        the original implementation).
    """
    group_by = list(group_by)
    by = group_by[1:]

    # Group once and reuse the grouped column for both aggregates.
    grouped_clicks = clicks.groupby(group_by)['clicked']
    res = pd.DataFrame({'clicked': grouped_clicks.sum(),
                        'viewed': grouped_clicks.size()}).reset_index()

    res_mean = clicks['clicked'].mean()

    # Truthiness test instead of the Python-2-only `by<>[]`.
    if by:
        context_means = pd.DataFrame({'mean': clicks.groupby(by)['clicked'].mean()}).reset_index()
        res = res.merge(context_means, on=by)
        res['clicksPerShows'] = (beta*res_mean + alpha*res['mean'] + res['clicked']) / (beta + alpha + res['viewed'])
        res = res.drop(['clicked', 'viewed', 'mean'], axis=1)
    else:
        res['clicksPerShows'] = (alpha*res_mean + res['clicked']) / (alpha + res['viewed'])

    return res

In [3]:
def join(series):
    """Return the items of `series`, each converted with str(), joined by single spaces."""
    return ' '.join(str(item) for item in series)

In [32]:
def model_prediction(clicksPerAd_train, test, on=['ad_id', 'adsPerDisplay']):
    """Rank the ads of every display by their trained 'clicksPerShows' score.

    Parameters
    ----------
    clicksPerAd_train : pandas.DataFrame
        Score table containing the `on` columns and 'clicksPerShows'.
    test : pandas.DataFrame
        Rows to score; must contain 'display_id' and the `on` columns.
    on : sequence of str
        Join keys. The first is the primary key; the remaining ones are
        context columns used for the back-off described below.

    Returns
    -------
    pandas.DataFrame
        Columns 'display_id' and 'ad_id', where 'ad_id' is a space-separated
        string of the display's ads ordered from highest to lowest score.

    Notes
    -----
    Rows with no match in `clicksPerAd_train` back off to the mean score of
    their context columns. When `on` has no context columns they back off to
    the global mean score; previously this case raised, because pandas
    refuses to group by an empty key list. Rows whose back-off is also
    missing keep NaN, which `sort_values` places last within the display by
    default.
    """
    on = list(on)
    by = on[1:]

    prediction = pd.merge(test, clicksPerAd_train, how='left', on=on)

    # Rows whose key combination never occurred in the training table.
    unseen = pd.isnull(prediction['clicksPerShows'])
    backoff = prediction[unseen].drop(['clicksPerShows'], axis=1)

    if by:
        context_means = pd.DataFrame(
            {'clicksPerShows': clicksPerAd_train.groupby(by)['clicksPerShows'].mean()}
        ).reset_index()
        backoff = backoff.merge(context_means, how='left', on=by)
    else:
        # No context columns to group by: use the overall mean score.
        backoff = backoff.assign(clicksPerShows=clicksPerAd_train['clicksPerShows'].mean())

    prediction = pd.concat([prediction[~unseen], backoff])

    prediction = prediction.sort_values(by=['display_id', 'clicksPerShows'], ascending=[True, False])

    return pd.DataFrame({'ad_id': prediction.groupby('display_id')['ad_id'].apply(join)}).reset_index()

In [5]:
%run 'metrics.ipynb'
def model_evaluation(actual, prediction, k=12):
    """Score `prediction` against `actual` with `mapk` from metrics.ipynb.

    Parameters
    ----------
    actual : pandas.DataFrame
        Ground truth; only its 'ad_id' column is used.
    prediction : pandas.DataFrame
        Predictions; only its 'ad_id' column is used.
    k : int
        Cut-off forwarded to `mapk` (defaults to 12, the value previously
        hard-coded here).

    Returns
    -------
    Whatever `mapk` returns.

    NOTE(review): the two 'ad_id' columns are passed as-is, so `actual` and
    `prediction` presumably have to be aligned row by row (same displays in
    the same order) -- confirm against the `mapk` definition.
    """
    return mapk(actual['ad_id'], prediction['ad_id'], k=k)

In [6]:
# Peek at the training sample to see which columns it provides.
# Only the first 10 rows are read; head() then displays the first five.
pd.read_csv("../generated/final/clicks_train_sample.csv", nrows=10).head()

Unnamed: 0,display_id,document_id,timestamp,traffic_source,platform,day,hour,geo,ad_id,clicked,adsPerDisplay,clicksPerShows
0,8,1330329,638,1.0,2.0,1,7,2765,95724,0,4,0.034785
1,8,1330329,638,1.0,2.0,1,7,2765,175694,0,4,0.202327
2,8,1330329,638,1.0,2.0,1,7,2765,280430,1,4,0.256853
3,8,1330329,638,1.0,2.0,1,7,2765,329774,0,4,0.052426
4,2657,1271490,185255,1.0,2.0,1,7,2765,70081,0,4,0.150145


In [7]:
# Aggregation keys for the click statistics: the ad first, then its context column.
group_by = ['ad_id', 'adsPerDisplay']

# Columns loaded from the CSV samples: the display key, the grouping keys and the label.
usecols = ['display_id'] + group_by + ['clicked']

## Validation

In [8]:
# Load only the columns listed in `usecols` from the training sample,
# then display the non-null count of each column.
clicks_train_sample = pd.read_csv(
    "../generated/final/clicks_train_sample.csv",
    usecols=usecols,
)
clicks_train_sample.count()

display_id       65355241
ad_id            65355241
clicked          65355241
adsPerDisplay    65355241
dtype: int64

In [9]:
# Number of rows (i.e. times shown) per ad in the training sample.
res = (
    clicks_train_sample
    .groupby(['ad_id'])['clicked']
    .size()
    .reset_index(name='viewed')
)
res.count()

ad_id     447836
viewed    447836
dtype: int64

In [10]:
# Keep only the ads shown more than 30 times; the count column is dropped
# afterwards so that `res` is a plain list of retained ad ids.
shown_often = res['viewed'] > 30
res = res.loc[shown_often].drop('viewed', axis=1)
res.count()

ad_id    70536
dtype: int64

In [11]:
# Restrict the training sample to the retained ads (default inner join on ad_id).
clicks_train_sample = pd.merge(clicks_train_sample, res, on=['ad_id'])
clicks_train_sample.count()

display_id       63025599
ad_id            63025599
clicked          63025599
adsPerDisplay    63025599
dtype: int64

In [57]:
# Build the smoothed click-rate table from the filtered training sample.
clicksPerAd_train = clicksPerAd(
    clicks_train_sample,
    group_by=group_by,
    alpha=10,
    beta=0.6,
)
clicksPerAd_train.count()

ad_id             485307
adsPerDisplay     485307
clicksPerShows    485307
dtype: int64

In [13]:
# Load the same columns from the held-out test sample,
# then display the non-null count of each column.
clicks_test_sample = pd.read_csv(
    "../generated/final/clicks_test_sample.csv",
    usecols=usecols,
)
clicks_test_sample.count()

display_id       21786490
ad_id            21786490
clicked          21786490
adsPerDisplay    21786490
dtype: int64

In [14]:
# Ground truth for scoring: the clicked rows of the test sample, with ad ids
# converted to strings and rows ordered by display.
clicked_rows = clicks_test_sample['clicked'] == 1
validation = (
    clicks_test_sample.loc[clicked_rows]
    .astype({'ad_id': str})
    .sort_values('display_id')
)
validation.count()

display_id       4220025
ad_id            4220025
clicked          4220025
adsPerDisplay    4220025
dtype: int64

In [15]:
# Show the first ground-truth rows (the clicked ad of each display).
validation.head()

Unnamed: 0,display_id,ad_id,clicked,adsPerDisplay
18142714,1,144739,1,6
14412118,7,105766,1,3
11270842,9,140940,1,4
377269,14,224171,1,4
18233315,26,152193,1,2


In [58]:
# Rank the ads of every test display using the trained click-rate table.
prediction = model_prediction(
    clicksPerAd_train,
    test=clicks_test_sample,
    on=group_by,
)
prediction.head()

0


Unnamed: 0,display_id,ad_id
0,1,279295 144739 296965 139684 42337 156824
1,7,300808 215967 105766
2,9,151028 19959 140940 104208
3,14,98270 143467 224171 288396
4,26,152193 285992


In [59]:
# Sanity check: number of predicted displays, to compare with the row count of `validation`.
prediction.count()

display_id    4220025
ad_id         4220025
dtype: int64

In [60]:
# print() call with str.format: emits the same text on Python 2 and Python 3
# (the original print statement is a syntax error on Python 3).
print('clicksPerShows : {}'.format(model_evaluation(validation, prediction)))  # 0.649366311611 (presumably the score of an earlier run)

clicksPerShows : 0.651356037895


## Training

## Predicting

Check order:

## Submission