In [2]:
import pandas as pd

## Model

In [3]:
def clicksPerAd(clicks, group_by=['ad_id'], alpha=10):
    """Compute a smoothed click rate for every combination of `group_by` keys.

    For each group the rate is

        (alpha * prior + clicked) / (alpha + viewed)

    where `clicked` is the sum of the 'clicked' column within the group,
    `viewed` is the number of rows in the group, and `prior` is the mean of
    'clicked' over the rows sharing the group's secondary keys
    (`group_by[1:]`), or over all of `clicks` when there are no secondary keys.

    Parameters
    ----------
    clicks : pandas.DataFrame
        Must contain a 'clicked' column and every column named in `group_by`.
    group_by : sequence of str
        Grouping columns. The first entry is the primary key; any remaining
        entries define the segments over which the prior is computed.
    alpha : number
        Weight of the prior, expressed as a number of pseudo-views.

    Returns
    -------
    pandas.DataFrame
        One row per group. With secondary keys the columns are the `group_by`
        columns plus 'clicksPerShows'. With a single key the intermediate
        'clicked' and 'viewed' columns are kept as well.
    """
    # Work on a copy so the (mutable) default argument is never shared/mutated.
    group_by = list(group_by)

    # Group once and reuse the grouping for both aggregates.
    per_group = clicks.groupby(group_by)['clicked']
    res = pd.DataFrame({'clicked': per_group.sum(),
                        'viewed': per_group.size()}).reset_index()

    by = group_by[1:]
    if by:
        # Prior = mean click rate of the segment defined by the secondary keys.
        segment_means = pd.DataFrame(
            {'mean': clicks.groupby(by)['clicked'].mean()}).reset_index()
        res = res.merge(segment_means, on=by)
        res['clicksPerShows'] = (alpha * res['mean'] + res['clicked']) / (alpha + res['viewed'])
        res = res.drop(['clicked', 'viewed', 'mean'], axis=1)
    else:
        # No secondary keys: the prior is the global mean click rate.
        res_mean = clicks['clicked'].mean()
        res['clicksPerShows'] = (alpha * res_mean + res['clicked']) / (alpha + res['viewed'])

    return res

In [4]:
def join(series):
    """Return the items of `series`, each converted with str(), separated by single spaces."""
    return ' '.join(str(item) for item in series)

In [5]:
def model_prediction(clicksPerAd_train, test, on=['ad_id']):
    """Rank the ads of every display by their trained 'clicksPerShows' score.

    Parameters
    ----------
    clicksPerAd_train : pandas.DataFrame
        Trained scores: the `on` columns plus a 'clicksPerShows' column.
    test : pandas.DataFrame
        Rows to score; must contain 'display_id' and every column in `on`.
    on : sequence of str
        Join keys. `on[1:]` are the secondary keys used for the fallback score.

    Returns
    -------
    pandas.DataFrame
        Columns 'display_id' and 'ad_id', one row per display in ascending
        'display_id' order. 'ad_id' is a space-separated string of the
        display's ads ordered by descending score.

    Notes
    -----
    Rows whose `on` keys were not seen in training receive a fallback score:
    the mean trained score of their secondary-key segment, or the overall mean
    trained score when `on` has no secondary keys. Rows for which no fallback
    exists keep a NaN score, which sort_values places last by default.
    """
    on = list(on)
    by = on[1:]

    prediction = pd.merge(test, clicksPerAd_train, how='left', on=on)

    unseen = prediction['clicksPerShows'].isnull()
    if unseen.any():
        if by:
            # Fallback: average trained score within the secondary-key segment.
            segment_means = pd.DataFrame(
                {'clicksPerShows': clicksPerAd_train.groupby(by)['clicksPerShows'].mean()}
            ).reset_index()
            backfilled = prediction[unseen].drop(['clicksPerShows'], axis=1)
            backfilled = backfilled.merge(segment_means, how='left', on=by)
        else:
            # No secondary keys: grouping by an empty key list would raise,
            # so fall back to the overall mean trained score instead.
            backfilled = prediction[unseen].copy()
            backfilled['clicksPerShows'] = clicksPerAd_train['clicksPerShows'].mean()
        prediction = pd.concat([prediction[~unseen], backfilled])

    # Descending on both keys; the groupby below re-sorts the displays in
    # ascending order while keeping the within-display (score-descending) order.
    prediction = prediction.sort_values(by=['display_id', 'clicksPerShows'], ascending=False)
    return pd.DataFrame({'ad_id': prediction.groupby('display_id')['ad_id'].apply(join)}).reset_index()

In [6]:
%run 'metrics.ipynb'
def model_evaluation(actual, prediction):
    """Score `prediction` against `actual` by passing their 'ad_id' columns to mapk with k=12.

    `mapk` is not defined in this notebook's visible cells; presumably it is
    provided by the `%run 'metrics.ipynb'` line above this function.
    """
    true_ads = actual['ad_id']
    ranked_ads = prediction['ad_id']
    return mapk(true_ads, ranked_ads, k=12)

In [7]:
# Peek at the first rows of the training sample to see which columns it has.
pd.read_csv("../generated/final/clicks_train_sample.csv", nrows=10).head()

Unnamed: 0,display_id,document_id,timestamp,traffic_source,platform,day,hour,geo,ad_id,clicked,adsPerDisplay
0,8,1330329,638,1.0,2.0,1,7,2765,95724,0,4
1,8,1330329,638,1.0,2.0,1,7,2765,175694,0,4
2,8,1330329,638,1.0,2.0,1,7,2765,280430,1,4
3,8,1330329,638,1.0,2.0,1,7,2765,329774,0,4
4,2657,1271490,185255,1.0,2.0,1,7,2765,70081,0,4


In [8]:
# Keys of the click-rate table: the ad itself plus the 'adsPerDisplay' column.
group_by=['ad_id', 'adsPerDisplay']
# Columns read from the click CSVs. 'clicked' is kept last so it can be sliced
# off with usecols[:-1] when loading the unlabeled test file.
usecols = ['display_id', 'ad_id', 'adsPerDisplay', 'clicked']

## Validation

In [None]:
# Load only the columns listed in `usecols` from the training sample.
clicks_train_sample = pd.read_csv("../generated/final/clicks_train_sample.csv", usecols = usecols)
# Show the non-null count of each loaded column.
clicks_train_sample.count()

In [None]:
# Fit the smoothed click rates on the training sample.
clicksPerAd_train = clicksPerAd(clicks_train_sample,  group_by, alpha=10)
clicksPerAd_train.count()

In [None]:
# Load the held-out sample; it includes 'clicked' so it can serve as ground truth.
clicks_test_sample = pd.read_csv("../generated/final/clicks_test_sample.csv", usecols = usecols)
clicks_test_sample.count()

In [None]:
# Ground truth: the clicked rows of the held-out sample, ordered by display_id.
# ad_id is cast to str, presumably to match the string ad lists that
# model_prediction produces (confirm against mapk in metrics.ipynb).
validation = clicks_test_sample[clicks_test_sample['clicked']==1].astype({'ad_id':str}).sort_values('display_id')
validation.count()

In [None]:
# Inspect the first ground-truth rows.
validation.head()

In [None]:
# Rank the ads of every display in the held-out sample.
prediction = model_prediction(clicksPerAd_train, clicks_test_sample, on=group_by)
prediction.head()

In [None]:
# Number of displays that received a prediction.
prediction.count()

In [None]:
# Report the validation score. Written as a single formatted string so the
# line prints the same text under both Python 2 and Python 3.
print('clicksPerShows : %s' % (model_evaluation(validation, prediction),))

## Training

In [8]:
# Echo the configuration used for training (single-argument print calls behave
# identically under Python 2 and Python 3).
print(usecols)
print(group_by)

['display_id', 'ad_id', 'adsPerDisplay', 'clicked']
['ad_id', 'adsPerDisplay']


In [9]:
# Load the full training set, restricted to the columns in `usecols`.
events_clicks_train = pd.read_csv("../generated/final/events_clicks_train.csv", usecols=usecols)
events_clicks_train.count()

display_id       87141731
ad_id            87141731
clicked          87141731
adsPerDisplay    87141731
dtype: int64

In [10]:
# Fit the smoothed click rates on the full training set and save them to CSV;
# the Predicting section reads this file back instead of refitting.
clicksPerAd_train = clicksPerAd(events_clicks_train,  group_by=group_by ,alpha=10)
clicksPerAd_train.to_csv("../generated/solutions/6_clicksPerAd_train.csv", index=False)
clicksPerAd_train.count()

ad_id             1570987
adsPerDisplay     1570987
clicksPerShows    1570987
dtype: int64

## Predicting

In [9]:
# Echo the configuration used for prediction (single-argument print calls
# behave identically under Python 2 and Python 3).
print(usecols)
print(group_by)

['display_id', 'ad_id', 'adsPerDisplay', 'clicked']
['ad_id', 'adsPerDisplay']


In [10]:
# Reload the click rates written by the Training section.
clicksPerAd_train = pd.read_csv("../generated/solutions/6_clicksPerAd_train.csv")

In [11]:
# Load the test set; usecols[:-1] leaves out 'clicked', the last entry of `usecols`.
clicks_test = pd.read_csv("../generated/final/events_clicks_test.csv", usecols=usecols[:-1])
clicks_test.count()

display_id       32225162
ad_id            32225162
adsPerDisplay    32225162
dtype: int64

In [12]:
# Rank the ads of every test display using the trained click rates.
prediction = model_prediction(clicksPerAd_train, clicks_test, on=group_by)
prediction.head()

Unnamed: 0,display_id,ad_id
0,16874594,170392 172888 162754 150083 66758 180797
1,16874595,8846 143982 30609
2,16874596,289122 132820 289915 11430 173005 288385 57197...
3,16874597,285834 305790 143981 155945 308836 180965 1820...
4,16874598,145937 335632 67292 250082


In [13]:
# Number of test displays that received a prediction.
prediction.count()

display_id    6245533
ad_id         6245533
dtype: int64

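Check that the predictions are sorted by ascending `display_id`. The cell below prints the first out-of-order id, or nothing if the order is correct: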

In [14]:
# Walk the display ids in output order and print the first one that is smaller
# than its predecessor; no output means the ids are in non-decreasing order.
ids = prediction.display_id
prev = 0
for nxt in ids:
    if nxt < prev:
        # print() with one argument works under both Python 2 and Python 3.
        print(nxt)
        break
    prev = nxt

## Submission

In [15]:
# Write the submission file; index=False leaves the row index out of the CSV.
prediction.to_csv("../submissions/solution_6.csv", index=False)