In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

In [2]:
%run 'medcouple.ipynb'

## Model

In [None]:
def outliers_skewed(x):
    """Return the ``(lower, upper)`` outlier fences of ``x``, adjusted for skewness.

    The fences start from the usual ``1.5 * IQR`` whiskers around the first and
    third quartiles and are stretched/shrunk by ``exp(k * mc)``, where ``mc`` is
    whatever ``medcouple_1d(x)`` returns (helper loaded from ``medcouple.ipynb``;
    presumably the medcouple skewness statistic -- TODO confirm).

    Parameters
    ----------
    x : object exposing ``quantile`` (e.g. a ``pandas.Series``)
        The sample to analyse.

    Returns
    -------
    tuple
        ``(lower, upper)`` fence values.
    """
    q1, q3 = x.quantile(0.25), x.quantile(0.75)
    base_whisker = 1.5 * (q3 - q1)
    mc = medcouple_1d(x)

    # Exponent coefficients for the (upper, lower) whisker; they swap roles
    # depending on the sign of the skewness statistic.
    k_upper, k_lower = (3, -4) if mc > 0 else (4, -3)

    lower = q1 - base_whisker * np.exp(k_lower * mc)
    upper = q3 + base_whisker * np.exp(k_upper * mc)
    return lower, upper

In [None]:
def outlier_skewed_upper(x):
    """Return only the upper skewness-adjusted outlier fence of ``x``.

    Computes ``Q3 + 1.5 * IQR * exp(k * mc)`` with ``k = 3`` when ``mc > 0`` and
    ``k = 4`` otherwise, where ``mc = medcouple_1d(x)`` (helper loaded from
    ``medcouple.ipynb``; not shown here).

    Parameters
    ----------
    x : object exposing ``quantile`` (e.g. a ``pandas.Series``)
        The sample to analyse.

    Returns
    -------
    The upper fence value.
    """
    q1 = x.quantile(0.25)
    q3 = x.quantile(0.75)
    mc = medcouple_1d(x)
    coeff = 3 if mc > 0 else 4
    return q3 + 1.5 * (q3 - q1) * np.exp(coeff * mc)

In [None]:
def mean_outliers_skewed(x):
    """Mean of the values of ``x`` lying inside the fences from ``outliers_skewed``.

    Both fences are inclusive: a value equal to a fence is kept.
    """
    low, high = outliers_skewed(x)
    low = float(low)
    high = float(high)
    inside = (x >= low) & (x <= high)
    return x[inside].mean()

In [None]:
# Smoke check: display the (lower, upper) fences for a small hand-written sample.
outliers_skewed(pd.Series([20, 16, 15, 15, 18, 22, 13]))

In [3]:
def clicksPerAd(clicks, group_by=['ad_id', 'adsPerDisplay'], alpha=10, beta=0.1):
    """Smoothed click-through rate per ``group_by`` combination.

    For every combination of the ``group_by`` columns the function counts the
    rows (``viewed``) and sums the ``clicked`` column, then shrinks the raw
    ratio towards prior means:

    * more than one key: the prior is a mix of the overall mean of ``clicked``
      (weight ``beta``) and the mean of ``clicked`` per ``group_by[1:]``
      combination (weight ``alpha``)::

          (beta*overall + alpha*group_mean + clicked) / (beta + alpha + viewed)

    * a single key: only the overall mean is used, with weight ``alpha``::

          (alpha*overall + clicked) / (alpha + viewed)

    Parameters
    ----------
    clicks : pandas.DataFrame
        Must contain the ``group_by`` columns and a ``clicked`` column.
    group_by : sequence of str
        Grouping columns; the first one is the finest key, the remaining ones
        define the group used for the prior mean. The default list is never
        mutated.
    alpha, beta : float
        Prior weights (pseudo-counts) as used in the formulas above.

    Returns
    -------
    pandas.DataFrame
        With several keys: the ``group_by`` columns plus ``clicksPerShows``.
        With a single key the intermediate ``clicked`` and ``viewed`` columns
        are also kept (unchanged historical behaviour).
    """
    # Work on a private list so tuples are accepted and the default is never shared.
    group_by = list(group_by)

    # Build the groupby once and reuse it for both aggregates.
    grouped = clicks.groupby(group_by)['clicked']
    res = pd.DataFrame({'clicked': grouped.sum(),
                        'viewed': grouped.size()}).reset_index()

    res_mean = clicks['clicked'].mean()

    by = group_by[1:]
    if by:  # truthiness works on Python 2 and 3 (the old `<>` operator is Python-2-only)
        by_means = clicks.groupby(by)['clicked'].mean().reset_index(name='mean')
        res = res.merge(by_means, on=by)
        res['clicksPerShows'] = ((beta*res_mean + alpha*res['mean'] + res['clicked'])
                                 / (beta + alpha + res['viewed']))
        res = res.drop(['clicked', 'viewed', 'mean'], axis=1)
    else:
        res['clicksPerShows'] = (alpha*res_mean + res['clicked']) / (alpha + res['viewed'])

    return res

In [4]:
def join(series):
    """Render every element of ``series`` with ``str`` and glue them with single spaces."""
    return ' '.join(str(item) for item in series)

In [5]:
def model_prediction(clicksPerAd_train, test, on=['ad_id', 'adsPerDisplay']):
    """Rank the ads of every display by their trained ``clicksPerShows`` score.

    Parameters
    ----------
    clicksPerAd_train : pandas.DataFrame
        Must contain the ``on`` columns and a ``clicksPerShows`` column.
    test : pandas.DataFrame
        Must contain ``display_id``, ``ad_id`` and the ``on`` columns.
    on : sequence of str
        Join keys. Test rows with no match in ``clicksPerAd_train`` receive the
        mean ``clicksPerShows`` of their ``on[1:]`` group; when ``on`` holds a
        single key the overall mean of ``clicksPerShows`` is used instead
        (previously this case raised because of an empty ``groupby``).
        The default list is never mutated.

    Returns
    -------
    pandas.DataFrame
        One row per ``display_id`` with an ``ad_id`` column holding the
        space-separated ad ids, highest score first.
    """
    on = list(on)
    prediction = pd.merge(test, clicksPerAd_train, how='left', on=on)

    # Rows whose key combination was never seen in training.
    missing = prediction['clicksPerShows'].isnull()
    unseen = prediction[missing].drop(['clicksPerShows'], axis=1)

    by = on[1:]
    if by:
        fallback = clicksPerAd_train.groupby(by)['clicksPerShows'].mean().reset_index()
        unseen = unseen.merge(fallback, how='left', on=by)
    else:
        # No coarser group to fall back on: use the overall mean score.
        unseen = unseen.assign(clicksPerShows=clicksPerAd_train['clicksPerShows'].mean())

    # Matched rows stay ahead of back-filled ones, as before, so the input
    # order seen by the sort is unchanged.
    prediction = pd.concat([prediction[~missing], unseen])

    prediction = prediction.sort_values(by=['display_id', 'clicksPerShows'], ascending=[True, False])
    ranked = prediction.groupby('display_id')['ad_id'].apply(join)
    return pd.DataFrame({'ad_id': ranked}).reset_index()

In [6]:
%run 'metrics.ipynb'
def model_evaluation(actual, prediction):
    """Score ``prediction`` against ``actual`` with ``mapk`` at ``k=12``.

    Only the ``ad_id`` entry of each argument is used. ``mapk`` is not defined
    in this notebook; it is expected to come from ``metrics.ipynb`` (presumably
    mean average precision at k -- TODO confirm its input format and how rows
    of the two arguments are aligned).
    """
    truth = actual['ad_id']
    ranked = prediction['ad_id']
    return mapk(truth, ranked, k=12)

In [7]:
# Peek at the training sample: only the first 10 rows are parsed, then head() shows 5 of them.
pd.read_csv("../generated/final/clicks_train_sample.csv", nrows=10).head()

Unnamed: 0,display_id,document_id,timestamp,traffic_source,platform,day,hour,geo,ad_id,clicked,adsPerDisplay,clicksPerShows
0,8,1330329,638,1.0,2.0,1,7,2765,95724,0,4,0.034785
1,8,1330329,638,1.0,2.0,1,7,2765,175694,0,4,0.202327
2,8,1330329,638,1.0,2.0,1,7,2765,280430,1,4,0.256853
3,8,1330329,638,1.0,2.0,1,7,2765,329774,0,4,0.052426
4,2657,1271490,185255,1.0,2.0,1,7,2765,70081,0,4,0.150145


In [8]:
# Keys passed to clicksPerAd(...) for aggregation and to model_prediction(...) as join keys.
# Order matters: everything after the first key defines the fallback/prior group.
group_by=['ad_id', 'adsPerDisplay']
# Columns loaded from the click CSVs below ('clicked' is last so usecols[:-1] drops it).
usecols = ['display_id', 'ad_id', 'adsPerDisplay', 'clicked']

## Validation

In [9]:
# Load the training split used for validation, restricted to the columns in `usecols`.
clicks_train_sample = pd.read_csv("../generated/final/clicks_train_sample.csv", usecols = usecols)
# Non-null count per column, as a quick size/sanity check.
clicks_train_sample.count()

display_id       65355241
ad_id            65355241
clicked          65355241
adsPerDisplay    65355241
dtype: int64

In [None]:
# Count how many times each ad was shown, per (adsPerDisplay, ad_id) pair.
#
# The key list lives in its own name: rebinding the notebook-wide `group_by`
# here (with the keys in the opposite order) would silently change what the
# later clicksPerAd(...) / model_prediction(...) calls aggregate and join on
# when the notebook is run top to bottom.
clicks = clicks_train_sample
views_group_by = ['adsPerDisplay', 'ad_id']

res = clicks.groupby(views_group_by)['clicked'].size().reset_index(name='viewed')

res.head()

In [None]:
# Upper outlier fence of the 'viewed' distribution for every adsPerDisplay value in 2..12
# (presumably the full range present in the data -- TODO confirm).
#
# The frame is built in one step from (adsPerDisplay, upper) records instead of
# being grown row by row with .loc on an empty frame, so 'adsPerDisplay' gets a
# plain integer dtype that matches the key it is merged against afterwards.
outliers = pd.DataFrame(
    [(adsPerDisplay,
      outlier_skewed_upper(res.loc[res['adsPerDisplay'] == adsPerDisplay, 'viewed']))
     for adsPerDisplay in range(2, 13)],
    columns=['adsPerDisplay', 'upper'])

outliers

In [None]:
# Attach the per-adsPerDisplay upper fence to every (adsPerDisplay, ad_id) row.
# NOTE(review): `res` is rebound here, so the cell is not idempotent -- re-running it
# once 'upper' is already present would yield suffixed duplicate columns.
res = res.merge(outliers, on='adsPerDisplay')
res.count()

In [None]:
# Inspect the merged frame (view counts next to their upper fence).
res.head()

In [None]:
# Keep only the pairs whose view count does not exceed the upper fence (fence value included).
res = res[res['viewed']<=res['upper']]
res.count()

In [None]:
# Only the surviving (adsPerDisplay, ad_id) keys are needed from here on.
# NOTE(review): in-place drop -- re-running this cell raises because the columns are already gone.
res.drop(['viewed', 'upper'], axis=1, inplace=True)
res.head()

In [None]:
# Default (inner) merge: training rows whose (adsPerDisplay, ad_id) pair was filtered
# out above are dropped from the sample.
clicks_train_sample = clicks_train_sample.merge(res, on=['adsPerDisplay', 'ad_id'])
clicks_train_sample.count()

In [10]:
# Preview the training sample that feeds the model.
clicks_train_sample.head()

Unnamed: 0,display_id,ad_id,clicked,adsPerDisplay
0,8,95724,0,4
1,8,175694,0,4
2,8,280430,1,4
3,8,329774,0,4
4,2657,70081,0,4


In [11]:
# Fit the smoothed click rates on the training sample.
# Uses whatever the notebook-level `group_by` currently holds, passed positionally.
clicksPerAd_train = clicksPerAd(clicks_train_sample,  group_by, alpha=10, beta=0.1)
clicksPerAd_train.count()

ad_id             1406219
adsPerDisplay     1406219
clicksPerShows    1406219
dtype: int64

In [12]:
# Load the held-out split used for validation (same columns as the training sample).
clicks_test_sample = pd.read_csv("../generated/final/clicks_test_sample.csv", usecols = usecols)
clicks_test_sample.count()

display_id       21786490
ad_id            21786490
clicked          21786490
adsPerDisplay    21786490
dtype: int64

In [13]:
# Ground truth: the clicked rows only, with ad_id cast to str and rows ordered by display_id.
# NOTE(review): presumably ordered so rows line up with `prediction` (one row per
# display_id) when scored -- confirm how mapk aligns its two inputs.
validation = clicks_test_sample[clicks_test_sample['clicked']==1].astype({'ad_id':str}).sort_values('display_id')
validation.count()

display_id       4220025
ad_id            4220025
clicked          4220025
adsPerDisplay    4220025
dtype: int64

In [14]:
# Preview the ground-truth rows.
validation.head()

Unnamed: 0,display_id,ad_id,clicked,adsPerDisplay
18142714,1,144739,1,6
14412118,7,105766,1,3
11270842,9,140940,1,4
377269,14,224171,1,4
18233315,26,152193,1,2


In [15]:
# Rank the ads of every held-out display with the rates fitted above.
prediction = model_prediction(clicksPerAd_train, clicks_test_sample, on=group_by)
prediction.head()

Unnamed: 0,display_id,ad_id
0,1,279295 144739 139684 42337 296965 156824
1,7,300808 215967 105766
2,9,151028 19959 140940 104208
3,14,98270 143467 224171 288396
4,26,152193 285992


In [16]:
# Row count of the prediction frame, for comparison with the validation row count.
prediction.count()

display_id    4220025
ad_id         4220025
dtype: int64

In [17]:
# Report the validation score (model_evaluation calls mapk with k=12).
# Written with str.format so the same line runs on Python 2 and Python 3.
# NOTE(review): an earlier inline note recorded 0.496702629674 here; which run
# produced it is not shown in this notebook.
print('clicksPerShows : {}'.format(model_evaluation(validation, prediction)))

clicksPerShows : 0.651788620997


## Training

In [None]:
# Show the configuration the training step is about to use.
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(usecols)
print(group_by)

In [None]:
# Load the full training data, restricted to the columns in `usecols`.
events_clicks_train = pd.read_csv("../generated/final/events_clicks_train.csv", usecols=usecols)
events_clicks_train.count()

In [None]:
# Fit the smoothed click rates on the full training data and persist them so the
# prediction step can be run from the saved CSV.
# NOTE(review): this rebinds `clicksPerAd_train`, replacing the validation-time fit.
clicksPerAd_train = clicksPerAd(events_clicks_train,  group_by=group_by ,alpha=10, beta=0.1)
clicksPerAd_train.to_csv("../generated/solutions/7_clicksPerAd_train.csv", index=False)
clicksPerAd_train.count()

## Predicting

In [None]:
# Show the configuration the prediction step is about to use.
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(usecols)
print(group_by)

In [None]:
# Reload the rates saved by the training step.
clicksPerAd_train = pd.read_csv("../generated/solutions/7_clicksPerAd_train.csv")

In [None]:
# Load the test data; usecols[:-1] leaves out the last entry of `usecols` ('clicked').
clicks_test = pd.read_csv("../generated/final/events_clicks_test.csv", usecols=usecols[:-1])
clicks_test.count()

In [None]:
# Rank the ads of every test display with the reloaded rates.
prediction = model_prediction(clicksPerAd_train, clicks_test, on=group_by)
prediction.head()

In [None]:
# Number of displays in the prediction frame.
prediction.count()

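Check that the predictions are ordered by `display_id` (the cell below prints the first out-of-order id, if any):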

In [None]:
# Walk the display ids in output order and print the first one that is smaller
# than its predecessor; nothing is printed when the ids never decrease.
# print(...) with one argument runs on both Python 2 and Python 3.
ids = prediction.display_id
prev = 0
for nxt in ids:
    if nxt < prev:
        print(nxt)
        break
    prev = nxt

## Submission

In [None]:
# Write the submission file (display_id, ad_id columns; the index is not written).
prediction.to_csv("../submissions/solution_7.csv", index=False)