In [6]:
import random as rnd
from numpy.random import * 
import pandas as pd

In [48]:
# ベースになるCTRの準備
ad_ctrs = {
    1: 0.10,
    2: 0.09,
    3: 0.08,
    4: 0.07,
    5: 0.06,
    6: 0.05,
    7: 0.04,
    8: 0.03,
    9: 0.02,
    10: 0.01,
}

adspot_ctrs = {
    1: -0.05,
    2: -0.04,
    3: -0.03,
    4: -0.02,
    5: -0.01,
    6: 0.00,
    7: 0.01,
    8: 0.02,
    9: 0.03,
    10: 0.04,
}


In [49]:
ad_ids = list(range(1, 11))
adspot_ids = list(range(1, 11))

## サンプルデータ生成

In [66]:
def create_log():
    ad_id = rnd.choice(ad_ids)
    adspot_id = rnd.choice(adspot_ids)
    ad_ctr = ad_ctrs[ad_id]
    adspot_ctr = adspot_ctrs[adspot_id]      
        
    random = normal(0, 0.001)
    real_ctr = (0.15 + ad_ctr + adspot_ctr + random) 
    
    is_clicked = binomial(n=1, p=real_ctr)

    return [ad_id, adspot_id, real_ctr, is_clicked]


def create_logs(n=1000):
    test_data = []
    for i in list(range(0, n)):
        tmp = create_log()
        test_data.append(tmp)

    return pd.DataFrame(test_data, columns=['ad_id', 'adspot_id', 'real_ctr', 'is_clicked'])

In [72]:
train_data = create_logs(50000)

In [76]:
train_data.to_csv('./train_data.csv')

In [28]:
import numpy as np
import pandas as pd
import pymc3 as pm
from scipy import stats
import json

In [74]:
# train_data = pd.read_csv(csv_file)
ad_id_data = pd.Categorical(train_data['ad_id'].values).codes
adspot_id_data = pd.Categorical(train_data['adspot_id'].values).codes
is_clicked_data = train_data['is_clicked'].values

with pm.Model() as logistic_model:
    alpha = pm.Uniform('alpha', lower=-4, upper=1)
    ad_coefs = pm.Normal('ad_coefs', mu=0, sd=1, shape=len(set(train_data['ad_id'].values)))
    adspot_coefs = pm.Normal('adspot_coefs', mu=0, sd=1, shape=len(set(train_data['adspot_id'].values)))
    p = pm.Deterministic('p', pm.math.sigmoid(alpha + ad_coefs[ad_id_data] + adspot_coefs[adspot_id_data]))
    yl = pm.Bernoulli('yl', p=p, observed=is_clicked_data)
    trace = pm.sample(draws=2000, chains=3)

summary = pm.summary(trace, varnames={'alpha', 'ad_coefs', 'adspot_coefs'})

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 2 jobs)
NUTS: [adspot_coefs, ad_coefs, alpha]
Sampling 3 chains: 100%|██████████| 7500/7500 [20:38<00:00,  1.83draws/s] 
The number of effective samples is smaller than 10% for some parameters.


In [71]:
pm.summary(trace[1500:], varnames={'alpha', 'ad_coefs', 'adspot_coefs'})

Unnamed: 0,mean,sd,mc_error,hpd_2.5,hpd_97.5,n_eff,Rhat
adspot_coefs__0,-0.261069,0.319327,0.012337,-0.898074,0.3407,544.991744,1.002531
adspot_coefs__1,-0.134497,0.318662,0.012367,-0.803505,0.431625,548.320775,1.002589
adspot_coefs__2,-0.048691,0.31888,0.012346,-0.685447,0.554981,549.328935,1.002189
adspot_coefs__3,-0.006378,0.317916,0.012314,-0.633018,0.602061,545.507174,1.002568
adspot_coefs__4,-0.097266,0.318043,0.012298,-0.738069,0.494483,549.972117,1.002572
adspot_coefs__5,0.084557,0.318472,0.012353,-0.536419,0.693866,542.725956,1.0024
adspot_coefs__6,0.082299,0.318672,0.012412,-0.553329,0.682979,541.365525,1.002542
adspot_coefs__7,0.165683,0.318229,0.012378,-0.503939,0.738421,546.554728,1.002528
adspot_coefs__8,0.189956,0.31725,0.012405,-0.436475,0.797254,540.175989,1.0026
adspot_coefs__9,0.204574,0.31834,0.012403,-0.417162,0.812208,540.968582,1.002642
