In [53]:
#前処理後のindex作るための処理
import os
import elasticsearch as es
import math
import numpy as np
from itertools import chain
import sys
import tensorflow as tf
import matplotlib.pyplot as plt

ES_HOST = "localhost"
escon = es.Elasticsearch(ES_HOST, port=9200)
GACRP_HOME=os.environ['GACRP_HOME']

# ESのデータ量依存設定値のグローバル変数化
# scrollのタイムアウト時間
es_s_time='2m'


In [54]:
indataStartYmd=20160801
indataEndYmd=20161031
forecastStartYmd=20161101
forecastEndYmd=20161231

In [55]:
# 元データ取得

def getInputData(startymd,endymd):
    es_size=10000
    body = {
      "_source": [
        "fullVisitorId","date",
        "channelGrouping","visitNumber","isMobile","country","hits","bounces",
        "newVisits","source","medium","isTrueDirect","transactionRevenue"
      ],
      "query": {
        "bool": {
          "must": [
            {
              "range": {
                "date": {
                  "gt": startymd,
                  "lt": endymd
                }
              }
            }
          ],
          "must_not": [],
          "should": []
        }
      },
      "from": 0,
      "size": 0,
      "sort": [],
      "aggs": {}
    }
  
    res=escon.search(index='gacrp_index',body=body,scroll=es_s_time,size=es_size)
    es_s_id=res['_scroll_id']
    es_s_size=len(res["hits"]["hits"])
    
    return(res,es_s_id,es_s_size)


In [56]:
#fullVisitorIdごとに、指定した期間のrevenueを取得
def getRevenueByFullVisitorId(fullVisitorId,startYmd,endYmd):
    es_size=10000
    body = {
        "size":0,
        "aggs":{
            "agg_revenue":{
                "sum":{"field":"transactionRevenue"}
            }
        },
        "query":{
            "bool":{
                "must":[
                    {"term":{"fullVisitorId.keyword":fullVisitorId}},
                    {"range": {"date": {
                                "gt": startYmd,"lt": endYmd
                    }}}
                ]
            }
        }
    }
    res=escon.search(index='gacrp_index',body=body)
    revenue=math.log(int(res['aggregations']['agg_revenue']['value'])+1)
    #print(revenue)
    return(revenue)

In [57]:
#oneHotベクトル化
def es2oneHot(res,data):
    for i in range(len(res["hits"]["hits"])):
        row=res["hits"]["hits"][i]["_source"]
        rec=[]

        col_date=row['date']
        col_fullVisitorId=row['fullVisitorId']
        
        if row['channelGrouping']=='Display':
            col_channelGrouping=1
        elif row['channelGrouping']=='Social':
            col_channelGrouping=2
        else:
            col_channelGrouping=0
        oneHot_col_channelGrouping=np.eye(2+1)[col_channelGrouping]
        
        #visitNumberは最大を5で打ち切って0～1にnormalize。
        col_visitNumber=min(int(row['visitNumber']),5)/5.0 

        if row['isMobile']=="TRUE":
            col_isMobile=1
        else:
            col_isMobile=0

        if row['country']=="United States":
            col_country =1
        elif row['country']=="Canada":
            col_country =2
        elif row['country']=="Japan":
            col_country=3
        else:
            col_country=0
        oneHot_col_country=np.eye(3+1)[col_country]
    
        col_hits=math.log10(int(row['hits'])*1.0)
    
        if row['bounces']=="1":
            col_bounces=1
        else:
            col_bounces=0

        if row['newVisits']== "1":
            col_newVisits=1
        else:
            col_newVisits=0
    
        if row['source']=="mail.googleplex.com" or row['source']=="dfa":
            col_source =1
        elif row['source']=="(direct)":
            col_source=2
        else:
            col_source=0
        oneHot_col_source=np.eye(2+1)[col_source]
        
        if row['medium']=="cpm":
            col_medium=1
        else:
            col_medium=0
        oneHot_col_medium=np.eye(1+1)[col_medium]
    
        if row['isTrueDirect']=="1":
            col_isTrueDirect=1
        else:
            col_isTrueDirect=0
    
        #revenue = math.log(int(row['transactionRevenue'])+1)
    
        l=[oneHot_col_channelGrouping,[col_visitNumber],[col_isMobile],oneHot_col_country,[col_hits],[col_bounces],[col_newVisits],oneHot_col_source,oneHot_col_medium,[col_isTrueDirect]]
        data.append([col_fullVisitorId,list(chain.from_iterable(l))])
        #data.append([col_fullVisitorId,l])
        #ans.append(revenue)
    
    return(data)

In [58]:
rowdata=[]
ans = []

res,es_s_id,es_s_size=getInputData(indataStartYmd,indataEndYmd)
while(es_s_size>0):
    rowdata=es2oneHot(res,rowdata)
    res=escon.scroll(scroll_id=es_s_id,scroll=es_s_time)
    es_s_id=res['_scroll_id']
    es_s_size=len(res["hits"]["hits"])

    print(len(rowdata))

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
237728


In [59]:
#rowdataを対象期間のfullVisitorIdごとに集約して予測期間のtransactionRevenueを当てる
rowdata=sorted(rowdata,key=lambda x:x[0])
print("ソート完了")
key_fullVisitorId=rowdata[0][0]
sumdata=[]

tmp_indata=[]
tmp_indata.append(rowdata[0][1])
tmp_cnt=1
for i in range(1,len(rowdata)):
    if i %10000==0:
        print("集約処理実行中：",i)
    if key_fullVisitorId==rowdata[i][0]:
        tmp_indata.append(rowdata[i][1])
        tmp_cnt+=1
    else:
        #列方向に平均をとってappend
        #print(tmp_indata)
        sumdata.append(np.average(tmp_indata,axis=1))
        ans.append(getRevenueByFullVisitorId(key_fullVisitorId,forecastStartYmd,forecastEndYmd))
        key_fullVisitorId=rowdata[i][0]
        tmp_cnt=1

ans.append(getRevenueByFullVisitorId(key_fullVisitorId,forecastStartYmd,forecastEndYmd))    

print(len(sumdata),len(ans))

ソート完了
('\xe9\x9b\x86\xe7\xb4\x84\xe5\x87\xa6\xe7\x90\x86\xe5\xae\x9f\xe8\xa1\x8c\xe4\xb8\xad\xef\xbc\x9a', 10000)
('\xe9\x9b\x86\xe7\xb4\x84\xe5\x87\xa6\xe7\x90\x86\xe5\xae\x9f\xe8\xa1\x8c\xe4\xb8\xad\xef\xbc\x9a', 20000)
('\xe9\x9b\x86\xe7\xb4\x84\xe5\x87\xa6\xe7\x90\x86\xe5\xae\x9f\xe8\xa1\x8c\xe4\xb8\xad\xef\xbc\x9a', 30000)
('\xe9\x9b\x86\xe7\xb4\x84\xe5\x87\xa6\xe7\x90\x86\xe5\xae\x9f\xe8\xa1\x8c\xe4\xb8\xad\xef\xbc\x9a', 40000)
('\xe9\x9b\x86\xe7\xb4\x84\xe5\x87\xa6\xe7\x90\x86\xe5\xae\x9f\xe8\xa1\x8c\xe4\xb8\xad\xef\xbc\x9a', 50000)
('\xe9\x9b\x86\xe7\xb4\x84\xe5\x87\xa6\xe7\x90\x86\xe5\xae\x9f\xe8\xa1\x8c\xe4\xb8\xad\xef\xbc\x9a', 60000)


ConnectionTimeout: ConnectionTimeout caused by - ReadTimeoutError(HTTPConnectionPool(host=u'localhost', port=9200): Read timed out. (read timeout=10))

In [None]:
#集約後のfullVisitorIdに
#for i in range(200):
#    print(getRevenueByFullVisitorId(rowdata[i][0],"20160101","20180101"))
#
#datalength=len(data)

In [None]:
#tensorflowセットアップ

#print(data[0],ans[0])
#inputData=tf.transpose(data)
debugFlg=False
batch_size=datalength/2
num_elements=18
P=36
num_rows=len(data)
inputData=np.reshape(sumdata,(num_rows,num_elements))
inputAns=np.reshape(ans,(num_rows,1))
x =tf.placeholder("float",[None,num_elements])
y_=tf.placeholder("float",[None,1])

a=tf.Variable( tf.random_uniform([num_elements,1],-1.0,1.0),name="weights")    
w1=tf.Variable( tf.random_uniform([num_elements,P],-1.0,1.0 )  )
w2=tf.Variable( tf.random_uniform([P,1],-1.0,1.0 )  )
#a=tf.Variable( tf.random_uniform([[18]],-1.0,1.0),name="weights")    
#b=tf.Variable(tf.zeros([1,1]),name="intercept")
b=tf.Variable(tf.random_normal([P]),name="intercept")
print("x:",x.shape,"a:",a.shape)
#y=tf.matmul(tf.transpose(a),x)+b
#y=tf.matmul(x,a)
x2=tf.sigmoid(tf.matmul(x,w1))+b
y=tf.matmul(x2,w2)

init = tf.global_variables_initializer()
loss = tf.reduce_sum(tf.square(y_-y))/batch_size
train_step=tf.train.GradientDescentOptimizer(0.01).minimize(loss)


In [None]:
#データ表示用セットアップ
graphLoss=[]
graphStep=[]
graphNum=20
repeatAll=50000

with tf.Session() as sess:
    sess.run(init)
    print('初期状態')
    inpShape=inputData[0:batch_size]
    ansShape=inputAns[0:batch_size]
    #print('誤差' + str(sess.run(loss, feed_dict={x: inputData[0:batch_size], y_: inputAns[0:batch_size]})))
    ##print('誤差' + str(sess.run(loss, feed_dict={x: inpShape, y_: ansShape})))
    #print("weigths: %f, intercept: %f" % (sess.run(a), sess.run(b) ))
    #print("input:",inputAns[batch_size*step+1:batch_size*step+1+batch_size])
    ##print("weigths:",sess.run(a),"intercept:",sess.run(b) )
    
    for step in range(repeatAll):
        if (batch_size*step) %datalength < (batch_size*step+batch_size) %datalength :
            batchInputData= inputData[(batch_size*step)%datalength:(batch_size*step+batch_size)%datalength]
            batchInputAns = inputAns[(batch_size*step)%datalength:(batch_size*step+batch_size)%datalength]
        else:
            batchInputData= inputData[(batch_size*step)%datalength:min( (batch_size*step+batch_size)%datalength,datalength-1 )]
            batchInputAns = inputAns[(batch_size*step)%datalength:min( (batch_size*step+batch_size)%datalength,datalength-1) ]
        sess.run(train_step, feed_dict={x: batchInputData, y_: batchInputAns })
        if (step+1) % (repeatAll/graphNum) == 0:
            print('\nStep: %s' % (step+1))
            print('誤差' + str(sess.run(loss, feed_dict={x: batchInputData, y_: batchInputAns })))
            if debugFlg==True:
                print("weights:",sess.run(a), " intercept: %f" % ( sess.run(b)))
            graphLoss.append( sess.run(loss, feed_dict={x: batchInputData, y_: batchInputAns }) )
            graphStep.append(step)
    #最終状態を表示
    print("weights1:",sess.run(w1),"weights2:",sess.run(w2), " intercept:", sess.run(b))


In [None]:
plt.plot(graphStep,graphLoss)