In [3]:
import pandas as pd
import numpy as np

from elasticsearch import helpers 
from collections.abc import MutableMapping


In [4]:
def flatten(nested_dict, parent_key=''):
    items = []
    for k, v in nested_dict.items():
        if isinstance(v, MutableMapping):
            items.extend(flatten(v, k).items())
        else:
            items.append((k, v))
    return dict(items)
    
    
query = {
  "sort": [
    {
      "timestamp": "desc"
    }
  ],
  "query": {
    "match_phrase": {
      "event": "Search"
    }
  },
  "size": 100000
}

response = es.search(
    body=query,
    index="search_relevance_implicit"
)

df = pd.DataFrame([
    flatten(event['_source']) for event in response['hits']['hits']
])

In [5]:
for col in df.columns: 
    print(col) 


event
anonymousId
timestamp
network
toggles
_queryType
aggregations
items.locations.locationType
page
production.dates.from
production.dates.to
query
sort
sortOrder
workType
id
position
resultIdentifiers
resultSubjects
resultWorkType
source
totalResults
resultLanguage


In [6]:
# grab only the columns needed
df2=df[['query', 'timestamp', 'anonymousId','network','event']]

#note: Python client automatically indexes from latest to earliest.  To check start date:

sorted=df2.sort_values(by=['timestamp'], ascending=True) 
sorted.head(2)

Unnamed: 0,query,timestamp,anonymousId,network,event
99999,,2020-06-07T06:34:29.095Z,025d42f3-8e1e-434d-891b-ef83f00b504c,,Search landing
99998,Herbalism,2020-06-07T06:34:30.329Z,02c0f22f-e907-48b7-866f-1d5ff4aeadb7,,Search


In [8]:
#data cleansing
df3=df2.loc[(df2['network'] != 'StaffCorporateDevices') & (df2['event'] == 'Search')]
            
#note: Python client automatically indexes from latest to earliest.  To check start date:  'Search result selected']
df3.head()

Unnamed: 0,query,timestamp,anonymousId,network,event
1,black death,2020-07-16T14:45:42.799Z,4d93b96b-4b5a-43d5-a95e-0baee22bd42e,,Search
2,plague,2020-07-16T14:45:36.926Z,4d93b96b-4b5a-43d5-a95e-0baee22bd42e,,Search
4,queer,2020-06-08T10:35:14.032Z,6791d76f-e458-457e-bef9-6af2f00fe84f,,Search
5,1970’s,2020-06-08T10:35:10.919Z,6a4053f4-17dd-4687-b5f4-c53963ff4c27,,Search
6,anatomy,2020-06-08T10:35:10.677Z,3c70e628-fcf6-4e72-a788-dcc9771ebb10,,Search


In [19]:
#count searches and only keep data for users who search 3 times or more
counts=df3.groupby('anonymousId').count()[['query']]
counts2=counts.loc[(counts['query'] >3)]
#counts2.head()

Unnamed: 0_level_0,query
anonymousId,Unnamed: 1_level_1
0049585a-ab2c-44fc-a239-f9bdd313d1f6,23
005cb843-07bd-4906-8b40-c5b61358f65a,41
006d43f9-ba1c-4b11-a492-f56ce2e2d9a9,15
00846f9b-c224-4d28-8a02-78ffd8868f43,4
0092b25f-1df8-48a8-82db-df3130cb96b6,9


In [21]:
counts3=counts2.drop(columns=['query'])
#counts3.head()


0049585a-ab2c-44fc-a239-f9bdd313d1f6
005cb843-07bd-4906-8b40-c5b61358f65a
006d43f9-ba1c-4b11-a492-f56ce2e2d9a9
00846f9b-c224-4d28-8a02-78ffd8868f43
0092b25f-1df8-48a8-82db-df3130cb96b6


In [41]:
#merge 
searches_3plus=counts3.merge(df3, how='left', on='anonymousId')








pd.to_datetime(stamps, format="%Y%m%d:%H:%M:%S.%f").sort_values()

searches_3plus.sort_values(by='timestamp', ascending=False)
searches_3plus.head()



Unnamed: 0,anonymousId,query,timestamp,network,event
0,0049585a-ab2c-44fc-a239-f9bdd313d1f6,anatomy,2020-06-07T20:12:08.251Z,,Search
1,0049585a-ab2c-44fc-a239-f9bdd313d1f6,anatomy,2020-06-07T18:44:27.936Z,,Search
2,0049585a-ab2c-44fc-a239-f9bdd313d1f6,anatomy,2020-06-07T08:16:01.446Z,,Search
3,0049585a-ab2c-44fc-a239-f9bdd313d1f6,anatomy,2020-06-07T08:10:45.277Z,,Search
4,0049585a-ab2c-44fc-a239-f9bdd313d1f6,anatomy,2020-06-07T08:10:31.637Z,,Search


In [40]:
#output data to csv to check

searches_3plus.to_csv('searches_3plus.csv') 