In [None]:
from collections.abc import MutableMapping  # the `collections.MutableMapping` alias was removed in Python 3.10

import numpy as np
import pandas as pd
from elasticsearch import helpers


def flatten(nested_dict, parent_key="", sep=None):
    """Collapse a nested mapping into a single-level dict.

    Values that are ``MutableMapping`` instances are descended into
    recursively; every other value (including lists) is copied as-is.

    Args:
        nested_dict: Mapping to flatten.
        parent_key: Prefix for the keys of ``nested_dict``. Only used when
            ``sep`` is given; ignored otherwise.
        sep: Optional separator. When ``None`` (the default) each leaf keeps
            its own key, so leaves that share a key at different nesting
            levels collide and the one visited last wins. When given, keys
            are qualified with their ancestors' keys joined by ``sep``
            (e.g. ``{"a": {"b": 1}}`` -> ``{"a.b": 1}``), which avoids
            those collisions.

    Returns:
        A new flat ``dict``; the input mapping is not modified.
    """
    flat = {}
    for k, v in nested_dict.items():
        # Qualify the key only when the caller opted in via `sep`; otherwise
        # keep the bare leaf key so existing column names stay unchanged.
        if sep is not None and parent_key != "":
            full_key = str(parent_key) + sep + str(k)
        else:
            full_key = k

        if isinstance(v, MutableMapping):
            flat.update(flatten(v, full_key, sep))
        else:
            flat[full_key] = v
    return flat


# Search request: "Search result selected" events, newest first.
# `size` caps the response at 10000 hits; anything beyond that is not returned.
query = dict(
    sort=[{"timestamp": "desc"}],
    query={"match_phrase": {"event": "Search result selected"}},
    size=10000,
)

# NOTE(review): `es` is not defined in the visible cells — presumably an
# Elasticsearch client created earlier; confirm it exists on a fresh kernel.
response = es.search(index="search_relevance_implicit", body=query)

# One row per hit, with columns taken from the flattened `_source` document.
df = pd.DataFrame(
    list(map(lambda hit: flatten(hit["_source"]), response["hits"]["hits"]))
)

In [None]:
# Hits arrive newest-first because the query sorts on "timestamp" descending.
# Re-sort ascending to check the start date of the data.
# The result is deliberately not named `sorted`: that would shadow the
# builtin `sorted()` for the rest of the kernel session.
events_oldest_first = df.sort_values(by=["timestamp"], ascending=True)
events_oldest_first.head()

In [None]:
# data cleansing

# Exclude events recorded on the staff network (remove staff usage).
df = df[df["network"].ne("StaffCorporateDevices")]

# Filters that are currently switched off:
# df=df.loc[df['timestamp'] > '2019-11-07 00:00:00']  #only use searches after AND implemented
# df=df.loc[df['event'] != 'Search landing']

# Subset from 1/3/20 onwards (2 days' of data at the time of writing).
# NOTE(review): `df2` is only displayed here; the later visible cells keep
# working from `df` — confirm which frame the analysis should use.
df2 = df[df["timestamp"].ge("2020-03-01 00:00:00")]

df2.head(5)

In [None]:
# Keep a single row per anonymousId: the one with the lowest timestamp.
# `sort_values` returns a new frame rather than sorting in place, so its
# result has to feed `drop_duplicates` for keep="first" to follow that order.
# NOTE(review): this works on `df` (all dates), not the 2-day subset `df2` —
# confirm which one is intended.
unique_selects = df.sort_values(by=["anonymousId", "timestamp"]).drop_duplicates(
    subset="anonymousId", keep="first"
)
unique_selects.head(5)

In [None]:
# How many workIds? One row per "id", counting the non-null anonymousId
# values among the de-duplicated selects.
summary = unique_selects.groupby("id")[["anonymousId"]].count()
summary

In [None]:
# How many workIds were viewed once (count of at most 1)?
viewed_once = summary[summary["anonymousId"].le(1)]
count = viewed_once["anonymousId"].count()
print(count)

In [None]:
# How many workIds were viewed exactly twice?
viewed_twice = summary[summary["anonymousId"].eq(2)]
count = viewed_twice["anonymousId"].count()
print(count)

In [None]:
# Bar chart of the 20 workIds with the highest counts.

top_20 = summary.sort_values(by="anonymousId", ascending=False).head(20)
top_20.plot(kind="bar", legend=False)

In [None]:
# Skewness of the per-workId counts (one value per column of `summary`).

skew = summary.skew(axis=0)
print(skew)

In [None]:
# Write the per-workId counts to a CSV file so they can be checked by hand.

summary.to_csv(path_or_buf="skew_for_2_days.csv")