In [None]:
import json

# Read the contents of the local JSON file into `sensitive_data`.
# The import lives in this cell because it runs before the main imports cell.
with open("./passwords.json", "r") as f:
    sensitive_data = json.load(f)

In [None]:
from collections.abc import MutableMapping
from itertools import islice

import numpy as np
import pandas as pd
from elasticsearch import helpers
from elasticsearch.helpers import scan


def flatten(nested_dict, parent_key="", sep=None):
    """Collapse a nested mapping into a single-level dict.

    Args:
        nested_dict: Mapping whose values may themselves be mappings.
        parent_key: Key prefix for this level. Only used when ``sep`` is given.
        sep: Optional separator. When ``None`` (the default) leaf keys are kept
            as-is, so leaves that share a name at different nesting levels
            collide and the one visited last wins. When a string is given,
            each key is prefixed with its ancestors joined by ``sep``
            (e.g. ``"properties.id"``), which avoids those collisions.

    Returns:
        A new flat ``dict``; the input mapping is not modified.
    """
    flat = {}
    for key, value in nested_dict.items():
        # Only build a qualified key when the caller opted in via `sep`.
        if sep is not None and parent_key:
            full_key = f"{parent_key}{sep}{key}"
        else:
            full_key = key

        if isinstance(value, MutableMapping):
            flat.update(flatten(value, full_key, sep))
        else:
            flat[full_key] = value
    return flat


# Match "Search result selected" events, newest first.
query = {
    "sort": [{"timestamp": "desc"}],
    "query": {"match_phrase": {"event": "Search result selected"}},
}

# Upper bound on the number of events pulled from the index.
n_events_to_fetch = 250000

# note: scan is normally fast because it grabs data unsorted; preserve_order=True
# is passed here so the "desc" sort in the query is kept (this can be expensive).
# grabs 100,000 without scan. seems to have trouble past 500,000
response = helpers.scan(
    es,
    query=query,
    preserve_order=True,
    index="search_relevance_implicit",
)

# islice takes at most n_events_to_fetch hits and simply stops early when the
# index holds fewer, instead of raising StopIteration from a bare next().
df = pd.DataFrame(
    [flatten(hit["_source"]) for hit in islice(response, n_events_to_fetch)]
)

In [None]:
# note: events are requested newest-first (the query sorts on timestamp "desc").

# "timestamp": datetime(2010, 10, 10, 10, 10, 10)

# To check the start date, show the earliest events.
# The sorted frame is displayed directly rather than bound to a variable named
# `sorted`, which would shadow the builtin for every later cell.
df.sort_values(by=["timestamp"], ascending=True).head()

In [None]:
# remove staff usage, limit time frame to 1/7/20 - 30/9/20
in_scope = (
    (df["network"] != "StaffCorporateDevices")
    & (df["timestamp"] >= "2020-07-01")
    & (df["timestamp"] < "2020-10-01")
)

# grab only the columns needed and sort, in one expression.
# A single .loc selection plus a non-inplace sort yields a fresh frame, so the
# cell can be re-run safely and does not sort a slice of `df` in place.
df2 = df.loc[
    in_scope, ["id", "resultWorkType", "anonymousId", "timestamp"]
].sort_values(by=["anonymousId", "id"])
df2.head(5)

In [None]:
# dedupe: keep one row per (anonymousId, id) pair.
# No re-sort is done here: drop_duplicates keeps the first occurrence in df2's
# existing row order, and a sort_values() call whose result is not assigned
# would have no effect on df2.
df3 = df2.drop_duplicates(subset=["anonymousId", "id"], keep="first")

In [None]:
# Count of non-null anonymousId values for each work id.
# The column is selected before aggregating so only that column is counted.
summary = df3.groupby("id")[["anonymousId"]].count()
summary

In [None]:
# How many workIds? `summary` has one row per id, so count the entries of its
# anonymousId column.
per_work_counts = summary["anonymousId"]
per_work_counts.count()

In [None]:
# Rank work ids by selection count, highest first, and show the top rows.
# Displayed directly instead of being bound to `sorted`, so the builtin
# sorted() stays usable in later cells.
summary.sort_values(by=["anonymousId"], ascending=False).head()

In [None]:
# Write the per-work selection counts to a CSV file for manual checking.
# Note this writes `summary` in its groupby order, not a count-sorted view.
output_csv = "selects_by_workId.csv"
summary.to_csv(output_csv)