In [1]:
import gensim.downloader
import nltk
import pandas as pd

from tqdm import tqdm

%config Completer.use_jedi = False
%matplotlib inline

# Fetch the NLTK resources this notebook asks for. WordNet backs the
# WordNetLemmatizer used for term normalization; punkt is fetched alongside it.
for nltk_package in ('wordnet', 'punkt'):
    nltk.download(nltk_package)

# Location of the clickstream sample, relative to the notebook directory.
clickstream_path = '../data/Behavioral_sample_for_interview_2021_02_05.csv'

[nltk_data] Downloading package wordnet to /Users/alex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Dataset preprocessing
- lemmatize all words
- order events by session (sorting only, no GROUP BY)
- sort events by timestamp within each session
- explicitly convert the timestamp column to a datetime type (assuming the format is YYYY-MM-DD HH:MM)

In [2]:
lemmatizer = nltk.WordNetLemmatizer()


def normalize(words):
    """Lemmatize every whitespace-separated word of a phrase.

    Args:
        words: A string holding one or more words.

    Returns:
        The lemmatized words joined by single spaces. Leading, trailing and
        repeated whitespace in the input is collapsed by the split/join.
    """
    lemmas = map(lemmatizer.lemmatize, words.split())
    return ' '.join(lemmas)

In [3]:
# Load the clickstream, parse the timestamps, order events chronologically
# inside each session, and lemmatize every term.
# NOTE(review): astype(str) turns a missing term into the literal string
# 'nan' - confirm that this is acceptable for the pairing done further down.
df = (
    pd.read_csv(clickstream_path)
    .assign(
        timestamp=lambda events: pd.to_datetime(
            events['timestamp'], format='%Y-%m-%d %H:%M'
        )
    )
    .sort_values(by=['session_id', 'timestamp'])
    .assign(term=lambda events: events['term'].astype(str).apply(normalize))
)

### Intrasession term pairs

##### Hypothesis:
During a session, most users search for a single item type.

##### Solution:
- remove all click_through events; we are interested only in search queries
- get all consecutive term pairs in each session
- drop duplicates within a session (the code drops repeated terms before pairing, so each ordered pair occurs at most once per session)
- filter out all pairs where one term is a substring of the other
- count in how many sessions each term pair occurs (the more often two terms co-occur, the more likely they are synonyms)
- the count threshold can be adjusted according to the desired precision and recall

##### Not solved problems:
- all term pairs are treated as symmetric, so we cannot say anything about hierarchy (e.g. (apple, fruit) == (fruit, apple)); alternatively, we could assume that a user first searches for a category and then enters a more specific term (category -> item)
- we find synonymous phrases, not individual words

In [4]:
# One list of search terms per session, in the row order of `df`.
# Repeated terms inside a session are dropped first, so every ordered pair
# built below can occur at most once per session.
session_terms = (
    df
    [df['action'] == 'search']
    .drop_duplicates(['session_id', 'term'])
    .groupby('session_id')
    ['term'].apply(list)
)

# Flatten the per-session bigrams in a single pass. The previous
# `Series.sum()` over lists concatenated them pairwise (quadratic in the
# number of pairs) and returned the integer 0 when there were no sessions.
term_pairs = pd.Series(
    [
        pair
        for terms in session_terms
        for pair in nltk.bigrams(terms)
    ],
    dtype=object,  # keep each (term, next_term) tuple as one element
)

In [5]:
# Keep only the pairs in which neither term is contained in the other
# (plain substring test on the two strings).
term_pairs = term_pairs[[
    not (left in right or right in left)
    for left, right in term_pairs
]]

Results, sorted by confidence (pair frequency, most frequent first):

In [15]:
# The 50 most frequent pairs; value_counts() orders them by descending count.
term_pairs.value_counts().head(50).index.to_list()

[('kettle', 'toaster'),
 ('cushion', 'throw'),
 ('pillow', 'cushion'),
 ('back pack', 'backpack'),
 ('toaster', 'kettle'),
 ('lunchbox', 'lunch box'),
 ('pop it', 'fidget toy'),
 ('desk', 'table'),
 ('pop it', 'fidget'),
 ('fidget toy', 'pop it'),
 ('contact', 'book cover'),
 ('bean bag', 'beanbag'),
 ('car seat', 'booster seat'),
 ('tv', 'television'),
 ('bookcase', 'shelf'),
 ('table', 'desk'),
 ('tv unit', 'entertainment unit'),
 ('barbie', 'lol'),
 ('fidget', 'pop it'),
 ('book covering', 'contact'),
 ('light', 'lamp'),
 ('car seat', 'booster'),
 ('book cover', 'back to school'),
 ('rug', 'mat'),
 ('beanbag', 'bean bag'),
 ('maternity', 'nursing'),
 ('back pack', 'bag'),
 ('ps5', '64226187'),
 ('throw', 'cushion'),
 ('bike pant', 'bike short'),
 ('desk', 'chair'),
 ('school bag', 'lunch box'),
 ('pop it', 'toy'),
 ('lonsdale', 'filum'),
 ('trolley', 'cart'),
 ('book cover', 'contact'),
 ('wall art', 'print'),
 ('bather', 'bikini'),
 ('contact', 'book covering'),
 ('bookcase', 'book

### Subsession term pairs (a new subsession starts at each click_through event)

##### Hypothesis:
During a session and before a click_through event, most users search for a single item type; a click_through event means the user found the desired item and will look for something else in the following search queries.

##### Solution:
- calculate the number of click_through events before each event (mark each click_through event as 1 and use a cumulative sum)
- use session_id and clicks_before as a unique identifier of a subsession (all actions within one session before a search success, i.e. a click_through event)
- filter out all click_through events; we are interested only in search queries
- get all consecutive term pairs in each subsession
- filter out all pairs where one term is a substring of the other
- count in how many sessions each term pair occurs (the more often two terms co-occur, the more likely they are synonyms)
- the count threshold can be adjusted according to the desired precision and recall

##### Not solved problems:
- all term pairs are treated as symmetric, so we cannot say anything about hierarchy (e.g. (apple, fruit) == (fruit, apple)); alternatively, we could assume that a user first searches for a category and then enters a more specific term (category -> item)
- we find synonymous phrases, not individual words

In [7]:
# Running count of click_through events over the whole (already sorted) frame.
# The counter is not reset per session and includes the click on its own row,
# so it only identifies a subsession together with session_id.
df['clicks_before'] = df['action'].eq('click_through').astype(int).cumsum()

In [8]:
# One list of search terms per subsession, keyed by (session_id, clicks_before).
# NOTE(review): duplicates are dropped per session, not per subsession, so a
# term repeated after a click_through is removed from the later subsession -
# confirm that this is intended.
session_terms = (
    df
    [df['action'] == 'search']
    .drop_duplicates(['session_id', 'term'])
    .groupby(['session_id', 'clicks_before'])
    ['term'].apply(list)
)

# Flatten the per-subsession bigrams in a single pass. The previous
# `Series.sum()` over lists concatenated them pairwise (quadratic in the
# number of pairs) and returned the integer 0 when there were no groups.
term_pairs = pd.Series(
    [
        pair
        for terms in session_terms
        for pair in nltk.bigrams(terms)
    ],
    dtype=object,  # keep each (term, next_term) tuple as one element
)

# Discard pairs in which one term is a substring of the other.
term_pairs = term_pairs[
    term_pairs.apply(
        lambda pair: pair[0] not in pair[1] and pair[1] not in pair[0]
    )
]

Results, sorted by confidence (pair frequency, most frequent first):

In [16]:
# The 50 most frequent pairs; value_counts() orders them by descending count.
term_pairs.value_counts().head(50).index.to_list()

[('kettle', 'toaster'),
 ('cushion', 'throw'),
 ('pillow', 'cushion'),
 ('back pack', 'backpack'),
 ('toaster', 'kettle'),
 ('lunchbox', 'lunch box'),
 ('pop it', 'fidget toy'),
 ('desk', 'table'),
 ('pop it', 'fidget'),
 ('fidget toy', 'pop it'),
 ('contact', 'book cover'),
 ('bean bag', 'beanbag'),
 ('car seat', 'booster seat'),
 ('tv', 'television'),
 ('bookcase', 'shelf'),
 ('table', 'desk'),
 ('tv unit', 'entertainment unit'),
 ('barbie', 'lol'),
 ('fidget', 'pop it'),
 ('book covering', 'contact'),
 ('light', 'lamp'),
 ('car seat', 'booster'),
 ('book cover', 'back to school'),
 ('rug', 'mat'),
 ('beanbag', 'bean bag'),
 ('maternity', 'nursing'),
 ('back pack', 'bag'),
 ('ps5', '64226187'),
 ('throw', 'cushion'),
 ('bike pant', 'bike short'),
 ('desk', 'chair'),
 ('school bag', 'lunch box'),
 ('pop it', 'toy'),
 ('lonsdale', 'filum'),
 ('trolley', 'cart'),
 ('book cover', 'contact'),
 ('wall art', 'print'),
 ('bather', 'bikini'),
 ('contact', 'book covering'),
 ('bookcase', 'book

### Pretrained word vectorizer

##### Hypothesis:
Pretrained NLP models such as Word2Vec, GloVe, and fastText will give us good-quality synonyms.

##### Solution:
- get all unique words from search queries
- get top similar words for each word
- if these words are found in search terms, we should use them as synonyms in our search engine
- word similarity threshold could be adjusted according to desired precision and recall

##### Not solved problems:
- no customer-specific information used
- we miss all multi-word search terms

In [10]:
# Load the pretrained 'glove-wiki-gigaword-200' vectors through gensim's
# downloader API.
glove_model_name = 'glove-wiki-gigaword-200'
glove_vectors = gensim.downloader.load(glove_model_name)

In [11]:
# Vocabulary of individual words that occur in any search term.
# Built with a set comprehension rather than `Series.sum()` over lists, which
# concatenates pairwise (quadratic) and yields the integer 0, making `set(0)`
# raise a TypeError, when the frame has no rows.
search_words = {
    word
    for term in df['term'].drop_duplicates()
    for word in term.split()
}

In [12]:
# Parallel lists: pairs[i] is a (synonym, term) tuple and scores[i] is the
# similarity score that most_similar reported for it.
scores = []
pairs = []

for term in tqdm(df['term'].unique()):
    # Guard only the vector lookup. most_similar raises KeyError for a term
    # that has no vector (which includes multi-word terms); such terms are
    # skipped. A KeyError raised anywhere else is no longer swallowed.
    try:
        neighbours = glove_vectors.most_similar(term, topn=10)
    except KeyError:
        continue

    for synonym, score in neighbours:
        synonym = normalize(synonym)
        # Keep a candidate only if it differs from the term after
        # lemmatization and is itself a word users searched for.
        if (synonym != term) and (synonym in search_words):
            pairs.append((synonym, term))
            scores.append(score)

# drop_duplicates keeps the first row recorded for each (synonym, term) pair.
synonym_pairs = pd.DataFrame({
    'pair': pairs,
    'score': scores}
).drop_duplicates(subset=['pair'])

100%|██████████| 39470/39470 [02:39<00:00, 248.04it/s]


The first items are mostly numbers and very common words, so we skip them.

In [22]:
# Rows 450-499 of the pairs ranked by descending similarity score.
synonym_pairs.sort_values('score', ascending=False)['pair'].iloc[450:500].to_list()

[('hoping', 'hope'),
 ('changing', 'change'),
 ('change', 'changing'),
 ('coaster', 'roller'),
 ('roller', 'coaster'),
 ('duffel', 'duffle'),
 ('expect', 'wo'),
 ('could', 'be'),
 ('we', 'our'),
 ('could', 'to'),
 ('ipod', 'iphones'),
 ('only', 'no'),
 ('i', 'my'),
 ('900', '500'),
 ('plane', 'airplane'),
 ('airplane', 'plane'),
 ('you', 'can'),
 ('trouser', 'baggy'),
 ('grain', 'wheat'),
 ('warmer', 'cooler'),
 ('eleven', 'nine'),
 ('back', 'out'),
 ('out', 'back'),
 ('s8', 's9'),
 ('one', 'five'),
 ('ca', 'wo'),
 ('wo', 'ca'),
 ('enough', 'able'),
 ('manga', 'anime'),
 ('anime', 'manga'),
 ('400', '100'),
 ('but', 'he'),
 ('serving', 'serve'),
 ('serve', 'serving'),
 ('ran', 'running'),
 ('be', 'can'),
 ('can', 'be'),
 ('would', 'able'),
 ('europe', 'european'),
 ('maybe', 'me'),
 ('might', 'wo'),
 ('daughter', 'sister'),
 ('freezer', 'refrigerator'),
 ('refrigerator', 'freezer'),
 ('white', 'black'),
 ('black', 'white'),
 ('once', 'then'),
 ('then', 'once'),
 ('make', 'to'),
 ('able

### Potential improvements

- pairs from the word vectorizer look noisy and could be improved with a better model choice (e.g. one trained on retail data)
- it is worth trying to take some kind of diff between the session and subsession synonymous phrase pairs and treat the result as word synonyms
- get more data and train word2vec on search terms to find synonymous words
- get more data and train word2vec on sessions (one session is a sentence, one search query is a word) to find synonymous phrases