In [None]:
import json

from adjustText import adjust_text
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
from tqdm.auto import tqdm
import umap

In [None]:
# Read the JSON payload once; the co-occurrence matrix and the list of
# names are pulled out of it here, and `obj` is kept around because later
# cells also read obj['correlation'].
with open('cooc.json') as cooc_file:
    obj = json.load(cooc_file)
cooc, names = obj['matrix'], obj['names']

In [None]:
# Look at the distribution of the non-zero co-occurrence counts (on a
# log10 scale) to help pick the saturation point of the weighting function.
coocs = np.asarray(cooc).ravel()
coocs = coocs[coocs != 0]
plt.hist(np.log10(coocs))
plt.xlabel('log_10 cooccurrence')
plt.ylabel('count')
plt.show()

# Take the cutoff at the 50th percentile: the element at index len // 2 of
# the sorted non-zero counts (the upper median when the length is even).
x_max = np.sort(coocs)[coocs.size // 2]
print('selected x_max as', x_max)

In [None]:
# Train GloVe-style embeddings.

n_stores = len(names)
n_feats = 2

# The two factor matrices must NOT both start at zero. The gradient of
# `features @ contexts.T` with respect to each factor is proportional to the
# other factor, so an all-zero start is a stationary point: both gradients
# are exactly zero, the optimizer never moves them, and only the bias terms
# would be trained. Draw small random values instead, from a dedicated seeded
# generator so a fresh run reproduces the same initialization without
# touching the global RNG state.
init_rng = torch.Generator().manual_seed(0)
features = nn.Parameter(0.1 * torch.randn(n_stores, n_feats, generator=init_rng))
features_bias = nn.Parameter(torch.zeros(n_stores))
contexts = nn.Parameter(0.1 * torch.randn(n_stores, n_feats, generator=init_rng))
contexts_bias = nn.Parameter(torch.zeros(n_stores))
# `.to(features)` converts the counts to the parameters' dtype and device.
cooc_matrix = torch.tensor(cooc).to(features)

def glove_loss(alpha=0.75):
    """Weighted squared error between predicted and actual log co-occurrence.

    Reads the module-level tensors `features`, `contexts`, `features_bias`,
    `contexts_bias` and `cooc_matrix`, plus the cutoff `x_max`.

    Args:
        alpha: exponent applied to the clamped co-occurrence counts to form
            the per-entry weight.

    Returns:
        A scalar tensor: the mean over all matrix entries of
        weight * (prediction - target) ** 2.
    """
    # Target is log(count); entries with a zero count get a target of 0
    # rather than log(0) = -inf.
    log_cooc = torch.log(cooc_matrix)
    target = torch.where(cooc_matrix == 0, 0.0, log_cooc)

    # Counts are capped at x_max before being raised to alpha, so a zero
    # count yields a zero weight for any positive alpha.
    confidence = torch.clamp(cooc_matrix, max=x_max).pow(alpha)

    # Bilinear score plus a per-row bias and a per-column bias.
    scores = features @ contexts.T
    prediction = scores + features_bias.unsqueeze(1) + contexts_bias

    squared_error = (prediction - target).square()
    return torch.mean(confidence * squared_error)

# Optimize all four parameter tensors jointly with Adam, recording the loss
# at every step.
opt = Adam([features, features_bias, contexts, contexts_bias], lr=1e-2)
losses = []
for _ in tqdm(range(3000)):
    opt.zero_grad()
    loss = glove_loss()
    loss.backward()
    opt.step()
    losses.append(loss.item())

# Loss curve. The y-axis is capped at the 80th-percentile loss so that the
# large values from the first iterations do not flatten the rest of the plot.
loss_cap = sorted(losses)[(8 * len(losses)) // 10]
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(losses)
loss_ax.set_xlabel('Iteration')
loss_ax.set_ylabel('Loss')
loss_ax.set_ylim(0, loss_cap)
plt.show()

# From here on `features` is a plain NumPy array (detached from autograd),
# which is what the following cells consume.
features = features.detach().numpy()

In [None]:
# Lay the learned store features out with UMAP for plotting.
# UMAP is stochastic; pinning `random_state` makes the layout reproducible
# across Restart & Run All instead of changing on every execution.
reducer = umap.UMAP(random_state=0)
embedding = reducer.fit_transform(features)
embedding.shape

In [None]:
# Scatter the embedded points and label each one with its name.
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(embedding[:, 0], embedding[:, 1])
# Draw the labels on `ax` explicitly rather than on pyplot's implicit
# "current axes", so the cell does not depend on global pyplot state.
texts = [
    ax.text(embedding[i, 0], embedding[i, 1], name, ha='center', va='center')
    for i, name in enumerate(names)
]
# Nudge overlapping labels apart; arrows connect moved labels to their points.
adjust_text(texts, ax=ax, arrowprops=dict(arrowstyle='->', color='red'))
# plt.show() instead of fig.show(): the latter can emit a "non-GUI backend"
# warning under the notebook's inline backend.
plt.show()

In [None]:
# The correlation matrix is read precomputed from the JSON payload. (An
# earlier draft of this cell derived it from `cooc`, normalizing by the
# diagonal counts.)
# For every store, print the other store it correlates with most strongly.
correlations = obj['correlation']
for i, (name, row) in enumerate(zip(names, correlations)):
    others = list(row)
    # Zero the store's own entry so the argmax is not simply the store
    # itself (this relies on at least one other score being positive).
    others[i] = 0
    best = int(np.argmax(others))
    print(f"{name} <-> {names[best]} {others[best]}")

In [None]:
def most_similar(store, top_k=5):
    """Print the stores most correlated with `store`, best first.

    Reads the module-level `names` list and `obj['correlation']` matrix;
    the correlation data is not modified.

    Args:
        store: name of the store to look up; must be an element of `names`.
        top_k: maximum number of other stores to print (default 5).

    Raises:
        ValueError: if `store` is not in `names` (raised by `list.index`).
    """
    idx = names.index(store)
    row = obj['correlation'][idx]
    # Rank every store by descending score, then drop the queried store
    # itself. Filtering (instead of overwriting its score with 0) guarantees
    # a store is never listed as similar to itself, even when other scores
    # are negative or there are fewer than `top_k` other stores.
    order = np.argsort(row)[::-1]
    order = order[order != idx][:top_k]
    print('Results for:', store)
    for i in order:
        print(f"{names[i]}: {row[i]}")
    print('----------------')

# Show the closest matches for a handful of example stores.
for store in ("Sephora", "Ulta Beauty", "Victoria's Secret", "Five Below"):
    most_similar(store)