In [None]:
import json
import math

from adjustText import adjust_text
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim
from tqdm.auto import tqdm
import umap

In [None]:
# Read the co-occurrence dump: 'names' holds the labels and 'binary_counts'
# the count matrix, which later cells index by position in `names`.
with open('cooc_r0_0002526.json', 'r') as f:
    obj = json.load(f)
names = obj['names']
cooc = np.array(obj['binary_counts'])

In [None]:
# Look at the distribution of the non-zero co-occurrence counts on a log10
# scale to help choose a cutoff for the weighting function.
coocs = cooc.flatten()
coocs = coocs[coocs != 0]
log_coocs = np.log10(coocs)
plt.hist(log_coocs)
plt.xlabel('log_10 cooccurrence')
plt.ylabel('count')
plt.show()

# Pick the cutoff as a percentile: the element at the midpoint of the sorted
# non-zero counts (i.e. the upper median).
midpoint = len(coocs) // 2
x_max = sorted(coocs)[midpoint]
print('selected x_max as', x_max)

In [None]:
# Train GloVe-style embeddings.

# Seed torch's RNG so the random initialisation below (and therefore the
# trained embeddings and every downstream plot / neighbour list) is the same
# on every Restart & Run All.
torch.manual_seed(0)

n_stores = len(names)
n_feats = 16  # embedding dimensionality

# glove_loss multiplies the summed bias terms by this factor, which scales the
# gradient the bias parameters receive relative to the embedding entries.
bias_lr_boost = math.sqrt(n_feats)

# Two small-variance random embedding tables ("features" and "contexts"),
# each with a zero-initialised per-store bias.
features = nn.Parameter(torch.randn(n_stores, n_feats)*0.1)
features_bias = nn.Parameter(torch.zeros(n_stores))
contexts = nn.Parameter(torch.randn(n_stores, n_feats)*0.1)
contexts_bias = nn.Parameter(torch.zeros(n_stores))

# Co-occurrence counts as a tensor with the same dtype/device as `features`.
cooc_matrix = torch.tensor(cooc).to(features)

def glove_loss(alpha=0.75):
    """Weighted squared error between predicted and actual log co-occurrence.

    Reads the module-level tensors `features`, `contexts`, `features_bias`,
    `contexts_bias` and `cooc_matrix`, plus the scalars `bias_lr_boost` and
    `x_max`.

    Args:
        alpha: exponent of the weighting function
            ``(min(count, x_max) / x_max) ** alpha``.

    Returns:
        A scalar tensor: the mean over all matrix entries of
        ``weight * (prediction - target) ** 2``.
    """
    # Prediction: embedding dot products plus the (boosted) pairwise bias sum.
    bias_term = bias_lr_boost * (features_bias.unsqueeze(1) + contexts_bias)
    pred = features @ contexts.T + bias_term

    # Target: log count, with 0 substituted where the count is zero (those
    # entries also get zero weight below because 0 ** alpha == 0).
    log_counts = torch.log(cooc_matrix)
    targ = torch.where(cooc_matrix == 0, 0.0, log_counts)

    # Weights saturate at 1 once a count reaches x_max; the diagonal is
    # removed so self co-occurrence does not contribute to the loss.
    clipped = cooc_matrix.clamp(max=x_max)
    weights = (clipped / x_max) ** alpha
    weights = weights - torch.diag(torch.diag(weights))

    squared_error = (pred - targ)**2
    return (weights * squared_error).mean()

# Fit all four parameter tensors jointly with Adam on the full matrix,
# annealing the learning rate along a cosine schedule spanning the whole run.
num_iters = 20000
opt = torch.optim.Adam([features, features_bias, contexts, contexts_bias], lr=1e-2)
lrs = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=num_iters)
losses = []
pbar = tqdm(range(num_iters))
for _ in pbar:
    opt.zero_grad()
    loss = glove_loss()
    loss.backward()
    opt.step()
    lrs.step()
    # Read the scalar once; it feeds both the history and the progress bar.
    loss_value = loss.item()
    losses.append(loss_value)
    pbar.set_description(f"loss {loss_value:7.05}")

# NOTE: this rebinds `features` from an nn.Parameter to a plain numpy array,
# which is what the later cells consume. glove_loss() reads the same global,
# so re-run the setup cell before training again.
features = features.detach().numpy()

In [None]:
# Training curve. The upper y-limit is the loss value found 90% of the way
# through the sorted losses, so the largest values do not flatten the rest
# of the curve.
loss_floor = min(losses)
loss_cap = sorted(losses)[round(len(losses)*0.9)]
plt.plot(losses)
plt.ylim(loss_floor, loss_cap)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.show()



In [None]:
# Project the learned embeddings with UMAP for plotting. UMAP is stochastic,
# so fix random_state to get the same layout on every run.
reducer = umap.UMAP(random_state=0)
embedding = reducer.fit_transform(features)
embedding.shape

In [None]:
# Scatter the 2-D projection and label every point with its store name.
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(embedding[:, 0], embedding[:, 1])
# Draw the labels on `ax` explicitly rather than through the pyplot
# "current axes" state.
texts = [
    ax.text(x, y, name, ha='center', va='center')
    for (x, y), name in zip(embedding[:, :2], names)
]
# Nudge overlapping labels apart; arrows point back to the original positions.
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'))
# plt.show() (as in the other cells) instead of fig.show(): the latter only
# warns on the non-GUI inline backend used by notebooks.
plt.show()

In [None]:
# Looking up neighbors using co-occurrences alone doesn't work well.
# Example: McDonald's has tons of locations, so chances are it will
# be near just about anything.

# Cosine similarity between the rows of `cooc`. Entry (i, j) of cooc @ cooc.T
# is the dot product of rows i and j, and the diagonal holds the squared row
# norms, so each entry must be divided by norm_i * norm_j. That needs the
# OUTER product of the norms (an n x n matrix); a (1, n) @ (n, 1) product
# would collapse to a single scalar and merely rescale the whole matrix.
correlation = cooc @ cooc.T
row_norms = np.sqrt(np.diag(correlation))
# Out-of-place division so this also works when `cooc` has an integer dtype
# (an in-place `/=` cannot store the float result in an int array).
correlation = correlation / (row_norms[:, None] * row_norms[None, :])

def frequent_neighbors(store):
    """Print the five stores with the highest raw co-occurrence with `store`.

    Uses the module-level `names` and `cooc`. The diagonal entry of the
    store's row is reported as its location count and used as the denominator
    for the printed percentages.

    Raises:
        ValueError: if `store` is not in `names`.
    """
    idx = names.index(store)
    store_count = cooc[idx, idx]

    # Work on a copy so the shared matrix is untouched, and blank out the
    # store's own entry so it cannot rank as its own neighbour.
    neighbor_counts = cooc[idx].copy()
    neighbor_counts[idx] = 0

    print(f'Frequent neighbors for "{store}" ({store_count} locations)')
    for i in np.argsort(neighbor_counts)[::-1][:5]:
        share = neighbor_counts[i] / store_count
        print(f"{names[i]}: {100*share:.02f}%")
    print('----------------')

# Show the count-based neighbours for two example stores.
for store in ("Sephora", "McDonald's"):
    frequent_neighbors(store)

In [None]:
# Looking up neighbors using embeddings.

# Scale every embedding row to unit L2 norm so that plain dot products
# between rows are cosine similarities.
row_norms = np.linalg.norm(features, axis=-1, keepdims=True)
norm_features = features / row_norms
def frequent_neighbors(store):
    """Print the five stores whose embeddings are most cosine-similar to `store`.

    Uses the module-level `names` and `norm_features` (unit-normalised
    embedding rows), so each printed dot product is a cosine similarity.
    The queried store itself is never listed.

    Note: this reuses the name of the count-based helper defined in an
    earlier cell and replaces it once this cell has run.

    Raises:
        ValueError: if `store` is not in `names`.
    """
    idx = names.index(store)
    dots = norm_features @ norm_features[idx]

    # Remove the query store from the ranking by index. Zeroing its score is
    # not enough: cosine similarity can be negative, so a score of 0 could
    # still outrank genuine neighbours and put the store in its own list.
    ranked = np.argsort(dots)[::-1]
    indices = [i for i in ranked if i != idx][:5]

    print(f'Cosine neighbors for "{store}"')
    for i in indices:
        print(f"{names[i]}: dot product {dots[i]}")
    print('----------------')

# frequent_neighbors("Sephora")
for store in ("McDonald's", "Apple"):
    frequent_neighbors(store)