In [190]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import spatial
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise_distances
import zipfile
from tqdm import tqdm
import re
import random

In [12]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
%load_ext autoreload
%autoreload 2

from src.path import get_path_from_root
from src.questions import questions
from src.search import load_embeddings_dict, rough_tokenize, make_rank_question, search_questions, load_question_vecs

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
embeddings_dict = load_embeddings_dict()
print(len(embeddings_dict))
embeddings_dict['walrus']

400000


array([ 0.48698 , -0.45614 , -0.88501 , -0.58914 ,  0.75248 ,  0.1977  ,
       -0.31347 , -0.67976 , -0.25581 , -0.86747 ,  0.95085 ,  0.62886 ,
        0.98183 ,  0.68557 ,  0.38731 ,  0.63174 ,  0.70197 ,  0.85176 ,
       -0.86779 ,  0.6683  , -0.77849 , -0.23294 ,  0.68584 , -0.50269 ,
        0.39149 , -0.11991 , -0.58507 ,  1.0757  ,  0.55094 , -0.456   ,
       -0.66728 , -0.16855 , -0.18119 ,  1.4549  , -0.50935 ,  0.40332 ,
       -0.28225 ,  0.070051, -0.71228 , -0.33878 ,  0.34351 , -0.037361,
        0.47597 , -0.57553 ,  0.73109 , -0.43637 ,  0.27931 ,  0.4762  ,
       -0.86048 , -0.20388 ], dtype=float32)

In [5]:
{
    'walrus-fish': spatial.distance.euclidean(embeddings_dict['walrus'], embeddings_dict['fish']),
    'walrus-cat': spatial.distance.euclidean(embeddings_dict['walrus'], embeddings_dict['cat']),
    'walrus-lift': spatial.distance.euclidean(embeddings_dict['walrus'], embeddings_dict['lift']),
    'cat-cat': spatial.distance.euclidean(embeddings_dict['cat'], embeddings_dict['cat']),
    'cat-dog': spatial.distance.euclidean(embeddings_dict['cat'], embeddings_dict['dog']),
}

{'walrus-fish': 5.8787922859191895,
 'walrus-cat': 4.8616862297058105,
 'walrus-lift': 6.322596549987793,
 'cat-cat': 0.0,
 'cat-dog': 1.8846031427383423}

In [7]:
rough_tokenize('What\'s my best friend\'s hat colored like in Zurich?', embeddings_dict)

['my', 'best', 'friend', 'hat', 'colored', 'like']

In [14]:
question_vecs = load_question_vecs()
len(question_vecs)

76

In [15]:
search_questions('old house', question_vecs, embeddings_dict)[:5]

[{'id': 46, 'rank': (7.389457017118303, 3)},
 {'id': 47, 'rank': (8.77560891740643, 5)},
 {'id': 34, 'rank': (11.622080763863892, 3)},
 {'id': 35, 'rank': (11.622080763863892, 3)},
 {'id': 48, 'rank': (11.622080763863892, 4)}]

In [16]:
len(embeddings_dict)

400000

In [166]:
good_words = []
for w in embeddings_dict.keys():
    if rough_tokenize(w, embeddings_dict) == [w]:
        good_words.append(w)
print(len(good_words))

286849


In [173]:
res_count = 0
for i in range(1000):
    w = random.choice(good_words)
    res = search_questions(w, question_vecs, embeddings_dict)
    # print(w, len(res), res[0]['rank'] if len(res) else None)
    if len(res) and res[0]['rank'][0] < 20:
        res_count += 1
        print(w, '|', next(q['text'] for q in questions if q['id'] == res[0]['id']))
print(res_count)

sculpted | How many drinking fountains are there in Zurich?
discouragement | How many foreigners live in Zurich?
overabundance | How many foreigners live in Zurich?
ordering | How many tram and bus stops are there in Zurich?
cstc | How many tram and bus stops are there in Zurich?
grinning | How many foreigners live in Zurich?
awestruck | How many foreigners live in Zurich?
roofer | How many dentists are there in Zurich?
attempted | How many tram and bus stops are there in Zurich?
announcement | How does energy use change over the course of a week?
affront | How many foreigners live in Zurich?
sprawling | Where is the tallest building in Zurich?
conducive | What contributes to air pollution?
orthopedic | How many dentists are there in Zurich?
identical | How many buildings in Zurich are older than 100?
brown | How many people in Zurich live with kids?
bonda | How many tram and bus stops are there in Zurich?
forced | When were dogs born in Zurich?
comeback | How many apartments were buil

In [171]:
100e6 / (286849 * (220 / 1000))

1584.6157893018785

In [184]:
good_word_embeddings = np.array([
    embeddings_dict[w]
    for w in good_words
])
good_word_embeddings.shape

(286849, 50)

In [188]:
question_words = []
question_words_ids = []
for q in questions:
    for w in rough_tokenize(q['text'], embeddings_dict):
        if w not in embeddings_dict:
            continue
        question_words.append(w)
        question_words_ids.append(q['id'])
print(len(question_words))

291


In [189]:
question_word_embeddings = np.array([
    embeddings_dict[w]
    for w in question_words
])
question_word_embeddings.shape

(291, 50)

In [227]:
good_question_distances = pairwise_distances(good_word_embeddings, question_word_embeddings)
print(good_question_distances.shape)
good_question_distances = good_question_distances[good_question_distances.min(axis=1) <= np.sqrt(25)]
print(
    good_question_distances.shape,
    good_question_distances[123, 45],
    spatial.distance.euclidean(good_word_embeddings[123], question_word_embeddings[45]),
)

(286849, 291)
(62626, 291) 4.3970356 3.932997226715088


In [210]:
question_ids = [q['id'] for q in questions]
question_ids.sort()
assert question_ids[0] == 1
assert question_ids[-1] == len(question_ids)

temp = [None for q in questions]
for q in questions:
    word_indices = []
    for word_index, question_id in enumerate(question_words_ids):
        if question_id == q['id']:
            word_indices.append(word_index)
    temp[q['id'] - 1] = np.min(good_question_distances[:, word_indices], axis=1)
good_question_distances_grouped = np.array(temp).T
good_question_distances_grouped.shape

(286849, 76)

In [230]:
((62626 * 76) * 4) / 100e6

0.19038304

In [258]:
def chunk_name_from_word(w):
    l = 2
    return w[:l].ljust(l, '_')

word_indices_by_chunk = {}
for i, w in enumerate(good_words):
    chunk_name = chunk_name_from_word(w)
    if chunk_name not in word_indices_by_chunk:
        word_indices_by_chunk[chunk_name] = []
    word_indices_by_chunk[chunk_name].append(i)
    
(
    len(word_indices_by_chunk),
    len(word_indices_by_chunk[chunk_name_from_word('walrus')]),
    good_words[word_indices_by_chunk[chunk_name_from_word('walrus')][0]],
)

(698, 2014, 'war')