In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import spatial
from sklearn.manifold import TSNE
import zipfile
from tqdm import tqdm
import re

In [2]:
import os
import sys

# Put the parent directory of the current working directory on the import
# path (once), ahead of the `src.*` imports further down in this cell.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)
    
%load_ext autoreload
%autoreload 2

from src.path import get_path_from_root
from src.questions import questions
from src.search import load_embeddings_dict, rough_tokenize, rough_vecs_from_scentence, make_rank_question

In [3]:
# Load the word -> vector lookup through the project helper from src.search.
embeddings_dict = load_embeddings_dict()
# Number of entries in the lookup (400000 in the recorded run).
print(len(embeddings_dict))
# Show one sample vector; the recorded output is a 50-element float32 array.
embeddings_dict['walrus']

400000


array([ 0.48698 , -0.45614 , -0.88501 , -0.58914 ,  0.75248 ,  0.1977  ,
       -0.31347 , -0.67976 , -0.25581 , -0.86747 ,  0.95085 ,  0.62886 ,
        0.98183 ,  0.68557 ,  0.38731 ,  0.63174 ,  0.70197 ,  0.85176 ,
       -0.86779 ,  0.6683  , -0.77849 , -0.23294 ,  0.68584 , -0.50269 ,
        0.39149 , -0.11991 , -0.58507 ,  1.0757  ,  0.55094 , -0.456   ,
       -0.66728 , -0.16855 , -0.18119 ,  1.4549  , -0.50935 ,  0.40332 ,
       -0.28225 ,  0.070051, -0.71228 , -0.33878 ,  0.34351 , -0.037361,
        0.47597 , -0.57553 ,  0.73109 , -0.43637 ,  0.27931 ,  0.4762  ,
       -0.86048 , -0.20388 ], dtype=float32)

In [4]:
# Euclidean distance between the embedding vectors of a few word pairs,
# keyed as '<first>-<second>'. The 'cat-cat' pair compares a vector with
# itself and serves as a zero-distance sanity check.
{
    f'{first}-{second}': spatial.distance.euclidean(
        embeddings_dict[first], embeddings_dict[second]
    )
    for first, second in [
        ('walrus', 'fish'),
        ('walrus', 'cat'),
        ('walrus', 'lift'),
        ('cat', 'cat'),
        ('cat', 'dog'),
    ]
}

{'walrus-fish': 5.8787922859191895,
 'walrus-cat': 4.8616862297058105,
 'walrus-lift': 6.322596549987793,
 'cat-cat': 0.0,
 'cat-dog': 1.8846031427383423}

In [5]:
# Tokenizer smoke test. In the recorded output the apostrophes are stripped,
# tokens are lower-cased, and 'in' / 'Zurich' do not appear -- presumably
# filtered inside rough_tokenize (TODO confirm against src.search).
rough_tokenize("What's my best friend's hat colored like in Zurich?")

['whats', 'my', 'best', 'friends', 'hat', 'colored', 'like']

In [6]:
# Run each question's text through the project's sentence-to-vectors helper,
# keeping the results in the same order as `questions`.
question_vecs = [
    rough_vecs_from_scentence(question['text'], embeddings_dict) for question in questions
]
# Length of the first question's result (4 in the recorded run) as a quick
# sanity check -- presumably one vector per retained token (TODO confirm).
len(question_vecs[0])

4

In [7]:
# Build a scoring callable for the query 'old buildings'; it is applied to
# each question's precomputed vectors below.
rank_question = make_rank_question('old buildings', embeddings_dict)

# Pair every question text with its score and order ascending by 'rank'.
# In the recorded run the lowest-ranked entries are the questions about
# old buildings, so a lower rank presumably means a closer match.
temp = sorted(
    (
        {'text': question['text'], 'rank': rank_question(vecs)}
        for question, vecs in zip(questions, question_vecs)
    ),
    key=lambda entry: entry['rank'],
)

# Show the five best-scoring questions.
temp[:5]

[{'text': 'How old is the oldest building in Zurich?',
  'rank': 8.483619540857944},
 {'text': 'How many buildings in Zurich are older than 100?',
  'rank': 20.60789794227844},
 {'text': 'How many single-family houses are there in Zurich?',
  'rank': 22.799968243924468},
 {'text': 'When was the oldest building in Zurich built?',
  'rank': 24.698596945160944},
 {'text': 'Where is the tallest building in Zurich?',
  'rank': 26.3936886691381}]