In [56]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import spatial
from sklearn.manifold import TSNE
import zipfile
from tqdm import tqdm
import re

In [53]:
import os
import sys

# Put the parent of the current working directory on the import path so the
# `src` package imported below can be resolved from inside the notebook.
module_path = os.path.abspath(os.pardir)
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)
    
%load_ext autoreload
%autoreload 2

from src.path import get_path_from_root
from src.questions import questions

Exception ignored in: <function tqdm.__del__ at 0x7f95bea15940>
Traceback (most recent call last):
  File "/nix/store/6gzjqk57c8jg7yai948yn0bc6rjz1zd9-python3-3.9.6-env/lib/python3.9/site-packages/tqdm/std.py", line 1145, in __del__
    self.close()
  File "/nix/store/6gzjqk57c8jg7yai948yn0bc6rjz1zd9-python3-3.9.6-env/lib/python3.9/site-packages/tqdm/notebook.py", line 283, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
# Build a word -> float32 vector lookup by streaming the GloVe text file
# directly out of the zip archive (one "word v1 v2 ..." record per line).
# `total` only sizes the progress bar; it is the expected number of lines.
embeddings_dict = {}
with zipfile.ZipFile(str(get_path_from_root('data/search/glove.6B.csv')), 'r') as archive, \
        archive.open("glove.6B.50d.txt", 'r') as f:
    for line in tqdm(f, total=400000):
        # Lines arrive as bytes; the first field is the word, the rest its vector.
        word, *values = line.decode('utf-8').strip().split(' ')
        embeddings_dict[word] = np.asarray(values, "float32")

100%|██████████| 400000/400000 [00:07<00:00, 50705.29it/s]


In [47]:
# Spot-check: display the embedding vector stored for a single word.
embeddings_dict['walrus']

array([ 0.48698 , -0.45614 , -0.88501 , -0.58914 ,  0.75248 ,  0.1977  ,
       -0.31347 , -0.67976 , -0.25581 , -0.86747 ,  0.95085 ,  0.62886 ,
        0.98183 ,  0.68557 ,  0.38731 ,  0.63174 ,  0.70197 ,  0.85176 ,
       -0.86779 ,  0.6683  , -0.77849 , -0.23294 ,  0.68584 , -0.50269 ,
        0.39149 , -0.11991 , -0.58507 ,  1.0757  ,  0.55094 , -0.456   ,
       -0.66728 , -0.16855 , -0.18119 ,  1.4549  , -0.50935 ,  0.40332 ,
       -0.28225 ,  0.070051, -0.71228 , -0.33878 ,  0.34351 , -0.037361,
        0.47597 , -0.57553 ,  0.73109 , -0.43637 ,  0.27931 ,  0.4762  ,
       -0.86048 , -0.20388 ], dtype=float32)

In [51]:
# Euclidean distance between the embeddings of a few word pairs, keyed as
# "<first>-<second>"; 'cat-cat' is the zero-distance reference case.
{
    f'{first}-{second}': spatial.distance.euclidean(
        embeddings_dict[first], embeddings_dict[second]
    )
    for first, second in [
        ('walrus', 'fish'),
        ('walrus', 'cat'),
        ('walrus', 'lift'),
        ('cat', 'cat'),
        ('cat', 'dog'),
    ]
}

{'walrus-fish': 5.8787922859191895,
 'walrus-cat': 4.8616862297058105,
 'walrus-lift': 6.322596549987793,
 'cat-cat': 0.0,
 'cat-dog': 1.8846031427383423}

In [181]:
# Words dropped during tokenization (compared after lower-casing).
stop_words = set('a the does did in zurich use how there'.split(' '))

def rough_tokenize(s):
    """Split ``s`` into lower-case word tokens, skipping stop words.

    Apostrophes are removed first (so "what's" becomes "whats"); every other
    character outside a-z / A-Z is treated as a separator.

    Returns:
        list[str]: tokens in their original order, without stop words.
    """
    without_apostrophes = s.replace("'", '')
    letters_only = re.sub(r'[^a-zA-Z]', ' ', without_apostrophes)

    tokens = []
    for raw_word in letters_only.split():
        lowered = raw_word.lower()
        if lowered in stop_words:
            continue
        tokens.append(lowered)
    return tokens

rough_tokenize('What\'s my best friend\'s hat colored like in Zurich?')

['whats', 'my', 'best', 'friends', 'hat', 'colored', 'like']

In [136]:
def rough_vecs_from_scentence(s):
    """Return the embedding vector of every known word in sentence ``s``.

    The sentence is tokenized with ``rough_tokenize``; tokens that have no
    entry in ``embeddings_dict`` are skipped, so the result may be shorter
    than the token list (or empty).
    """
    vectors = []
    for token in rough_tokenize(s):
        if token not in embeddings_dict:
            continue
        vectors.append(embeddings_dict[token])
    return vectors

# Pre-compute the word vectors of every question's text once, in the same
# order as `questions`, so each search only has to embed the query.
question_vecs = list(
    map(rough_vecs_from_scentence, (entry['text'] for entry in questions))
)
# Number of embedded words in the first question.
len(question_vecs[0])

6

In [189]:
def make_rank_question(query):
    query_word_vecs = rough_vecs_from_scentence(query)
    
    def rank_question(question_word_vecs):
        distances_per_query_word = np.array([
            min(
                spatial.distance.euclidean(query_word_vec, question_word_vec)
                for question_word_vec in question_word_vecs
            )
            for query_word_vec in query_word_vecs
        ])
        rank = np.sum(distances_per_query_word ** 2)
        return np.nan_to_num(rank, np.inf)
    
    return rank_question

In [199]:
# Score every question against the query and show the five best matches
# (lowest rank first).
rank_question = make_rank_question('old buildings')
temp = sorted(
    (
        {'text': question['text'], 'rank': rank_question(word_vecs)}
        for question, word_vecs in zip(questions, question_vecs)
    ),
    key=lambda scored: scored['rank'],
)
temp[:5]

[{'text': 'How old is the oldest building in Zurich?',
  'rank': 8.483619540857944},
 {'text': 'How many buildings in Zurich are older than 100?',
  'rank': 20.60789794227844},
 {'text': 'How many single-family houses are there in Zurich?',
  'rank': 22.799968243924468},
 {'text': 'When was the oldest building in Zurich built?',
  'rank': 24.698596945160944},
 {'text': 'Where is the tallest building in Zurich?',
  'rank': 26.3936886691381}]