In [106]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
plt.rcParams['figure.figsize'] = (30, 30)

import wikipedia

import re
import spacy
from bs4 import BeautifulSoup
from IPython.core.display import display, HTML

import numpy as np
import pandas as pd
import networkx as nx

from umap import UMAP
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.spatial.distance import cosine

from tqdm import tqdm_notebook as tqdm

In [79]:
# Load the spaCy English pipeline once; `nlp` is reused by the cells below.
# NOTE(review): 'en' is a shortcut name; newer spaCy releases expect a full
# package name (e.g. 'en_core_web_md'). Confirm which model this resolves to,
# because later cells rely on per-token `.vector` values.
nlp = spacy.load('en')

In [None]:
# Load the records from calm_records.json; a later cell selects one row by
# index label and reads its 'AdminHistory' field.
df = pd.read_json('../data/calm_records.json')

### get average word vector for our collection-level record

In [None]:
# First 'AdminHistory' entry of record 269229, stored as HTML.
record = df.loc[269229]['AdminHistory'][0]
soup = BeautifulSoup(record, 'html.parser')

# Drop the markup and collapse every run of whitespace into a single space.
plain_text = ' '.join(soup.get_text().split())

print(plain_text)

In [None]:
# Represent the record as the element-wise mean of its token vectors.
doc = nlp(plain_text)
doc_avg_wv = np.mean([token.vector for token in doc], axis=0)

### get average word vector for a specific, known wikipedia page

In [None]:
# Sanity check on a known article: display the shape of its averaged vector.
# NOTE: this rebinds `doc`, which previously held the parsed collection record.
h = wikipedia.page('Hinge_loss')
doc = nlp(h.content)
np.array([word.vector for word in doc]).mean(axis=0).shape

In [None]:
# Reference vector: mean token vector of the full 'dog' article text.
# NOTE: a later cell rebinds `known_page` / `known_avg_wv` using the page
# summary instead of the full content.
known_page = nlp(wikipedia.page('dog').content)
known_avg_wv = np.array([word.vector for word in known_page]).mean(axis=0)

### get average word vectors for all pages in wikipedia search 

In [None]:
# Exploratory: display the Wikipedia search results for the query 'pool'.
wikipedia.search('pool')

In [None]:
# Empty container (it is not filled by any of the cells shown here). The
# explicit dtype avoids the pandas warning raised when an empty Series is
# created without one.
possible_avg_wvs = pd.Series(dtype=object)

# Search results for 'chicken', minus titles flagged as disambiguation pages.
possible_page_titles = [title for title in wikipedia.search('chicken')
                        if 'disambiguation' not in title]
print(possible_page_titles)

In [48]:
def get_pages(page_name):
    """Return the Wikipedia titles that `page_name` resolves to unambiguously.

    If `page_name` loads directly, the result is ``[page_name]``. If it is a
    disambiguation page, every listed option is tried once and the options
    that load cleanly are returned, in the order Wikipedia lists them.
    Titles that Wikipedia reports as missing are skipped rather than
    aborting the whole lookup.

    Parameters
    ----------
    page_name : str
        Title to look up.

    Returns
    -------
    list of str
        Zero or more loadable page titles.
    """
    errors = wikipedia.exceptions
    try:
        # We only touch `.url` to force the page to load, which is what
        # surfaces disambiguation / missing-page errors.
        wikipedia.page(page_name).url
        return [page_name]
    except errors.PageError:
        return []
    except errors.DisambiguationError as disambiguation:
        options = disambiguation.options

    # Only one level of disambiguation is followed. That is enough for our
    # purposes, and a fully recursive walk can easily get caught in endless
    # loops and branches.
    pages = []
    for option in options:
        try:
            wikipedia.page(option).url
        except (errors.DisambiguationError, errors.PageError):
            continue
        pages.append(option)
    return pages

In [None]:
def flatten(list_of_lists):
    """Concatenate the items of every inner iterable into one flat list, in order."""
    flat = []
    for inner in list_of_lists:
        flat.extend(inner)
    return flat

In [None]:
# Expand every search hit into its unambiguous page titles, then de-duplicate.
all_page_names = np.unique(flatten(map(get_pages, possible_page_titles)))

In [None]:
# Reference vector: mean token vector of the 'dog' page *summary*.
# NOTE: this rebinds `known_page` / `known_avg_wv`, which an earlier cell
# computed from the full article content.
known_page = nlp(wikipedia.page('dog').summary)
known_avg_wv = np.array([word.vector for word in known_page]).mean(axis=0)

In [None]:
# For each candidate page, average the token vectors of the full article text.
candidate_wvs = pd.Series({
    title: np.mean(
        [token.vector for token in nlp(wikipedia.page(title).content)],
        axis=0,
    )
    for title in tqdm(all_page_names)
})

In our example we have a body of text about swimming, and we want to work out which of the pages matching 'pool' is the most relevant.

In [None]:
# Rank the candidate pages by how close their average vector is to the known
# page's. A dedicated name is used so that the records loaded into `df`
# earlier in the notebook are not overwritten.
known_page_ranking = candidate_wvs.to_frame('word vector')

# scipy's `cosine` is a distance, so despite the column name a lower value
# means a closer match; the ascending sort puts the best candidates first.
known_page_ranking['similarity'] = known_page_ranking['word vector'].apply(
    lambda avg_wv: cosine(avg_wv, known_avg_wv))
known_page_ranking.sort_values(by='similarity')

In [None]:
# Exploratory: display the URL of one of the candidate pages.
wikipedia.page('Chicken wings as food').url

In [None]:
# Print every entity spaCy extracts from the cleaned record text.
for ent in nlp(plain_text).ents:
    print(ent)

In [None]:
# Search Wikipedia for the first entity found in the record text, expand each
# hit into its unambiguous page titles, and de-duplicate.
all_page_names = np.unique(flatten(map(
    get_pages,
    wikipedia.search(str(nlp(plain_text).ents[0])),
)))

# For each candidate page, average the token vectors of the full article text.
candidate_wvs = pd.Series({
    title: np.mean(
        [token.vector for token in nlp(wikipedia.page(title).content)],
        axis=0,
    )
    for title in tqdm(all_page_names)
})

In [None]:
# Rank the candidate pages by how close their average vector is to the
# collection record's. A dedicated name is used so that `df` is not rebound
# to this unrelated frame.
record_ranking = candidate_wvs.to_frame('word vector')

# scipy's `cosine` is a distance, so despite the column name a lower value
# means a closer match; the ascending sort puts the best candidates first.
record_ranking['similarity'] = record_ranking['word vector'].apply(
    lambda avg_wv: cosine(avg_wv, doc_avg_wv))
record_ranking.sort_values(by='similarity')

In [None]:
# Exploratory: display the Wikipedia search results for the query 'dog'.
wikipedia.search('dog')

In [None]:
# Summary text of the 'chicken' page (not referenced again in the cells shown here).
string_2 = wikipedia.page('chicken').summary

# TODO: try a (ratio'd) set intersection of words

# TODO: try tf-idf

# TODO: try GloVe

In [62]:
# NOTE(review): absolute, machine-specific path; point this at your own copy
# of the GloVe 6B 300-d vectors before running.
GLOVE_PATH = '/Users/pimh/Downloads/glove.6B/glove.6B.300d.txt'

# Build a {token: float32 vector} lookup. Each line of the file is
# "<token> <v1> <v2> ...", separated by whitespace.
glove = {}
with open(GLOVE_PATH, encoding='utf-8') as glove_file:
    # Stream the file line by line instead of reading it all and splitting,
    # which would hold two full copies of the text in memory.
    for raw_line in tqdm(glove_file):
        fields = raw_line.split()
        if not fields:
            # Blank line (e.g. the trailing newline at end of file).
            continue
        try:
            glove[fields[0]] = np.array(fields[1:], dtype=np.float32)
        except ValueError:
            # Malformed row whose values are not all numeric: skip it, but let
            # anything else (KeyboardInterrupt, MemoryError, ...) propagate.
            continue

In [63]:
# Exploratory: display the summary text of the 'swimming' page.
wikipedia.page('swimming').summary

'Swimming is the self-propulsion of a person through fresh or salt water, usually for recreation, sport, exercise, or survival. Locomotion is achieved through coordinated movement of the limbs, the body, or both. Humans can hold their breath underwater and undertake rudimentary locomotive swimming within weeks of birth, as an evolutionary response.\nSwimming is consistently among top public recreational activities, and in some countries, swimming lessons are a compulsory part of the educational curriculum. As a formalized sport, swimming features in a range of local, national, and international competitions, including every modern Summer Olympics.'

In [125]:
# Lower-cased article text for every page that 'amazon' resolves to.
# NOTE: despite the variable name, this stores the full `.content` of each
# page, not its `.summary`.
candidate_page_summaries = pd.Series({page: wikipedia.page(page).content.lower()
                                      for page in tqdm(get_pages('amazon'))})



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [126]:
# Reference document: lower-cased text of the 'rainforest' article and its
# document vector.
# NOTE: `get_doc_vector` is defined in the following cell, so on a fresh
# kernel that cell has to be run before this one.
known_summary = wikipedia.page('rainforest').content.lower()
known_wv = get_doc_vector(known_summary)

In [127]:
def get_doc_vector(input_text):
    """Return the mean GloVe vector of the tokens in `input_text`.

    The text is tokenised with the notebook-level `nlp` pipeline and each
    token's string form is looked up in the `glove` dictionary; tokens with
    no entry are ignored.

    Parameters
    ----------
    input_text : str
        Text to embed.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors that were found.

    Raises
    ------
    ValueError
        If no token of `input_text` has a GloVe vector (e.g. empty text).
    """
    vectors = [glove[str(word)] for word in nlp(input_text)
               if str(word) in glove]
    if not vectors:
        # np.stack([]) would also raise ValueError, but with a message that
        # does not say which input was at fault.
        raise ValueError('no token of the input text has a GloVe vector')
    return np.stack(vectors).mean(axis=0)

In [128]:
# One document vector per candidate page, held in a single-column frame ('wv').
frame = candidate_page_summaries.apply(get_doc_vector).to_frame('wv')

In [129]:
# Adds a column to `frame` in place. `cosine` is imported from
# scipy.spatial.distance, so the value is a distance: smaller means closer,
# despite the column being called 'similarity'.
frame['similarity'] = frame['wv'].apply(lambda x: cosine(x, known_wv))

In [130]:
# Ascending order, so the smallest 'similarity' values are listed first.
frame.sort_values(by='similarity')

Unnamed: 0,wv,similarity
Amazon rainforest,"[-0.13217017, 0.09086505, -0.0535981, -0.11312...",0.010135
Amazon basin,"[-0.10516881, 0.11370765, -0.053919222, -0.132...",0.010877
Amazon Reef,"[-0.06606301, 0.10743145, -0.054334152, -0.128...",0.027716
Amazon River,"[-0.1087415, 0.064478986, -0.051910195, -0.137...",0.027809
Amazonen-Werke,"[-0.095847264, 0.10285617, -0.035618577, -0.17...",0.047389
Diablo II,"[-0.117192894, 0.124908805, -0.013397733, -0.1...",0.048475
Takembeng,"[-0.095976144, 0.066766486, -0.008813564, -0.1...",0.050255
Game of the Amazons,"[-0.12012394, 0.116951585, -0.026628084, -0.13...",0.051603
equestrians,"[-0.09384323, 0.0854721, 0.009414364, -0.13802...",0.053917
Amazon Creek,"[-0.15647501, -0.009726174, -0.06234486, -0.10...",0.056518
