In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
plt.rcParams['figure.figsize'] = (30, 30)

import wikipedia

import re
import spacy
from bs4 import BeautifulSoup
from IPython.core.display import display, HTML

import numpy as np
import pandas as pd
import networkx as nx

from umap import UMAP
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.spatial.distance import cosine

from tqdm import tqdm_notebook as tqdm

In [4]:
nlp = spacy.load('en')

In [244]:
df = pd.read_json('data/calm_records.json')

### get average word vector for our collection-level record

In [245]:
# Pull the first 'AdminHistory' entry of the record labelled 269057 and
# reduce it to whitespace-normalised plain text for the NLP cells below.
record = df.loc[269057]['AdminHistory'][0]
# The field is parsed as HTML so that get_text() can return it without markup.
soup = BeautifulSoup(record, 'html.parser')
plain_text = soup.get_text()
# split() with no arguments breaks on any run of whitespace, so re-joining on
# a single space collapses newlines, tabs and repeated spaces.
plain_text = ' '.join(plain_text.split())


print(plain_text[:100])

The Abortion Law Reform Association (ALRA) was founded in 1935 for the legalisation of abortion in c


In [255]:
# Run the spaCy pipeline over the record text, then average the per-token
# vectors (axis 0 runs over tokens) into one document-level vector.
doc = nlp(plain_text)
doc_avg_wv = np.array([word.vector for word in doc]).mean(axis=0)

### get average word vector for a specific, known wikipedia page

In [42]:
h = wikipedia.page('Hinge_loss')
doc = nlp(h.content)
np.array([word.vector for word in doc]).mean(axis=0).shape

(384,)

In [225]:
known_page = nlp(wikipedia.page('dog').content)
known_avg_wv = np.array([word.vector for word in known_page]).mean(axis=0)

### get average word vectors for all pages in a wikipedia search

In [226]:
wikipedia.search('chicken')

['Chicken',
 'Chicken (disambiguation)',
 'Chicken as food',
 'KFC',
 'Chickenpox',
 'Chicken and rice',
 'Chicken wing',
 'Rooster',
 'Chickenhawk',
 'Curry']

In [227]:
# Empty container for per-page average vectors (not referenced again in the
# cells visible here). The dtype is spelled out because a bare pd.Series()
# warns on pandas 1.x and silently becomes object-typed from pandas 2.0;
# float64 is what older pandas produced by default.
possible_avg_wvs = pd.Series(dtype='float64')
# Keep every search hit whose title does not announce itself as a
# disambiguation page.
possible_page_titles = [title for title in wikipedia.search('chicken')
                        if 'disambiguation' not in title]
print(possible_page_titles)

['Chicken', 'Chicken as food', 'KFC', 'Chickenpox', 'Chicken and rice', 'Chicken wing', 'Rooster', 'Chickenhawk', 'Curry']


In [231]:
def get_pages(page_name):
    """Return the Wikipedia titles that `page_name` can be resolved to.

    Parameters
    ----------
    page_name : str
        Title (or search hit) to look up with ``wikipedia.page``.

    Returns
    -------
    list of str
        ``[page_name]`` when the lookup succeeds directly; the usable options
        of the disambiguation page when the lookup is ambiguous; an empty
        list when nothing can be resolved.

    Notes
    -----
    Only one level of disambiguation is followed. Options that are themselves
    ambiguous are skipped rather than expanded, because full recursion can
    branch endlessly. Titles that resolve to no page at all (``PageError``)
    are skipped too, so a single dead option cannot abort the whole
    collection.
    """
    try:
        # The page object is thrown away: the lookup is made purely to find
        # out whether it raises.
        wikipedia.page(page_name).url
        return [page_name]
    except wikipedia.exceptions.PageError:
        # Nothing matches this title, so there is nothing to offer.
        return []
    except wikipedia.exceptions.DisambiguationError as disambiguation:
        # Bind the options now; the exception name is unbound once the
        # except clause ends.
        options = disambiguation.options

    pages = []
    for option in options:
        try:
            wikipedia.page(option).url
        except (wikipedia.exceptions.DisambiguationError,
                wikipedia.exceptions.PageError):
            # Still ambiguous, or a dead entry: best-effort, move on.
            continue
        pages.append(option)
    return pages

In [232]:
def flatten(list_of_lists):
    """Concatenate the items of every inner iterable into one flat list.

    Only a single level of nesting is removed; the order of the items is
    preserved.
    """
    flat = []
    for inner in list_of_lists:
        flat.extend(inner)
    return flat

In [233]:
# Expand every search hit through get_pages() (one Wikipedia lookup per hit,
# more when a hit is a disambiguation page), flatten the per-hit lists and
# de-duplicate the titles. np.unique also returns them sorted.
all_page_names = np.unique(flatten([get_pages(page_name) 
                                    for page_name in possible_page_titles]))



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [234]:
# Reference vector: the token-vector average of the 'dog' page summary.
# NOTE(review): the candidate pages in the next cell are averaged over their
# full `.content`, whereas this uses `.summary` - confirm the mismatch is
# intended.
known_page = nlp(wikipedia.page('dog').summary)
known_avg_wv = np.array([word.vector for word in known_page]).mean(axis=0)

In [235]:
# For each candidate title: fetch the page, run its full content through
# spaCy and average the token vectors. The result is a Series keyed by page
# title whose values are the averaged vectors. Each title costs one
# Wikipedia request, which is why the loop is wrapped in a tqdm progress bar.
candidate_wvs = pd.Series({page_name: (np.array([word.vector for word in 
                                                 nlp(wikipedia.page(page_name).content)])
                                       .mean(axis=0))
                           for page_name in tqdm(all_page_names)})

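In the motivating example we have a load of text about swimming, and we want to figure out which version of the page 'pool' is most relevant. In the cells here, the known text is the Wikipedia summary for 'dog' and the candidate pages come from the 'chicken' search.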

In [236]:
# NOTE: this reuses the name `df`, which the record-extraction cell uses for
# the records loaded from data/calm_records.json; re-running that cell after
# this one would index this frame instead.
df = candidate_wvs.to_frame('word vector')

# scipy's cosine() is the cosine *distance* (1 - cosine similarity), so
# despite the column name, smaller values mean a closer match to known_avg_wv.
df['similarity'] = df['word vector'].apply(lambda avg_wv: cosine(avg_wv, known_avg_wv))
# The ascending sort therefore lists the closest candidates first.
df.sort_values(by='similarity')

Unnamed: 0,word vector,similarity
Chicken wings as food,"[0.51195973, 0.2876703, 0.24118368, 0.8079274,...",0.031837
Chicken as food,"[0.51195973, 0.2876703, 0.24118368, 0.8079274,...",0.031837
Chickenpox,"[0.53671205, 0.41329744, 0.21139945, 0.8837856...",0.04582
Red-tailed hawk,"[0.5274143, 0.30443037, 0.056660794, 0.8691285...",0.047592
Claypot chicken rice,"[0.479356, 0.52213734, 0.20710574, 0.82475156,...",0.051678
Cooper's hawk,"[0.45091563, 0.34084162, 0.24449615, 0.9361986...",0.053301
Chicken,"[0.34808967, 0.39554432, 0.34948257, 0.8447767...",0.05334
Curry,"[0.33137044, 0.35823157, 0.3451136, 0.7453335,...",0.063725
Hainanese chicken rice,"[0.21281174, 0.5049968, 0.32360315, 0.61717415...",0.068132
Chicken pilaf,"[0.25391656, 0.41949353, 0.11254915, 0.7203721...",0.068609


In [239]:
wikipedia.page('Chicken wings as food').url

'https://en.wikipedia.org/wiki/Chicken_as_food'

In [247]:
for ent in nlp(plain_text).ents:
    print(ent)

The Abortion Law Reform Association
1935
the 1967 Abortion Act
Association
Act
the United Kingdom
1803
1861
the Offences Against the Person Act 1929 Infant Life (Preservation) Act
1931-1932
McCardie
Leeds Assizes
1934
Cooperative Women's
Annual
1934
British Medical Association Committee
the Medical Aspects of Abortion
1936
1936
Three
Stella Browne
Harry Roberts
A Ludovici
1936
Foundation of Abortion Law Reform Association
ALRA
1936-1937
Joint Committee of Midwifery Induced Abortion Survey 1937 Interdepartmental Committee on Abortion
Norman Birkett
1939
Alec Bourne
14-year-old
Macnaghten
1944
ALRA
1947
Publication of Back Street Surgery
1948
Bergmann
Ferguson
1949
Eustace Chesser
'Society and Abortion'
1950
Chesser
1950
The Fallen Sparrow
Gateway Theatre Club 1950
1952
Joseph Reeves'
Bill
1954
Amulree
Bill
House of Lords
Second
Kenneth Robinson
House of Commons
1957
Lena Jeger
Douglas Houghton
1958
Newton
Stungo
1958
Publication
Glanville Williams
The Sanctity of Life
the Criminal Law
A

In [253]:
# Use the text of the first named entity spaCy finds in the record as the
# Wikipedia search query, then expand and de-duplicate the hits exactly as
# in the 'chicken' example.
# NOTE(review): `.ents[0]` assumes the record yields at least one entity.
all_page_names = np.unique(flatten([get_pages(page_name) 
                                    for page_name in wikipedia.search(str(nlp(plain_text).ents[0]))]))

# Average token vector of each candidate page's full content, keyed by title.
# This rebinds `candidate_wvs`, replacing the vectors from the 'chicken'
# example.
candidate_wvs = pd.Series({page_name: (np.array([word.vector for word in 
                                                 nlp(wikipedia.page(page_name).content)])
                                       .mean(axis=0))
                           for page_name in tqdm(all_page_names)})




Exception in thread Thread-6:
Traceback (most recent call last):
  File "/Users/pimharr/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/Users/pimharr/anaconda3/lib/python3.6/site-packages/tqdm/_monitor.py", line 62, in run
    for instance in self.tqdm_cls._instances:
  File "/Users/pimharr/anaconda3/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration






In [258]:
# NOTE: this rebinds the notebook-level name `df`, which is also used for the
# records loaded from data/calm_records.json.
df = candidate_wvs.to_frame('word vector')

# scipy's cosine() is the cosine *distance* (1 - cosine similarity), so
# despite the column name, smaller values mean a closer match to the
# record's average vector, doc_avg_wv.
df['similarity'] = df['word vector'].apply(lambda avg_wv: cosine(avg_wv, doc_avg_wv))
# The ascending sort therefore lists the closest candidates first.
df.sort_values(by='similarity')

Unnamed: 0,word vector,similarity
Vera Houghton,"[-0.4301998, 0.54130113, 0.92792326, 0.7759411...",0.063963
Abortion Law Reform Association of New Zealand,"[-0.21933034, 0.7977514, 0.5871409, 0.7833126,...",0.073753
Peter Diggory,"[-0.17954479, 0.5839987, 0.8003913, 1.1062534,...",0.082774
Abortion in New Zealand,"[0.10222111, 0.6283409, 0.5274474, 0.87226874,...",0.093775
Abortion Rights (organisation),"[0.09492731, 0.6478804, 0.5869149, 1.133949, 0...",0.135124
Alice Jenkins,"[-0.00235377, 0.5708108, 0.6733205, 1.0527755,...",0.145237
Abortion in the United Kingdom,"[0.33629876, 0.46168223, 0.46644717, 0.9045381...",0.145799
Stella Browne,"[0.24075787, 0.3299855, 0.40706477, 0.9012635,...",0.192801
Abortion-rights movements,"[0.3448882, 0.50422907, 0.29530606, 0.9066715,...",0.192875
History of abortion,"[0.3595639, 0.40332535, 0.2431662, 0.9069025, ...",0.220592


# try tfidf