In [1]:
import numpy as np
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from urllib.request import urlopen
import string
from sklearn.metrics.pairwise import cosine_similarity
import json
import pickle

In [2]:
# Stop words are the most common English words (e.g. "the", "and"), which may not convey much meaning.
# They are filtered out of the article text before it is vectorized.
# NOTE(review): stopwords.words() needs the NLTK 'stopwords' corpus to be available locally — confirm it has
# been downloaded (nltk.download('stopwords')) before running this on a fresh machine.
stop_words = stopwords.words('english')

In [3]:
# Download the GloVe Model
!curl -O 'http://nlp.stanford.edu/data/glove.6B.zip'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   308  100   308    0     0   2026      0 --:--:-- --:--:-- --:--:--  2026


In [4]:
!unzip 'glove.6B.zip'

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [5]:
# Peek at the first line of the 50d file to see how the data is laid out.
# The file is opened as UTF-8 explicitly so the result does not depend on the platform's default
# encoding; the with-block closes the file on exit, so no explicit close() is needed.
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as file:
    first_line = file.readline()

In [254]:
# Each line begins with the word and continues with the components of its vector, separated by spaces and
# all entered as text. The trailing '\n' is still attached because readline() keeps the line ending.
first_line

'the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581\n'

In [23]:
# Run this if you only want to use the smallest GloVe vector representation
# We form a dictionary of words. key = word, value = vector for the word embedding model (a list of floats)
# The file is read as UTF-8 explicitly so parsing does not depend on the platform's default encoding.
# Each line is split only once, blank lines are skipped, and the with-block closes the file by itself.
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as file:
    words = {
        tokens[0]: [float(x) for x in tokens[1:]]
        for tokens in (line.split() for line in file)
        if tokens
    }
# Optional: cache the parsed dictionary so that later sessions can skip the parsing step.
# with open('glove_50d.pickle', 'wb') as f:
#     pickle.dump(words, f)

In [10]:
# We form a dictionary of words. key = word, value = vector for the word embedding model
# We form 3 dictionaries corresponding to the 3 different word embeddings that we got from GloVe,
# stored in glove_dict under the keys '50d', '100d' and '200d'.
# We ignore 300D because it's pretty big
file_names = ['glove.6B.50d.txt','glove.6B.100d.txt','glove.6B.200d.txt']
glove_dict = {}

for file_name in file_names:
    # 'glove.6B.50d.txt' -> '50d': the dimensionality label is the third dot-separated field.
    dim_label = file_name.split('.')[2]
    # UTF-8 is requested explicitly so parsing does not depend on the platform's default encoding.
    # Each line is split only once, blank lines are skipped, and the with-block closes the file.
    with open(file_name, 'r', encoding='utf-8') as file:
        glove_dict[dim_label] = {
            tokens[0]: [float(x) for x in tokens[1:]]
            for tokens in (line.split() for line in file)
            if tokens
        }

In [394]:
# Store the parsed dictionaries as a pickle file so they can be reloaded without re-parsing the text files.
# The with-block closes the file on exit, so no explicit close() is needed.
with open('glove_dictionaries.pickle', 'wb') as f:
    pickle.dump(glove_dict, f)

In [395]:
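# Optional: reload the cached dictionaries instead of re-parsing the GloVe text files.
# Only unpickle a file you created yourself; loading an untrusted pickle can execute arbitrary code.
# with open('glove_dictionaries.pickle', 'rb') as f:
#     glove_dict = pickle.load(f)
# f.close()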

In [218]:
# # Optional: remove the stop words from every GloVe dictionary.
# # Not needed here, because the stop words are taken out of the article text instead.
# for k,v in glove_dict.items():
#     for w in stop_words:
#         v.pop(w, None)

In [11]:
# Here is our vectorizer class. Its transform method turns a document (a sequence of words) into a single vector,
# the mean of all the word vectors. A word that is not in the dictionary counts as the zero vector.
class WordVectorizer50DMean(object):
    """Embed a document as the mean of its word vectors.

    Despite the name, any embedding size works: the dimensionality is read
    from the dictionary itself.

    Parameters
    ----------
    worddict : dict
        Maps each word to its embedding (a sequence of floats). All
        embeddings are expected to have the same length.

    Raises
    ------
    ValueError
        If ``worddict`` is empty, because the vector length cannot be
        determined.
    """

    def __init__(self, worddict):
        if not worddict:
            raise ValueError("worddict must contain at least one word vector")
        self.worddict = worddict
        # Read the length from an arbitrary entry instead of the hard-coded
        # key 'hello', which a custom dictionary may not contain.
        self.dim = len(next(iter(worddict.values())))  # length of the word vectors

    def transform(self, X):
        """Return the mean embedding of the words in ``X``.

        Parameters
        ----------
        X : iterable of str
            The words of one document.

        Returns
        -------
        numpy.ndarray
            Vector of length ``self.dim``. Unknown words contribute a zero
            vector to the mean; an empty document yields the zero vector.
        """
        zero = np.zeros(self.dim)
        vectors = [self.worddict.get(word, zero) for word in X]
        if not vectors:
            # np.mean over an empty list would give a scalar nan (plus a
            # RuntimeWarning), which callers cannot turn into a list.
            return zero
        return np.mean(vectors, axis=0)

In [12]:
# Example of a word vector: the 50d embedding of 'hello', stored as a plain list of floats.
glove_dict['50d']['hello']

[-0.38497,
 0.80092,
 0.064106,
 -0.28355,
 -0.026759,
 -0.34532,
 -0.64253,
 -0.11729,
 -0.33257,
 0.55243,
 -0.087813,
 0.9035,
 0.47102,
 0.56657,
 0.6985,
 -0.35229,
 -0.86542,
 0.90573,
 0.03576,
 -0.071705,
 -0.12327,
 0.54923,
 0.47005,
 0.35572,
 1.2611,
 -0.67581,
 -0.94983,
 0.68666,
 0.3871,
 -1.3492,
 0.63512,
 0.46416,
 -0.48814,
 0.83827,
 -0.9246,
 -0.33722,
 0.53741,
 -1.0616,
 -0.081403,
 -0.67111,
 0.30923,
 -0.3923,
 -0.55002,
 -0.68827,
 0.58049,
 -0.11626,
 0.013139,
 -0.57654,
 0.048833,
 0.67204]

In [13]:
# Initialize a vectorizer called model, using our 50d dictionary.
# `model` is a notebook-level name: the scraper functions and the Borowitz cell further down use it directly.
words = glove_dict['50d']
model = WordVectorizer50DMean(words)

In [14]:
# New York Times article scraper
# Returns article text transformed according to our model
def get_nytimes_text(page_url, body_class="css-18sbwfn StoryBodyCompanionColumn"):
    """Download a New York Times article and embed it with the notebook-level ``model``.

    Parameters
    ----------
    page_url : str
        URL of the article.
    body_class : str, optional
        Value of the ``class`` attribute of the ``<div>`` elements that hold
        the story text. The default was updated on August 9, 2018; pass a
        different value if the page markup has changed since.

    Returns
    -------
    list
        A one-element list whose only item is the article vector as a list
        of floats, ready to be appended to a list of article vectors.

    Raises
    ------
    ValueError
        If no words are left after cleaning, e.g. because no element matched
        ``body_class``.
    """
    # The with-block closes the HTTP response once the page has been parsed.
    with urlopen(page_url) as page:
        soup = BeautifulSoup(page, "html.parser")
    sections = soup.find_all("div", attrs={"class": body_class})
    raw_text = ' '.join(section.text.strip() for section in sections)

    # Clean in a single pass: quotes, dashes and ellipses become spaces so the words around
    # them stay separate, while apostrophes and all other ASCII punctuation are deleted.
    # Stripping string.punctuation first and replacing afterwards removed '-' and '"' before
    # they could be turned into spaces, which glued hyphenated words together.
    char_map = {ord(char): None for char in string.punctuation + "’"}
    char_map.update({ord(char): ' ' for char in '"“”-—…'})
    tokens = raw_text.lower().translate(char_map).split()

    # The tokens no longer contain apostrophes, so the stop words are matched both as listed
    # and with their apostrophes removed; otherwise a listed contraction could never match.
    ignored = set(stop_words) | {word.replace("'", '') for word in stop_words}
    tokens = [word for word in tokens if word not in ignored]
    if not tokens:
        raise ValueError("No article text found at %s; the page markup may have changed." % page_url)
    return [list(model.transform(tokens))]

In [15]:
# Onion article scraper
# Returns article text transformed according to our model
def get_onion_text(page_url):
    """Download an Onion article and embed it with the notebook-level ``model``.

    The article text is taken from the ``<p>`` elements selected with
    ``class=None``.

    Parameters
    ----------
    page_url : str
        URL of the article.

    Returns
    -------
    list
        A one-element list whose only item is the article vector as a list
        of floats, ready to be appended to a list of article vectors.

    Raises
    ------
    ValueError
        If no words are left after cleaning, e.g. because no paragraph
        matched.
    """
    # The with-block closes the HTTP response once the page has been parsed.
    with urlopen(page_url) as page:
        soup = BeautifulSoup(page, "html.parser")
    paragraphs = soup.find_all("p", attrs={"class": None})
    raw_text = ' '.join(paragraph.text.strip() for paragraph in paragraphs)

    # Clean in a single pass: quotes, dashes and ellipses become spaces so the words around
    # them stay separate, while apostrophes and all other ASCII punctuation are deleted.
    # Stripping string.punctuation first and replacing afterwards removed '-' and '"' before
    # they could be turned into spaces, which glued hyphenated words together.
    char_map = {ord(char): None for char in string.punctuation + "’"}
    char_map.update({ord(char): ' ' for char in '"“”-—…'})
    tokens = raw_text.lower().translate(char_map).split()

    # The tokens no longer contain apostrophes, so the stop words are matched both as listed
    # and with their apostrophes removed; otherwise a listed contraction could never match.
    ignored = set(stop_words) | {word.replace("'", '') for word in stop_words}
    tokens = [word for word in tokens if word not in ignored]
    if not tokens:
        raise ValueError("No article text found at %s; the page markup may have changed." % page_url)
    return [list(model.transform(tokens))]

In [16]:
# Load the JSON (derived from final_borowitz.json which was cleaned), containing all our Borowitz Report articles
# These articles are free of stop words
# The file is opened as UTF-8 explicitly so loading does not depend on the platform's default encoding;
# the with-block closes the file on exit, so no explicit close() is needed.
with open('final_borowitz_glove.json','r', encoding='utf-8') as file:
    borowitz = json.load(file)

In [17]:
# Transform every Borowitz article according to our model.
# The result is a list of lists: one transformed article per entry, in the same order as `borowitz`.
# Element 3 of each record is what gets fed to the model (presumably the article's words — TODO confirm).
borowitz_articles = [list(model.transform(record[3])) for record in borowitz]

In [18]:
# The article we want to compare against the Borowitz Reports
nytimes_url = 'https://www.nytimes.com/2018/08/12/technology/google-facebook-dominance-hurts-ad-tech-firms-speeding-consolidation.html'

In [19]:
# Here is our New York Times text, transformed: a one-element list holding the article's vector
nytimes_text = get_nytimes_text(nytimes_url)

In [20]:
# We combine everything into one 2-D array: one row per Borowitz article, with the given article as the last row.
# A new list is built instead of extending borowitz_articles in place, so re-running this cell cannot
# append the given article a second time. A regular ndarray is used because NumPy discourages np.matrix.
M = np.array(borowitz_articles + nytimes_text)

In [21]:
# The cosine_similarity function takes in a matrix M of row vectors and computes the cosine of the angle between
# every pair of row vectors. In the returned matrix, the i,j entry is the cosine of the angle between
# row vector i and row vector j. Since the angle between every row and itself is 0, the diagonal consists of ones.
similarity_scores = cosine_similarity(M)

In [22]:
# The last row of the matrix of scores contains the cosines of the angles between our given article (NY Times in this
# case) and every Borowitz article. We only care about the first n-1 entries (the last entry compares the
# given article with itself) and rank them in descending order.
# We find the top two cosines, corresponding to the two Borowitz Reports that are most similar to our given article.
# A stable argsort on the negated scores yields the positions directly, highest cosine first, with ties
# broken by position. Looking positions up by value with np.where returned the same index twice whenever
# the top two scores were equal.
n = similarity_scores.shape[0]
last_row = similarity_scores[n-1, :n-1]
ranking = np.argsort(-last_row, kind='stable')
sorted_last_row = last_row[ranking]  # scores in descending order, kept for inspection
i, j = ranking[:2]
print(borowitz[i][0], borowitz[i][2], borowitz[j][0], borowitz[j][2], i, j)

A Letter from Mark Zuckerberg | The New Yorker https://www.newyorker.com/humor/borowitz-report/a-letter-from-mark-zuckerberg Used Tool Purchased for $1.8 Million | The New Yorker https://www.newyorker.com/humor/borowitz-report/used-tool-purchased-1-8-million 931 555
