# Recommend Similar News Articles
This notebook demonstrates how to use bag-of-words vectors and cosine similarity for news article recommendation.

In [0]:
import re
import math
import pandas as pd
from collections import Counter

# Fetching the Corpus
`get_corpus()` reads the CSV file and returns a list of the news headlines.

In [0]:
def get_corpus(url='https://raw.githubusercontent.com/bshmueli/108-nlp/master/reuters.csv'):
  """Load the news dataset and return its headlines.

  Args:
    url: Location of the CSV file. It is passed straight to
      ``pd.read_csv``, so a URL, a local path or an open file-like object
      all work. Defaults to the Reuters dataset used in this notebook
      (short link: https://bit.ly/nlp-reuters). The file must contain a
      ``title`` column.

  Returns:
    A list with the values of the ``title`` column, in file order.

  Side effects:
    Prints the dataset's column index and its number of rows.
  """
  articles = pd.read_csv(url)
  print("Dataset columns", articles.columns)
  print("Dataset size", len(articles))
  return articles['title'].to_list()

In [0]:
def tokenize(document):
  """Split ``document`` into a list of tokens on single space characters.

  Only the literal ' ' is treated as a separator: case and punctuation are
  left untouched, and consecutive spaces produce empty-string tokens.
  """
  return document.split(' ')

# Computing word frequencies
`get_vocab(corpus)` computes the word frequencies in a given corpus. It returns a `Counter` that maps each token to its frequency; calling `.most_common(n)` on it gives a list of `(token, frequency)` 2-tuples for the `n` most frequent tokens.

In [0]:
def get_vocab(corpus):
  """Count how often each token occurs across every document in ``corpus``.

  Args:
    corpus: iterable of document strings; each one is split with ``tokenize``.

  Returns:
    A ``Counter`` mapping token -> number of occurrences in the whole corpus.
  """
  return Counter(token for document in corpus for token in tokenize(document))

In [0]:
def df():
  """Placeholder that was left without a body in the original notebook.

  NOTE(review): the name suggests a document-frequency helper, but nothing
  in the notebook defines its behaviour or calls it -- confirm the intent
  before implementing.

  Raises:
    NotImplementedError: always, until a real implementation is written.
  """
  raise NotImplementedError("df() has not been implemented yet")

# Compute BoW (Bag-of-Words) Vector
`doc2vec(doc)` returns a bag-of-words vector for document `doc`: one entry per token in `vocab`, equal to 1 if that token is present in `doc` and 0 otherwise.

In [0]:
def doc2vec(doc):
  """Return the binary bag-of-words vector of ``doc`` over the global ``vocab``.

  Args:
    doc: document string; it is split with ``tokenize``.

  Returns:
    A list with one entry per ``(token, frequency)`` pair in ``vocab``, in the
    same order: 1 if the token occurs in ``doc``, otherwise 0.
  """
  # A set gives constant-time membership tests; the token list would be
  # rescanned once for every vocabulary entry.
  present = set(tokenize(doc))
  return [1 if token in present else 0 for token, _ in vocab]


Compute the Bag-of-Words vector for each document

Cosine similarity between two numerical vectors

In [0]:
def cosine_similarity(vec_a, vec_b):
  """Return the cosine of the angle between two equal-length numeric vectors.

  Args:
    vec_a: sequence of numbers.
    vec_b: sequence of numbers with the same length as ``vec_a``.

  Returns:
    ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero norm
    (the cosine is undefined there, so it is reported as "no similarity").

  Raises:
    AssertionError: if the two vectors differ in length.
  """
  assert len(vec_a) == len(vec_b)
  a_2 = sum(x * x for x in vec_a)
  b_2 = sum(y * y for y in vec_b)
  # Guard on the squared norms, not on sum(vec): a vector such as [1, -1]
  # sums to zero yet has a non-zero norm and a well-defined cosine.
  if a_2 == 0 or b_2 == 0:
    return 0.0
  a_b = sum(x * y for x, y in zip(vec_a, vec_b))
  return a_b / (math.sqrt(a_2) * math.sqrt(b_2))

In [0]:
def doc_similarity(doc_a, doc_b):
  """Return the cosine similarity of the bag-of-words vectors of two documents.

  Both documents are vectorised with ``doc2vec`` and the resulting vectors
  are compared with ``cosine_similarity``.
  """
  vec_a = doc2vec(doc_a)
  vec_b = doc2vec(doc_b)
  return cosine_similarity(vec_a, vec_b)


# Find Similar Documents
Find and print the $k$ most similar titles to a given title

In [0]:
def k_similar(seed_id, k):
  """Print the ``k`` titles in the global ``corpus`` most similar to a seed title.

  Args:
    seed_id: index into ``corpus`` of the seed document.
    k: number of titles to print. A value of zero or less prints no titles.

  Output:
    The seed title, a blank line, then up to ``k`` lines of the form
    ``* "title" (similarity)`` in descending order of similarity. The seed is
    compared against every document in ``corpus``, itself included.
  """
  seed_doc = corpus[seed_id]
  print('> "{}"'.format(seed_doc))
  # Vectorise the seed once instead of once per corpus document.
  seed_vec = doc2vec(seed_doc)
  similarities = [cosine_similarity(seed_vec, doc2vec(doc)) for doc in corpus]
  # Indices in ascending order of similarity
  # https://stackoverflow.com/questions/13070461/get-indices-of-the-top-n-values-of-a-list
  ranked = sorted(range(len(similarities)), key=lambda i: similarities[i])
  # ranked[-0:] would select the whole list, so handle non-positive k explicitly.
  top_indices = ranked[-k:] if k > 0 else []
  print()
  for idx in reversed(top_indices):
    print('* "{}" ({})'.format(corpus[idx], similarities[idx]))

# Test our program

In [0]:
# Load the headlines, keep the 1000 most frequent tokens as the vocabulary
# (a list of (token, frequency) pairs), then print the 5 titles most similar
# to the headline at index 10.
corpus = get_corpus()
vocab = get_vocab(corpus).most_common(1000)
k_similar(seed_id=10, k=5)

Dataset columns Index(['title', 'content'], dtype='object')
Dataset size 5354
> "Trump says rival Cruz’s Canadian birthplace could be ’big problem’: Washington Post"

* "Trump says rival Cruz’s Canadian birthplace could be ’big problem’: Washington Post" (1.0000000000000002)
* "Disney buying Netflix could be practical magic" (0.5773502691896258)
* "DoubleLine’s Gundlach says Trump rallies seem to be ’losing steam’" (0.5477225575051661)
* "Departure of communications aide could be first in Trump shake-up" (0.5000000000000001)
* "McDonald’s deletes Trump tweet, says Twitter account compromised" (0.47140452079103173)
