# Recommend Similar News Articles
This notebook demonstrates how to use TF-IDF vectors (built from bag-of-words token counts) and cosine similarity for news article recommendation.

In [0]:
import re
import math
import pandas as pd
from collections import Counter

# Fetching the Corpus
`get_corpus()` reads the news CSV file and the stopword list, and returns the article titles, the article contents, the stopwords, and the number of articles.

In [0]:
def get_corpus():
  """Download the news articles and the stopword list.

  Prints the column names and row count of the article table as a sanity
  check.

  Returns:
    tuple: (title, corpus, stopword, size) where `title` is the list of
    values in the `title` column, `corpus` is the list of values in the
    `content` column, `stopword` is the list of first-column values of the
    stopword file, and `size` is `len(corpus)`.
  """
  articles = pd.read_csv('https://raw.githubusercontent.com/bshmueli/108-nlp/master/reuters.csv') # https://bit.ly/nlp-reuters
  print("Dataset columns", articles.columns)
  print("Dataset size", len(articles))
  title = articles.title.to_list()
  # Documents are the article bodies (content), not the headlines
  corpus = articles.content.to_list()
  # The stopword file has no header row; keep the first field of each row
  stopword_rows = pd.read_csv('https://raw.githubusercontent.com/bshmueli/108-nlp/master/stopwords.txt', header=None).values.tolist()
  stopword = [row[0] for row in stopword_rows]
  return title, corpus, stopword, len(corpus)

In [0]:
def tokenize(document, stopwords=None):
  """Split a document into lowercase word tokens with stopwords removed.

  Args:
    document: the text to tokenize.
    stopwords: optional collection of tokens to drop. When omitted, the
      module-level global `stopword` is used.

  Returns:
    list of str: the remaining tokens, in document order.
  """
  if stopwords is None:
    stopwords = stopword
  # Match runs of word characters directly rather than splitting on \W+ and
  # slicing: splitting yields empty strings at punctuation boundaries, and
  # dropping the last element discards a real word whenever the text does not
  # end in punctuation.
  words = re.findall(r'\w+', document.lower())
  # Remove stopwords
  return [word for word in words if word not in stopwords]

# Computing word frequencies
`get_vocab(corpus)` computes the word frequencies in a given corpus. It returns a `Counter` mapping each token to its frequency; calling `most_common(n)` on it yields a list of `(token, frequency)` 2-tuples.

In [0]:
def get_vocab(corpus):
  """Count how often each token occurs across all documents in `corpus`.

  Each document is passed through `tokenize`, and every resulting token
  occurrence is counted.

  Returns:
    collections.Counter: token -> total number of occurrences.
  """
  return Counter(word for text in corpus for word in tokenize(text))

# Compute TF-IDF Vector
`doc2vec(doc)` returns the TF-IDF vector for document `doc`, with one entry per token in the global `vocab`.  
`compute_idf()` returns the IDF vector: for each token in `vocab`, the logarithm of the corpus size divided by the number of documents that contain the token.

In [0]:
def compute_idf():
  """Compute the inverse document frequency of every vocabulary token.

  Reads the module-level globals `vocab` (iterable of (token, frequency)
  pairs), `corpus` (iterable of documents) and `N` (the value the document
  frequency is divided into).

  Returns:
    list of float: `math.log(N / df)` for each vocab entry, in vocab order,
    where `df` is the number of documents whose tokens include that token.
    A token that occurs in no document gets 0.0 instead of dividing by zero.
  """
  # Tokenize every document exactly once and count, per token, how many
  # documents contain it. Re-tokenizing the corpus for each vocab token
  # repeats the same work len(vocab) times.
  doc_freq = Counter()
  for doc in corpus:
    doc_freq.update(set(tokenize(doc)))
  idf_vec = []
  for token, freq in vocab:
    appear = doc_freq[token]
    idf_vec.append(math.log(N / appear) if appear else 0.0)
  return idf_vec

In [0]:
def doc2vec(doc):
  """Convert a document into a TF-IDF vector over the global vocabulary.

  Reads the module-level globals `vocab` (iterable of (token, frequency)
  pairs) and `idf_vec` (one IDF weight per vocab entry).

  Args:
    doc: the document text.

  Returns:
    list of float: for each vocab token, its share of the document's
    in-vocabulary token occurrences multiplied by the matching IDF weight.
    A document containing no vocabulary tokens yields an all-zero vector.
  """
  # Count all tokens in one pass instead of calling list.count per vocab token
  token_counts = Counter(tokenize(doc))
  counts = [token_counts[token] for token, freq in vocab]
  # Compute the normalizer once; guard against documents with no vocab tokens
  total = sum(counts)
  if total == 0:
    tf_vec = [0.0] * len(counts)
  else:
    tf_vec = [count / total for count in counts]
  # Compute tf-idf vectors
  return [tf * idf for tf, idf in zip(tf_vec, idf_vec)]

Cosine similarity between two numerical vectors

In [0]:
def cosine_similarity(vec_a, vec_b):
  """Return the cosine of the angle between two equal-length numeric vectors.

  Args:
    vec_a: sequence of numbers.
    vec_b: sequence of numbers, same length as `vec_a`.

  Returns:
    float: dot(a, b) / (|a| * |b|), or 0.0 when either vector has zero
    length (the cosine is undefined there).

  Raises:
    AssertionError: if the vectors differ in length.
  """
  assert len(vec_a) == len(vec_b)
  a_2 = sum(a * a for a in vec_a)
  b_2 = sum(b * b for b in vec_b)
  # Test the squared norms, not the element sums: a vector such as [1, -1]
  # sums to zero yet has a non-zero norm and a well-defined cosine.
  if a_2 == 0 or b_2 == 0:
    return 0.0
  a_b = sum(a * b for a, b in zip(vec_a, vec_b))
  return a_b / (math.sqrt(a_2) * math.sqrt(b_2))

In [0]:
def doc_similarity(doc_a, doc_b):
  """Return the cosine similarity of the `doc2vec` vectors of two documents."""
  vec_a = doc2vec(doc_a)
  vec_b = doc2vec(doc_b)
  return cosine_similarity(vec_a, vec_b)

# Find Similar Documents
Find and print the titles of the $k$ articles whose content is most similar to the content of a given article

In [0]:
def k_similar(seed_id, k):
  """Print the titles of the k documents most similar to a seed document.

  Reads the module-level globals `corpus` and `title`. The seed document is
  compared against every document in `corpus`, itself included.

  Args:
    seed_id: index into `corpus` / `title` of the seed document.
    k: how many results to print; values of 0 or less print no results.

  Prints the seed title, a blank line, then one line per result with its
  title and similarity score, most similar first.
  """
  seed_doc = corpus[seed_id]
  print('> "{}"'.format(title[seed_id]))
  # Vectorize the seed once instead of once per compared document
  seed_vec = doc2vec(seed_doc)
  similarities = [cosine_similarity(seed_vec, doc2vec(doc)) for doc in corpus]
  # Sort ascending and then reverse so ties come out in the same order as
  # taking the tail of the ascending list; slicing from the front also makes
  # k == 0 yield nothing (a [-0:] slice would return the whole list).
  ranked = sorted(range(len(similarities)), key=lambda i: similarities[i])
  ranked.reverse()
  top_indices = ranked[:max(k, 0)]
  print()
  for index in top_indices:
    print('* "{}" ({})'.format(title[index], similarities[index]))

# Test our program

- Global variables
  - title, corpus, stopword, N, vocab, idf_vec

In [0]:
# Index into `corpus` of the article to find recommendations for
SELECTED_CORPUS = 75
# Load the data; the functions defined above read these names as globals
title, corpus, stopword, N = get_corpus()
# Keep only the 1000 most frequent tokens, as (token, count) pairs
vocab = get_vocab(corpus).most_common(1000)
# Must be assigned before k_similar runs: doc2vec reads idf_vec as a global
idf_vec = compute_idf()
# Print the 5 articles most similar to the selected one
k_similar(SELECTED_CORPUS, 5)

Dataset columns Index(['title', 'content'], dtype='object')
Dataset size 5354
> "Twitter CEO calls company ’people’s news network’"

* "Twitter CEO calls company ’people’s news network’" (1.0000000000000002)
* "Salesforce still mulls bid for Twitter as shareholders resist: sources" (0.7203539245853299)
* "Twitter’s video-sharing mobile app Vine to close" (0.6497663428296279)
* "No partner in sight, Twitter faces tough solo choices" (0.6326379073769336)
* "Twitter adds WNBA games, news shows, concerts in try for live viewers" (0.608258656476074)
