<a href="https://colab.research.google.com/github/tsanzxc456/NLP/blob/master/lab1_0760054.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recommend Similar News Articles
This notebook demonstrates how to use TF-IDF-weighted bag-of-words vectors and cosine similarity for news article recommendation.

In [1]:
import re
import math
import pandas as pd
from collections import Counter
import string
import numpy as np

# Fetching the Corpus
`get_corpus()` reads the CSV file, **removes punctuation and stopwords** immediately after the data is acquired, and **returns the list of news contents with all cased characters converted to lowercase**.


`get_corpus_title()` reads the CSV file and returns the list of article titles, which is used when displaying the results.

In [2]:
# Stopword vocabulary, built once as a frozenset so every membership test is
# O(1) instead of a linear scan over a list that was rebuilt on each call.
# NOTE: entries containing an apostrophe (e.g. "don't") can only match when the
# caller has not already stripped punctuation from the document.
_STOPWORDS = frozenset([
  "i","me","my","myself","we","our","ours","ourselves","you","you're","you've","you'll","you'd",
  "your","yours","yourself","yourselves","he","him","his","himself","she","she's","her","hers",
  "herself","it","it's","its","itself","they","them","their","theirs","themselves","what","which",
  "who","whom","this","that","that'll","these","those","am","is","are","was","were","be","been",
  "being","have","has","had","having","do","does","did","doing","a","an","the","and","but","if",
  "or","because","as","until","while","of","at","by","for","with","about","against","between",
  "into","through","during","before","after","above","below","to","from","up","down","in","out",
  "on","off","over","under","again","further","then","once","here","there","when","where","why",
  "how","all","any","both","each","few","more","most","other","some","such","no","nor","not",
  "only","own","same","so","than","too","very","s","t","can","will","just","don","don't","should",
  "should've","now","d","ll","m","o","re","ve","y","ain","aren","aren't","couldn","couldn't",
  "didn","didn't","doesn","doesn't","hadn","hadn't","hasn","hasn't","haven","haven't","isn",
  "isn't","ma","mightn","mightn't","mustn","mustn't","needn","needn't","shan","shan't","shouldn",
  "shouldn't","wasn","wasn't","weren","weren't","won","won't","wouldn","wouldn't",
])

def remove_stopwords(document):
  """Return `document` with every stopword token removed.

  The document is split on runs of whitespace, tokens found in the stopword
  vocabulary are dropped, and the survivors are re-joined with single spaces.
  Matching is case-sensitive: the vocabulary is all lowercase, so callers are
  expected to lowercase the text first.

  Args:
    document: the text to filter (str).

  Returns:
    The filtered text as a single space-separated string ('' if nothing is left).
  """
  return ' '.join(word for word in document.split() if word not in _STOPWORDS)

In [3]:
def get_corpus():
  """Download the Reuters CSV and return its cleaned article contents.

  Each entry of the `content` column has ASCII punctuation
  (`string.punctuation`) deleted, is lowercased, and is then passed through
  `remove_stopwords`.

  Returns:
    A new list of cleaned document strings, in the same order as the CSV rows.
  """
  df = pd.read_csv('https://raw.githubusercontent.com/bshmueli/108-nlp/master/reuters.csv') # https://bit.ly/nlp-reuters
  # The deletion table is identical for every document, so build it once.
  strip_punctuation = str.maketrans('', '', string.punctuation)
  # Build a fresh list rather than aliasing and overwriting the raw list in place.
  return [
    remove_stopwords(content.translate(strip_punctuation).lower())
    for content in df.content.to_list()
  ]

In [4]:
def get_corpus_title():
  """Download the Reuters CSV and return its `title` column as a list.

  Returns:
    The article titles, in the same order as the CSV rows.
  """
  reuters_csv = 'https://raw.githubusercontent.com/bshmueli/108-nlp/master/reuters.csv'  # https://bit.ly/nlp-reuters
  return pd.read_csv(reuters_csv).title.to_list()

# Computing word frequencies
`get_vocab_from_document(document)` computes the word frequencies **in a single document**. It returns a `Counter` that maps each token to its frequency.

`get_vocab_from_corpus(corpus)` computes the word frequencies **in the whole corpus**. It returns a `Counter` that maps each token to its frequency; calling `.most_common(n)` on it yields a list of `(token, frequency)` 2-tuples.

In [5]:
def tokenize(document):
  """Split `document` into tokens on single space characters.

  Only the space character is a separator, so consecutive spaces produce
  empty-string tokens and an empty document yields [''].

  Args:
    document: the text to split (str).

  Returns:
    The list of tokens.
  """
  return document.split(' ')

In [6]:
def get_vocab_from_corpus(corpus):
  """Count how often each token occurs across all documents of `corpus`.

  Args:
    corpus: an iterable of document strings.

  Returns:
    A Counter mapping each token (as produced by `tokenize`) to its total
    number of occurrences in the corpus.
  """
  token_counts = Counter()
  for tokens in map(tokenize, corpus):
    token_counts.update(tokens)
  return token_counts

In [7]:
def get_vocab_from_document(document):
  """Count how often each token occurs in a single document.

  Args:
    document: the document text (str).

  Returns:
    A Counter mapping each token (as produced by `tokenize`) to its number of
    occurrences in `document`.
  """
  return Counter(tokenize(document))

# Compute TF-IDF Scores and Return Them as a Vector
`get_df(word_list)` computes the document frequency (df) of each word in the given `word_list` and returns the results as a list.

`tfidf_score(num,key_word,document)` returns the TF-IDF score of the word `key_word` in the document `document`; `num` is the position of `key_word` in `df_list`.


`tfidf_vector(document)` returns the TF-IDF vector of `document`, with one element for each word in `top_1000_vocab`.

`cosine_similarity` computes the cosine similarity between two numerical vectors.

In [8]:
def get_df(word_list):
  """Compute the document frequency of each word in `word_list`.

  The document frequency of a word is the number of documents in the global
  `corpus` that contain it at least once.

  Args:
    word_list: a sequence whose items carry the word at index 0, e.g. the
      (token, count) pairs returned by `Counter.most_common`.

  Returns:
    A list of ints, aligned with `word_list`, giving each word's document
    frequency (0 for a word that appears in no document).
  """
  # Tokenize every document exactly once instead of once per queried word.
  doc_freq = Counter()
  for document in corpus:
    # set(...) keeps only the distinct tokens, so a document adds at most 1 per token.
    doc_freq.update(set(get_vocab_from_document(document)))
  return [doc_freq[entry[0]] for entry in word_list]


In [9]:
def tfidf_score(num, key_word, document):
  """Return the TF-IDF score of `key_word` within `document`.

  tf  = occurrences of `key_word` in the document / total tokens in the document
  idf = log10(number of documents in the global `corpus` / df_list[num])

  Args:
    num: index into the global `df_list` holding the document frequency of
      `key_word`.
    key_word: the token to score.
    document: the document text (str).

  Returns:
    tf * idf as a float. Returns 0.0 when the document has no tokens or the
    word's document frequency is 0, instead of dividing by zero.
  """
  doc_vocab = get_vocab_from_document(document)
  # Total token count, without materializing every token occurrence in a list.
  total_tokens = sum(doc_vocab.values())
  doc_freq = df_list[num]
  if total_tokens == 0 or doc_freq == 0:
    return 0.0
  tf = doc_vocab[key_word] / total_tokens
  idf = math.log10(len(corpus) / doc_freq)
  return tf * idf

In [10]:
def tfidf_vector(document):
  """Return the TF-IDF vector of `document` over the global `top_1000_vocab`.

  Element i is the TF-IDF score of the i-th vocabulary word, computed with the
  same formula as `tfidf_score`:
  (count / total tokens) * log10(len(corpus) / df_list[i]).

  Args:
    document: the document text (str).

  Returns:
    A list of floats with one entry per item of `top_1000_vocab`. An entry is
    0.0 when the document has no tokens or the word's document frequency is 0.
  """
  # Tokenize the document once, not once per vocabulary word.
  doc_vocab = get_vocab_from_document(document)
  total_tokens = sum(doc_vocab.values())
  corpus_size = len(corpus)

  tfidf_vec = []
  for position, entry in enumerate(top_1000_vocab):
    doc_freq = df_list[position]
    if total_tokens == 0 or doc_freq == 0:
      # Avoid dividing by zero; such a word contributes nothing to the vector.
      tfidf_vec.append(0.0)
      continue
    tf = doc_vocab[entry[0]] / total_tokens
    idf = math.log10(corpus_size / doc_freq)
    tfidf_vec.append(tf * idf)
  return tfidf_vec

In [11]:
def cosine_similarity(vec_a, vec_b):
  """Return the cosine similarity between two equal-length numeric vectors.

  Args:
    vec_a: first sequence of numbers.
    vec_b: second sequence of numbers, the same length as `vec_a`.

  Returns:
    dot(a, b) / (|a| * |b|) as a float, or 0.0 when either vector has zero
    magnitude (the similarity is undefined there).

  Raises:
    AssertionError: if the vectors differ in length.
  """
  assert len(vec_a) == len(vec_b)
  a_b = sum(x * y for x, y in zip(vec_a, vec_b))
  a_2 = sum(x * x for x in vec_a)
  b_2 = sum(y * y for y in vec_b)
  # Test the squared norms rather than the plain sums: a vector such as
  # [1, -1] sums to 0 but is not the zero vector.
  if a_2 == 0 or b_2 == 0:
    return 0.0
  return a_b / (math.sqrt(a_2) * math.sqrt(b_2))

In [12]:
def doc_similarity(doc_a, doc_b):
  """Return the cosine similarity between the TF-IDF vectors of two documents.

  Args:
    doc_a: first document text (str).
    doc_b: second document text (str).

  Returns:
    The value of `cosine_similarity` applied to both documents' TF-IDF vectors.
  """
  vector_a = tfidf_vector(doc_a)
  vector_b = tfidf_vector(doc_b)
  return cosine_similarity(vector_a, vector_b)


# Find Similar Documents
Find and print the titles of the $k$ articles most similar to a given article (the given article itself is included in the ranking, with similarity 1.0).

In [13]:
def k_similar(seed_id, k):
  """Print the titles of the `k` documents most similar to document `seed_id`.

  Similarity is the cosine similarity between TF-IDF vectors. The seed
  document is compared against every document in the global `corpus`,
  including itself, and titles are read from the global `corpus_title`.

  Args:
    seed_id: index of the seed document in `corpus` / `corpus_title`.
    k: how many results to print; nothing is listed when `k` is 0 or negative.

  Returns:
    None. The seed title and the ranked results are written to stdout.
  """
  seed_doc = corpus[seed_id]
  seed_title = corpus_title[seed_id]
  print('> "{}"'.format(seed_title))
  # The seed vector is the same for every comparison, so compute it once.
  seed_vector = tfidf_vector(seed_doc)
  similarities = [cosine_similarity(seed_vector, tfidf_vector(doc)) for doc in corpus]
  ranked = sorted(range(len(similarities)), key=lambda i: similarities[i]) # https://stackoverflow.com/questions/13070461/get-indices-of-the-top-n-values-of-a-list
  # ranked[-0:] would be the whole list, so k <= 0 is handled explicitly.
  top_indices = ranked[-k:] if k > 0 else []
  print()
  for doc_index in reversed(top_indices):
    print('* "{}" ({})'.format(corpus_title[doc_index], similarities[doc_index]))

# Test our program

In [14]:
# Shared state: the functions defined above read these names as globals.
corpus = get_corpus()
corpus_title = get_corpus_title()

# The 1000 most frequent tokens, as (token, count) pairs, and the document
# frequency of each one in the same order.
vocab_counts = get_vocab_from_corpus(corpus)
top_1000_vocab = vocab_counts.most_common(1000)
df_list = get_df(top_1000_vocab)

# Show the 5 articles closest to article 54.
k_similar(seed_id=54, k=5)

> "British police name suicide bomber, threat level raised to critical"

* "British police name suicide bomber, threat level raised to critical" (1.0)
* "Trump condemns leaks after UK police briefly halt information sharing" (0.6285046844403149)
* "Two days from UK election, security dominates campaign after London attack" (0.6240940381380864)
* "Middle-aged London attacker was criminal who wasn’t seen as threat" (0.605716007450487)
* "’Enough is enough’ PM May says after London attackers kill seven" (0.6033688132049058)
