# 6. Document Similarity

## 6.1 Import Libraries

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
import re
import string
import random

## 6.2 Import Data

In [4]:
from google.colab import drive
from os import path

drive.mount('/content/drive/')

Mounted at /content/drive/


In [5]:
project_path = 'drive/My Drive/NLP/Tugas Akhir/News Recommendation'

In [6]:
data = pd.read_csv(f'{project_path}/result_final.csv')

In [7]:
data.shape

(2190, 9)

In [8]:
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,link,text,title,date,keywords,summary,title_summary
0,0,0,http://techcrunch.com/2020/09/07/vodafone-idea...,"Vodafone Idea, one of the largest telecom oper...",Indian telecom giant Vodafone Idea rebrands as...,2020-09-07 00:00:00,"['rebrands', 'idea', 'vi', 'giant', 'brand', '...","Vodafone Idea, one of the largest telecom oper...",Indian telecom giant Vodafone Idea rebrands as...
1,1,1,http://techcrunch.com/2020/09/16/facebook-addr...,"At the beginning of the previous decade, Faceb...",Facebook addresses political controversy in In...,2020-09-16 00:00:00,"['opportunities', 'whatsapp', 'controversy', '...",Politicians in the country today heavily rely ...,Facebook addresses political controversy in In...
2,2,2,http://techcrunch.com/2020/09/14/youtube-launc...,"As TikTok’s fate in the U.S. remains murky, Yo...","YouTube launches its TikTok rival, YouTube Sho...",2020-09-14 00:00:00,"['rival', 'video', 'feature', 'access', 'youtu...","As TikTok’s fate in the U.S. remains murky, Yo...","YouTube launches its TikTok rival, YouTube Sho..."
3,3,3,http://techcrunch.com/2020/09/09/groww-an-inve...,Even as more than 150 million people are using...,"Groww, an investment app for millennials in In...",2020-09-09 00:00:00,"['world', 'yc', 'continuity', 'stocks', 'groww...","YC Continuity, the growth-stage investment fun...","Groww, an investment app for millennials in In..."
4,4,4,http://techcrunch.com/2020/09/15/lanzatech-is-...,As part of the continuing global rollout of La...,LanzaTech is developing a small-scale waste bi...,2020-09-15 00:00:00,"['production', 'distributed', 'developing', 's...",As part of the continuing global rollout of La...,LanzaTech is developing a small-scale waste bi...


## 6.3 Util Functions

In [9]:
def remove_non_ascii(sentence):
  return "".join(i for i in sentence if  ord(i)<128)

In [10]:
def make_lower_case(text):
  return text.lower()

In [11]:
def remove_stop_words(text):
  text = text.split()
  stops = set(stopwords.words("english"))
  text = [w for w in text if not w in stops]
  texts = [w for w in text if w.isalpha()]
  texts = " ".join(texts)
  return texts

In [12]:
def remove_punctuation(text):
  tokenizer = RegexpTokenizer(r'\w+')
  text = tokenizer.tokenize(text)
  text = " ".join(text)
  return text

In [13]:
def remove_html(text):
  html_pattern = re.compile('<.*?>')
  return html_pattern.sub(r'', text)

In [14]:
def extract_best_indices(m, top_k):
  """
  Use sum of the cosine distance over all tokens.
  m (np.array): cos matrix of shape (nb_in_tokens, nb_dict_tokens)
  top_k (int): number of indices to return (from high to lowest in order)
  """
  # return the sum on all tokens of cosinus for each sentence
  if len(m.shape) > 1:
      cos_sim = np.mean(m, axis=0) 
  else: 
      cos_sim = m
  index = np.argsort(cos_sim)[::-1] # from highest idx to smallest score 
  mask = np.logical_or(cos_sim[index] != 0, np.ones(len(cos_sim))) #eliminate 0 cosine distance
  best_index = index[mask][:top_k+1]  
  return best_index[1:]

In [60]:
def print_recomendation(news_m, user_read_idx, cosine_m, top_k=10):
  #get similarity values with other articles
  top_k_indices = extract_best_indices(cosine_m[user_read_idx], top_k)
  
  print(f"Article Read: {news_m[user_read_idx][:50]}...")
  print(" ---------------------------------------------------------- ")
  for i in range(len(top_k_indices)):
      print(f"Recomendation {i+1}: (IDX: {top_k_indices[i]}), score: {cosine_m[top_k_indices[i]][user_read_idx]} | {news_m[top_k_indices[i]][:50]}...")
      print()

## 6.4 Data cleaning

In [16]:
ds = data[['date','title','text','link']]

In [17]:
ds = ds.drop_duplicates(subset=None, keep='first', inplace=False)

In [18]:
ds = ds.dropna()

In [19]:
ds.insert(0,'id',range(0,ds.shape[0]))

In [20]:
ds.shape

(1496, 5)

In [21]:
ds.head()

Unnamed: 0,id,date,title,text,link
0,0,2020-09-07 00:00:00,Indian telecom giant Vodafone Idea rebrands as...,"Vodafone Idea, one of the largest telecom oper...",http://techcrunch.com/2020/09/07/vodafone-idea...
1,1,2020-09-16 00:00:00,Facebook addresses political controversy in In...,"At the beginning of the previous decade, Faceb...",http://techcrunch.com/2020/09/16/facebook-addr...
2,2,2020-09-14 00:00:00,"YouTube launches its TikTok rival, YouTube Sho...","As TikTok’s fate in the U.S. remains murky, Yo...",http://techcrunch.com/2020/09/14/youtube-launc...
3,3,2020-09-09 00:00:00,"Groww, an investment app for millennials in In...",Even as more than 150 million people are using...,http://techcrunch.com/2020/09/09/groww-an-inve...
4,4,2020-09-15 00:00:00,LanzaTech is developing a small-scale waste bi...,As part of the continuing global rollout of La...,http://techcrunch.com/2020/09/15/lanzatech-is-...


In [22]:
ds['cleaned_desc'] = ds['text'].apply(remove_non_ascii)
ds['cleaned_desc'] = ds['text'].apply(make_lower_case)
ds['cleaned_desc'] = ds.cleaned_desc.apply(remove_stop_words)
ds['cleaned_desc'] = ds.cleaned_desc.apply(remove_punctuation)
ds['cleaned_desc'] = ds.cleaned_desc.apply(remove_html)

## 6.5 Statistical Doc Similarity

### 6.5.1 Data Encoding (TF-IDF)

In [38]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [39]:
## analyzer -- to select individual words# default
## max_df[0.0,1.0] - used to ignore words with frequency more than 0.8 these words can be useless words as these words may appear only once and may not have a significant meaning
tf = TfidfVectorizer(analyzer='word', stop_words='english', min_df=0.25, max_df=0.8, ngram_range=(1,3))

In [40]:
tfidf_matrix = tf.fit_transform(ds['cleaned_desc'])

In [41]:
pd.DataFrame(tfidf_matrix.toarray(), columns=tf.get_feature_names())

Unnamed: 0,according,company,coronavirus,help,including,like,make,million,new,people,said,technology,time,told,year
0,0.000000,0.605041,0.000000,0.000000,0.161192,0.000000,0.000000,0.489757,0.225278,0.147256,0.304142,0.000000,0.450714,0.000000,0.000000
1,0.081639,0.754804,0.000000,0.000000,0.321746,0.072524,0.241817,0.325859,0.056208,0.146965,0.354130,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.232638,0.000000,0.000000,0.123957,0.335291,0.248436,0.000000,0.866196,0.000000,0.077962,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.123118,0.000000,0.000000,0.131202,0.000000,0.131478,0.664395,0.366729,0.359577,0.495111,0.000000,0.000000,0.000000,0.000000
4,0.509929,0.157154,0.000000,0.172568,0.000000,0.000000,0.503476,0.169613,0.351084,0.305988,0.210662,0.337786,0.000000,0.000000,0.166947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1491,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.884185,0.000000,0.000000,0.000000,0.467138
1492,0.718405,0.000000,0.224685,0.243120,0.000000,0.212733,0.000000,0.000000,0.164873,0.000000,0.445181,0.000000,0.219907,0.000000,0.235201
1493,0.000000,0.000000,0.459125,0.000000,0.482126,0.434701,0.000000,0.000000,0.000000,0.000000,0.606460,0.000000,0.000000,0.000000,0.000000
1494,0.000000,0.000000,0.000000,0.537085,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.655646,0.000000,0.000000,0.530724,0.000000


### 6.5.2 Predict using Cosine Similarity


In [42]:
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [43]:
extract_best_indices(cosine_similarities[3], 3)

array([507, 166, 926])

### 6.5.3 Result

In [44]:
print_recomendation(ds['text'].values, 3, cosine_similarities)

Article Read: Even as more than 150 million people are using dig...
 ---------------------------------------------------------- 
Recomendation 1: (IDX: 507), score: 0.3512586318946669 | Online advertising is a game of scale, but one att...

Recomendation 2: (IDX: 166), score: 0.27643172606925437 | LONDON, Aug 18 (Thomson Reuters Foundation) - Form...

Recomendation 3: (IDX: 926), score: 0.4310676055412757 | HONG KONG (Reuters) - Chinese online insurance tec...

Recomendation 4: (IDX: 7), score: 0.4028973594742803 | Airmeet, a startup that offers a platform to host ...

Recomendation 5: (IDX: 469), score: 0.44674590915949225 | As Trump Visits Kenosha, Hundreds Gather Where Jac...

Recomendation 6: (IDX: 1292), score: 0.3255220997122848 | BRASILIA (Reuters) - Brazil’s President Jair Bolso...

Recomendation 7: (IDX: 1454), score: 0.3255220997122848 | FILE PHOTO: Climate change activists demonstrate a...

Recomendation 8: (IDX: 1464), score: 0.3255220997122848 | FILE PHOTO: Climate change 

## 6.6 Deep Learning Doc Similarity

### 6.6.1 Transformers

In [45]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk import sent_tokenize

In [46]:
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer

Collecting sentence_transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 3.1 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.12.3-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 11.8 MB/s 
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 31.8 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_

In [47]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [48]:
model = SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/594 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [49]:
documents = list(ds['cleaned_desc'])

In [50]:
vectors = []
for i, document in enumerate(documents):

  sentences = sent_tokenize(document)
  embeddings_sentences = model.encode(sentences)
  embeddings = np.mean(np.array(embeddings_sentences), axis=0)

  vectors.append(embeddings)

  if i % 100 == 0:
    print("making vector at index:", i)

making vector at index: 0
making vector at index: 100
making vector at index: 200
making vector at index: 300
making vector at index: 400
making vector at index: 500
making vector at index: 600
making vector at index: 700
making vector at index: 800
making vector at index: 900
making vector at index: 1000
making vector at index: 1100
making vector at index: 1200
making vector at index: 1300
making vector at index: 1400


In [51]:
scores = cosine_similarity(vectors, vectors)

In [52]:
print_recomendation(ds['text'].values, 3, scores)

Article Read: Even as more than 150 million people are using dig...
 ---------------------------------------------------------- 
Recomendation 1: (IDX: 6), score: 0.2762608230113983 | CashKaro, one of the leading cashback and coupon s...

Recomendation 2: (IDX: 9), score: 0.25234079360961914 | Byju’s has raised $500 million in a new financing ...

Recomendation 3: (IDX: 176), score: 0.2384922057390213 | 3one4 Capital, a venture capital firm in India, to...

Recomendation 4: (IDX: 741), score: 0.29171955585479736 | As a business model, SaaS has expanded to epic siz...

Recomendation 5: (IDX: 8), score: 0.3042358160018921 | Since India enforced a lockdown across the country...

Recomendation 6: (IDX: 1086), score: 0.28184065222740173 | Point72 Ventures, the early-stage investment firm ...

Recomendation 7: (IDX: 1124), score: 0.29140838980674744 | Dawn Capital, the London-based VC that focuses on ...

Recomendation 8: (IDX: 890), score: 0.26587602496147156 | Apple is well known for picki

### 6.6.2 BERT

In [None]:
!pip install transformers



In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel, pipeline
import torch
import numpy as np
from tqdm import tqdm

MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

class BertModel:
    def __init__(self, model_name=MODEL_NAME, batch_size=4):
      """ init model attributes """
      self.model_name = model_name
      self.device = "cpu"
      self.batch_size = batch_size
      self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
      self.model = AutoModel.from_pretrained(self.model_name)
      self.pipeline = pipeline('feature-extraction', model=self.model, 
                               tokenizer=self.tokenizer, device=-1 if self.device == 'cpu' else 0)

    def embed(self, data):
      """ Create the embedded matrice from original sentences """
      nb_batchs = 1 if (len(data) < self.batch_size) else len(data) // self.batch_size
      batchs = np.array_split(data, nb_batchs)
      mean_pooled = []
      for batch in tqdm(batchs, total=len(batchs), desc='Training...'):
          mean_pooled.append(self.transform(batch))
      mean_pooled_tensor = torch.tensor(
          len(data), dtype=float).to(self.device)
      mean_pooled = torch.cat(mean_pooled, out=mean_pooled_tensor)
      self.embed_mat = mean_pooled

    @staticmethod
    def mean_pooling(model_output, attention_mask):
      token_embeddings = model_output[0]
      input_mask_expanded = attention_mask.unsqueeze(
          -1).expand(token_embeddings.size()).float()
      return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def transform(self, data):
      data = list(data)
      token_dict = self.tokenizer(
          data,
          padding=True,
          truncation=True,
          max_length=512,
          return_tensors="pt")
      # send all values to device by calling v.to(device)
      token_dict = {k: v.to(self.device) for k, v in token_dict.items()}
      with torch.no_grad():
          token_embed = self.model(**token_dict)
      attention_mask = token_dict['attention_mask']
      # average pooling of masked embeddings
      mean_pooled = self.mean_pooling(
          token_embed, attention_mask)
      mean_pooled = mean_pooled.to(self.device)
      return mean_pooled
    
    def recommend(self, news_m, user_read_idx, top_k=10):
      cosine_similarities = cosine_similarity(self.embed_mat, self.embed_mat)
      print_recomendation(news_m, user_read_idx, cosine_similarities, top_k)

In [None]:
bert_model = BertModel()
bert_model.embed(ds.cleaned_desc.values)

Training...: 100%|██████████| 374/374 [05:26<00:00,  1.15it/s]


In [None]:
bert_model.recommend(ds['text'].values, 3)

Article Read: Even as more than 150 million people are using dig...
 ---------------------------------------------------------- 
Recomendation 1: (IDX: 566), score: 0.7232925431037607 | Investor interest in no-code, low-code apps and se...

Recomendation 2: (IDX: 176), score: 0.7168260677463922 | 3one4 Capital, a venture capital firm in India, to...

Recomendation 3: (IDX: 179), score: 0.7060372750709747 | Your startup is special and different, and you nee...

Recomendation 4: (IDX: 9), score: 0.6904699084050727 | Byju’s has raised $500 million in a new financing ...

Recomendation 5: (IDX: 5), score: 0.683123557261046 | More than a third of small and medium-sized busine...

Recomendation 6: (IDX: 8), score: 0.6814406174305236 | Since India enforced a lockdown across the country...

Recomendation 7: (IDX: 498), score: 0.6803235459609738 | DNX Ventures, an investment firm that focuses on e...

Recomendation 8: (IDX: 733), score: 0.6703664729903799 | Nerdwallet, which provides resources 

### 6.6.3 Word2Vec

In [54]:
!pip install gensim==4.0.0

Collecting gensim==4.0.0
  Downloading gensim-4.0.0-cp37-cp37m-manylinux1_x86_64.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 44.8 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.0


In [55]:
from gensim.models.word2vec import Word2Vec

word2vec_model = Word2Vec(min_count=0, workers = 8) 
word2vec_model.build_vocab(ds['cleaned_desc'].values)



In [64]:
def predict_w2v(query_sentence, dataset, model, topk=10):
    query_sentence = query_sentence.split()
    in_vocab_list, best_index = [], [0]*topk
    for w in query_sentence:
        # remove unseen words from query sentence
        # Check on individual words ``word`` that it exists in ``model``.
        if w in model.wv.key_to_index.keys():
            in_vocab_list.append(w)
    # Retrieve the similarity between two words as a distance
    if len(in_vocab_list) > 0:
        sim_mat = np.zeros(len(dataset))
        for i, data_sentence in enumerate(dataset):
            if data_sentence:
                sim_sentence = model.wv.n_similarity(
                        in_vocab_list, data_sentence)
            else:
                sim_sentence = 0
            sim_mat[i] = np.array(sim_sentence)
        # Take the five highest norm
    return sim_mat

# Predict
cos_sim = predict_w2v(ds['text'][3], ds['cleaned_desc'].values, word2vec_model)

In [65]:
def print_recomendation_w2v(news_m, user_read_idx, cosine_m, top_k=10):
  #get similarity values with other articles
  top_k_indices = np.argsort(cosine_m)[::-1][:top_k]
  
  print(f"Article Read: {news_m[user_read_idx][:50]}...")
  print(" ---------------------------------------------------------- ")
  for i in range(len(top_k_indices)):
      print(f"Recomendation {i+1}: (IDX: {top_k_indices[i]}), score: {cosine_m[top_k_indices[i]]} | {news_m[top_k_indices[i]][:50]}...")
      print()

print_recomendation_w2v(ds['text'].values, 3, cos_sim) 

Article Read: Even as more than 150 million people are using dig...
 ---------------------------------------------------------- 
Recomendation 1: (IDX: 264), score: 0.6808363199234009 | Aug 26 - The following are the details of Indian S...

Recomendation 2: (IDX: 113), score: 0.6790512204170227 | Aug 19 - The following are the details of Indian S...

Recomendation 3: (IDX: 62), score: 0.5325736403465271 | (Reuters) - A look at the key facts and records of...

Recomendation 4: (IDX: 1343), score: 0.5170466899871826 | FILE PHOTO: A view is seen from the Amazon Tall To...

Recomendation 5: (IDX: 214), score: 0.46485865116119385 | Sep 1 (OPTA) - Scoreboard at close of play of 3rd ...

Recomendation 6: (IDX: 67), score: 0.4537274241447449 | Sep 13 (OPTA) - Scores from the LPGA Tour ANA Insp...

Recomendation 7: (IDX: 274), score: 0.45062702894210815 | Aug 30 (OPTA) - Scoreboard at close of play of 2nd...

Recomendation 8: (IDX: 832), score: 0.4493864178657532 | KIGALI (Reuters) - Rwandan Pr