# VerboFad
## Sushant Menon (ssmenon@iu.edu)
## Ujjwal Dubey (ujjdubey@iu.edu)
## Rahul Gattu (ragattu@iu.edu)

In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

## 1) Text Similarity

In [None]:
from nltk.corpus import brown 
adventure = nltk.corpus.brown.sents(nltk.corpus.brown.fileids(categories='adventure')[0])
science_fiction = nltk.corpus.brown.sents(nltk.corpus.brown.fileids(categories='science_fiction')[0])

In [10]:
sent = ["The bottle is empty","There is nothing in the bottle"]

In [None]:
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
import re

In [None]:
def text_normalizer(sent):
    clean_data = []
    sent = re.sub('[^a-zA-Z]',' ',sent)
    sent = sent.lower()
    sent = sent.split()
    sent = [lem.lemmatize(word) for word in sent if word not in stopwords.words('english')]
    sent = ' '.join(sent)
    clean_data.append(sent)
    return clean_data

In [None]:
text_normalizer(sent1)

['bottle empty']

In [None]:
text_normalizer(sent2)

['nothing bottle']

### Jaccard Similarity

In [25]:
def jaccard_similarity(x,y):
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    return intersection_cardinality/float(union_cardinality)

In [26]:
sentences = [sent.lower().split(" ") for sent in sent]
jaccard_similarity(sentences[0], sentences[1])

0.42857142857142855

### Euclidian Distance

In [2]:
from math import sqrt, pow, exp

In [18]:
def euclidean_distance(x,y):
    distance = sqrt(sum(pow(a-b,2) for a, b in zip(x, y)))
    return 1/exp(distance)

Word2Vec

In [8]:
!python -m spacy download en_core_web_md

2022-10-25 02:10:23.626890: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1-py3-none-any.whl (42.8 MB)
[K     |████████████████████████████████| 42.8 MB 2.1 MB/s 
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [5]:
import gensim
from gensim.models import Word2Vec

In [9]:
nlp = spacy.load('en_core_web_md')

In [29]:
docs = [nlp(sents) for sents in sent]

In [20]:
print(docs[0].vector.shape)
print(docs[1].vector.shape)

(300,)
(300,)


In [21]:
euclidean_distance(docs[0].vector,docs[1].vector)

2.1927033807216703e-13

Doc2Vec

In [30]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [33]:
documents  = [TaggedDocument(doc, [i]) for i, doc in enumerate(sentences)]

In [35]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0)
model_dbow.build_vocab([x for x in documents])

In [38]:
for epoch in range(30):
    model_dbow.train([x for x in (documents)],total_examples=len(documents),epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha



In [43]:
vectors = ([model_dbow.infer_vector(doc.words, steps=20) for doc in documents])

In [45]:
print(vectors[0].shape)
print(vectors[1].shape)

(300,)
(300,)


In [46]:
euclidean_distance(vectors[0],vectors[1])

0.9781124353709123

TF-IDF

[['the', 'bottle', 'is', 'empty'],
 ['there', 'is', 'nothing', 'in', 'the', 'bottle']]

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(sent)
arr = X.toarray()

In [54]:
print(arr[0].shape)
print(arr[1].shape)

(7,)
(7,)


In [55]:
euclidean_distance(arr[0],arr[1])

0.35041346262799183

### Cosine Similarity

In [56]:
def squared_sum(x):
  return round(sqrt(sum([a*a for a in x])),3)

In [57]:
def cos_similarity(x,y):
  numerator = sum(a*b for a,b in zip(x,y))
  denominator = squared_sum(x)*squared_sum(y)
  return round(numerator/float(denominator),3)

In [59]:
cos_similarity(docs[0].vector,docs[1].vector)

0.75

## 2) Summarization

In [68]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 3.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 41.3 MB/s 
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.13.1 transformers-4.23.1


In [69]:
## for data
import datasets
import pandas as pd 
import numpy  
## for plotting
import matplotlib.pyplot as plt 
import seaborn as sns 
## for preprocessing
import re
import nltk  
import contractions  
## for textrank
import gensim  
## for evaluation
import rouge  
import difflib
## for seq2seq
from tensorflow.keras import callbacks, models, layers, preprocessing as kprocessing 
## for bart
import transformers

In [70]:
## load the full dataset of 300k articles
dataset = datasets.load_dataset("cnn_dailymail", '3.0.0')
lst_dics = [dic for dic in dataset["train"]]
## keep the first N articles if you want to keep it lite 
dtf = pd.DataFrame(lst_dics).rename(columns={"article":"text", 
      "highlights":"y"})[["text","y"]].head(20000)
dtf.head()

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/3.0.0 (download: 558.32 MiB, generated: 1.28 GiB, post-processed: Unknown size, total: 1.82 GiB) to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,text,y
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non..."
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical..."


In [71]:
i = 1
print("--- Full text ---")
print(dtf["text"][i])
print("--- Summary ---")
print(dtf["y"][i])

--- Full text ---
Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most severe mental illnesses are incarcerated until they're ready to appear in court. Most often, they face drug charges or charges of assaulting an officer --charges that Judge Steven Leifman says are usually "avoidable felonies." He says the arrests often result from confrontations with police. Mentally ill people often won't do what they're told when police arrive on the scene -- confrontation seems to exacerbate their illness and they become more paranoid, delusional, and less li

In [75]:
def textrank(corpus, ratio=0.2):    
    if type(corpus) is str:        
       corpus = [corpus]    
    lst_summaries = [gensim.summarization.summarize(txt,  
                     ratio=ratio) for txt in corpus]    
    return lst_summaries


In [77]:
dtf_train = dtf.iloc[i+1:]
dtf_test = dtf.iloc[:i+1]

In [78]:
predicted = textrank(corpus=dtf_test["text"], ratio=0.2)
predicted[i]

'An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial.\nMIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most severe mental illnesses are incarcerated until they\'re ready to appear in court.\nSo, they end up on the ninth floor severely mentally disturbed, but not getting any real help because they\'re in jail.\nLeifman says about one-third of all people in Miami-Dade county jails are mentally ill.\nLeifman tells me that these prisoner-patients will often circulate through the system, occasionally stabilizing in a mental hospital, only to return to jail to face their charges.\nLeifman says 200 years ago people were considered "lunatics" and they were locked up in jails even if they had no charges against them.\nOver the years, he says, there was some public outcry, and the mentally ill were moved out of jails and into hospitals.\nBut Leifma

## 3) Paraphrasing

In [106]:
from transformers import *

In [107]:
model = PegasusForConditionalGeneration.from_pretrained("tuner007/pegasus_paraphrase")
tokenizer = PegasusTokenizerFast.from_pretrained("tuner007/pegasus_paraphrase")

Downloading:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--tuner007--pegasus_paraphrase/snapshots/0159e2949ca73657a2f1329898f51b7bb53b9ab2/config.json
Model config PegasusConfig {
  "activation_dropout": 0.1,
  "activation_function": "relu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "PegasusForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 16,
  "decoder_start_token_id": 0,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 16,
  "eos_token_id": 1,
  "extra_pos_embeddings": 1,
  "force_bos_token_to_be_generated": false,
  "forced_eos_token_id": 1,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "in

Downloading:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--tuner007--pegasus_paraphrase/snapshots/0159e2949ca73657a2f1329898f51b7bb53b9ab2/pytorch_model.bin
All model checkpoint weights were used when initializing PegasusForConditionalGeneration.

All the weights of PegasusForConditionalGeneration were initialized from the model checkpoint at tuner007/pegasus_paraphrase.
If your task is similar to the task the model of the checkpoint was trained on, you can already use PegasusForConditionalGeneration for predictions without further training.


Downloading:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

loading file spiece.model from cache at /root/.cache/huggingface/hub/models--tuner007--pegasus_paraphrase/snapshots/0159e2949ca73657a2f1329898f51b7bb53b9ab2/spiece.model
loading file tokenizer.json from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--tuner007--pegasus_paraphrase/snapshots/0159e2949ca73657a2f1329898f51b7bb53b9ab2/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--tuner007--pegasus_paraphrase/snapshots/0159e2949ca73657a2f1329898f51b7bb53b9ab2/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--tuner007--pegasus_paraphrase/snapshots/0159e2949ca73657a2f1329898f51b7bb53b9ab2/config.json
Model config PegasusConfig {
  "_name_or_path": "tuner007/pegasus_paraphrase",
  "activation_dropout": 0.1,
  "activation_function": "relu",
  "add_bias_logits": false,
  

In [108]:
def get_paraphrased_sentences(model, tokenizer, sentence, num_return_sequences=5, num_beams=5):
  # tokenize the text to be form of a list of token IDs
  inputs = tokenizer([sentence], truncation=True, padding="longest", return_tensors="pt")
  # generate the paraphrased sentences
  outputs = model.generate(
    **inputs,
    num_beams=num_beams,
    num_return_sequences=num_return_sequences,
  )
  # decode the generated sentences using the tokenizer to get them back to text
  return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [109]:
sentence = "Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences."
get_paraphrased_sentences(model, tokenizer, sentence, num_beams=10, num_return_sequences=10)

['Learning involves the acquisition of new understanding, knowledge, behaviors, skills, values, attitudes, and preferences.',
 'Learning is the acquisition of new understanding, knowledge, behaviors, skills, values, attitudes, and preferences.',
 'The process of learning is the acquisition of new understanding, knowledge, behaviors, skills, values, attitudes, and preferences.',
 'Gaining new understanding, knowledge, behaviors, skills, values, attitudes, and preferences is the process of learning.',
 'New understanding, knowledge, behaviors, skills, values, attitudes, and preferences are acquired through learning.',
 'Learning is the acquisition of new understanding, knowledge, behaviors, skills, values, attitudes and preferences.',
 'The process of learning is the acquisition of new understanding, knowledge, behaviors, skills, values, attitudes and preferences.',
 'New understanding, knowledge, behaviors, skills, values, attitudes, and preferences can be acquired through learning.',
 

# Personal Contribution Statement

I have implemented the Text Similarity Methods. I have used varous methods such as Jaccard Similarity, Euclidian Distance and Cosine Similarity and used various embeddings as well. After seeing the performance of every method, we can see that Cosine Similarity works the best. Hence, we will be using that for our final application. 

My next steps include integration and front end implementations.