In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Imports
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string as str
import math
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Get sentences and words after preprocessing

def get_tokenized_sents(text):
    """Split ``text`` into sentences and tokenize each one.

    Every sentence is lower-cased and word-tokenized; English stopwords and
    single punctuation characters are then dropped.

    Args:
        text: Raw document string.

    Returns:
        A ``(sents, tokenized_sents)`` tuple. ``sents`` holds the sentence
        strings produced by ``sent_tokenize``; ``tokenized_sents`` is
        index-aligned with it and holds one list of kept tokens per sentence.
    """
    # A set gives constant-time membership tests; the previous list was
    # scanned once per token.
    # NOTE: ``str`` is the ``string`` module here (the notebook imports it
    # as ``str``), not the builtin type.
    unwanted_words = set(stopwords.words('english')) | set(str.punctuation)

    sents = sent_tokenize(text)
    tokenized_sents = [
        [w for w in word_tokenize(s.lower()) if w not in unwanted_words]
        for s in sents
    ]

    return sents, tokenized_sents

In [None]:
# Get Term frequency

def get_tf(tokenized_sents):
    """Count the occurrences of every token across all sentences.

    Args:
        tokenized_sents: Iterable of token lists, one per sentence.

    Returns:
        A dict mapping each token to its total number of occurrences.
        Keys appear in order of first occurrence.
    """
    counts = {}
    for token in (w for sent in tokenized_sents for w in sent):
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts

In [None]:
#Correct method
def word_overlap(s1, s2):
    """Return the number of distinct elements shared by ``s1`` and ``s2``.

    The previous version compared ``len(s1)`` with ``len(s2)`` and then ran
    the same expression in both branches; the comparison had no effect on
    the result and only made the function fail for iterables without a
    length, so it has been removed.

    Args:
        s1: Iterable of hashable items (typically a list of tokens).
        s2: Iterable of hashable items.

    Returns:
        The size of the intersection of the two inputs, as an ``int``.
        Duplicates are ignored. If plain strings are passed, the elements
        compared are individual characters, not words.
    """
    return len(set(s1).intersection(s2))

In [None]:
def cosine_sim(s1_vector, s2_vector):
    """Compute the cosine similarity of two equal-length numeric vectors.

    Args:
        s1_vector: Sequence of numbers.
        s2_vector: Sequence of numbers with the same length as ``s1_vector``.

    Returns:
        ``dot(s1, s2) / (|s1| * |s2|)``. When either vector has zero
        magnitude (including the empty vector) the similarity is undefined;
        ``0.0`` is returned instead of dividing by zero.

    Raises:
        AssertionError: If the two vectors differ in length.
    """
    assert len(s1_vector) == len(s2_vector)
    num = sum(a * b for a, b in zip(s1_vector, s2_vector))
    den1 = sum(a ** 2 for a in s1_vector)
    den2 = sum(b ** 2 for b in s2_vector)

    denominator = math.sqrt(den1) * math.sqrt(den2)
    if denominator == 0:
        # At least one vector is all zeros: no direction to compare.
        return 0.0
    return num / denominator

In [None]:
def get_freqsum_summary(text):
    """Return up to three sentences of ``text`` with the highest frequency score.

    A sentence's score is the sum, over its kept tokens, of each token's
    document-wide count from ``get_tf``. Sentences are returned in
    descending score order; ties keep document order.

    NOTE: a later cell in this notebook defines a two-argument function with
    the same name, which replaces this one once that cell has run.

    Args:
        text: Raw document string.

    Returns:
        A list of at most three original sentence strings.
    """
    original_sentences, tokenized_sentences = get_tokenized_sents(text)
    tf = get_tf(tokenized_sentences)

    # One score per sentence, index-aligned with original_sentences.
    sentence_scores = [
        sum(tf.get(w, 0) for w in tokens) for tokens in tokenized_sentences
    ]

    # sorted() is stable, so equal scores stay in document order.
    ranked_ids = sorted(
        range(len(sentence_scores)),
        key=sentence_scores.__getitem__,
        reverse=True,
    )
    return [original_sentences[sid] for sid in ranked_ids[:3]]

In [None]:

def get_sim_matrix(tokenized_sents, threshold=0.3):
    """Build a binary sentence-adjacency matrix from word overlap.

    The previous version iterated over a global ``tokenized_sentences``
    instead of the ``tokenized_sents`` argument, so it either raised
    ``NameError`` on a fresh kernel or silently used stale notebook state.

    Args:
        tokenized_sents: List of token lists, one per sentence.
        threshold: Minimum ``word_overlap`` value for two sentences to be
            linked. ``word_overlap`` returns an integer count, so the default
            of 0.3 links any pair that shares at least one token.

    Returns:
        An ``(n, n)`` float array with 1 where the overlap reaches the
        threshold and 0 elsewhere (the diagonal is included in the check).
    """
    num_sents = len(tokenized_sents)
    sim_mat = np.zeros((num_sents, num_sents))
    for s1_id, s1 in enumerate(tokenized_sents):
        for s2_id, s2 in enumerate(tokenized_sents):
            if word_overlap(s1, s2) >= threshold:
                sim_mat[s1_id, s2_id] = 1
    return sim_mat


In [None]:
def get_degree_centrality_summary(text, threshold = 0.3):
    """Return up to three sentences of ``text`` with the highest degree centrality.

    A sentence's degree is its row sum in the binary matrix returned by
    ``get_sim_matrix``, i.e. the number of sentences it is linked to.
    Sentences are returned in descending degree order; ties keep document
    order.

    Args:
        text: Raw document string.
        threshold: Passed through to ``get_sim_matrix``.

    Returns:
        A list of at most three original sentence strings.
    """
    original_sentences, tokenized_sentences = get_tokenized_sents(text)
    # The term-frequency table computed here previously was never used.

    sim_mat = get_sim_matrix(tokenized_sentences, threshold)
    degree_centrality = sim_mat.sum(axis=1)

    # Rank sentence indices directly; sorted() is stable, so ties keep
    # document order exactly as the former dict-based ranking did.
    ranked_ids = sorted(
        range(len(degree_centrality)),
        key=lambda sid: degree_centrality[sid],
        reverse=True,
    )

    return [original_sentences[sid] for sid in ranked_ids[:3]]

In [None]:
def power_method(text, threshold=0.3, lam=0.15, max_num_iter = 100):
    """Rank sentences by power iteration over the sentence-similarity graph.

    The binary adjacency matrix from ``get_sim_matrix`` is mixed with a
    uniform term (``lam / n``), normalised, and repeatedly multiplied with
    the score vector until the mean absolute change drops to 1e-4 or
    ``max_num_iter`` iterations have run. Progress is printed every
    iteration.

    Changes from the previous version:
      * empty input returns ``[]`` instead of dividing by zero;
      * at most ``max_num_iter`` iterations run (it used to be one more);
      * the unused ``tf`` and ``degree`` computations are removed.

    Args:
        text: Raw document string.
        threshold: Passed through to ``get_sim_matrix``.
        lam: Weight of the uniform term added to every matrix entry.
        max_num_iter: Upper bound on the number of iterations. With a value
            of 0 or less no iteration runs and the uniform start vector is
            ranked.

    Returns:
        A list of at most three original sentence strings, highest score
        first; ties keep document order.
    """
    epsilon = 0.0001

    original_sentences, tokenized_sentences = get_tokenized_sents(text)
    num_sents = len(original_sentences)
    if num_sents == 0:
        # Nothing to rank; also avoids lam / 0 below.
        return []

    sim_mat = lam/num_sents + (1-lam)*get_sim_matrix(tokenized_sentences, threshold)

    # sim_mat.sum(axis=1) has shape (n,), so broadcasting divides COLUMN j by
    # the sum of ROW j. sim_mat is symmetric (word overlap is symmetric and
    # the lam term is constant), so each column of the result sums to 1 and
    # the product below redistributes the total score.
    # NOTE(review): with lam == 0 a sentence with no kept tokens has a zero
    # row sum and would produce NaNs here - confirm whether lam == 0 is used.
    sim_mat_norm = sim_mat/sim_mat.sum(axis=1)

    scores = np.full(num_sents, 1.0/num_sents)

    for num_iter in range(max_num_iter):
        new_scores = np.matmul(sim_mat_norm, scores)

        delta = np.mean(abs(new_scores-scores))
        scores = new_scores

        print("Iteration :{}, Delta: {}".format(num_iter, delta))

        if delta <= epsilon:
            # Converged before exhausting the iteration budget.
            break

    ranked_ids = sorted(range(num_sents), key=lambda sid: scores[sid], reverse=True)

    return [original_sentences[sid] for sid in ranked_ids[:3]]

In [None]:
# Colab-only cell: asks the user to pick local files to upload into the
# runtime. The recorded output shows brown.pickle, cambodia.txt and covid.txt
# being saved; later cells read the two .txt files from /content/.
# NOTE(review): this cell blocks on manual interaction and does not run
# outside Google Colab.
from google.colab import files

uploaded = files.upload()

Saving brown.pickle to brown.pickle
Saving cambodia.txt to cambodia.txt
Saving covid.txt to covid.txt


In [None]:
def clean_text(data):
    """Strip URLs, non-letters and English stopwords from ``data``.

    The previous version read a global ``stop_words`` that is not defined
    anywhere in this notebook, so it raised ``NameError`` on a fresh kernel.
    The stopword set is now built locally. The regex literals are raw
    strings so ``\\S`` is no longer an invalid string escape; the patterns
    themselves are unchanged.

    Args:
        data: Raw text.

    Returns:
        The remaining tokens joined by single spaces. Stopword matching is
        case-sensitive (the NLTK list is lower-case and the text is not
        lower-cased), so capitalised stopwords such as "The" are kept.

    NOTE(review): every non-letter is replaced by a space, which also removes
    sentence-ending punctuation; sentence tokenization of the result will
    likely yield one long sentence. Confirm this is intended before feeding
    the output to the summarizers.
    """
    stop_words = set(stopwords.words('english'))

    data = re.sub(r"https*\S+", " ", data)
    data = re.sub(r'[^a-zA-Z]', ' ', data)
    tokens = word_tokenize(data)
    return ' '.join(item for item in tokens if item not in stop_words)

In [None]:
# Read the uploaded COVID article into memory as a single string.
# NOTE(review): the path is hard-coded to Colab's /content directory and no
# encoding is passed to open(), so the platform default is used.
with open('/content/covid.txt') as f:
    text_covid = f.read()

In [None]:
# Replace the raw article with its cleaned version (URLs, non-letters and
# stopwords removed). This overwrites text_covid, so the raw text is only
# available again after re-reading the file.
text_covid = clean_text(text_covid)
# clean_text joins tokens with single spaces after replacing every non-letter
# with a space, so there should be no newlines left for this strip to remove.
text_covid=text_covid.strip('\n')

In [None]:
# Frequency-based summary of the cleaned article (one-argument version of
# get_freqsum_summary defined earlier in the notebook).
# NOTE(review): the cleaned text has no sentence punctuation left, which is
# likely why the recorded output is a single, very long "sentence".
get_freqsum_summary(text_covid)

['Coronavirus disease COVID contagious disease caused severe acute respiratory syndrome coronavirus SARS CoV The first case identified Wuhan China December It since spread worldwide leading ongoing pandemic Symptoms COVID variable often include fever cough fatigue breathing difficulties loss smell taste Symptoms begin one fourteen days exposure virus Around one five infected individuals develop symptoms While people mild symptoms people develop acute respiratory distress syndrome ARDS ARDS precipitated cytokine storms multi organ failure septic shock blood clots Longer term damage organs particular lungs heart observed There concern significant number patients recovered acute phase disease continue experience range effects known long COVID months afterwards These effects include severe fatigue memory loss cognitive issues low grade fever muscle weakness breathlessness The virus causes COVID spreads mainly infected person close contact another person Small droplets aerosols containing v

USING COSINE_SIM

In [None]:
# Read the uploaded Cambodia article into memory as a single string.
# NOTE(review): hard-coded Colab path; no explicit encoding is passed.
with open('/content/cambodia.txt') as f:
    text_camb = f.read()


In [None]:
# One entry per space-separated fragment of the Cambodia article.
# str.split already returns a new list, so the element-by-element copy loop
# is unnecessary. Splitting on a single ' ' (rather than on any whitespace)
# is kept: consecutive spaces yield empty strings and newlines stay inside
# fragments, exactly as before.
text_camb_list = text_camb.split(' ')


In [None]:
# One entry per space-separated fragment of the COVID text.
# str.split already returns a new list, so the element-by-element copy loop
# is unnecessary. Splitting on a single ' ' is kept so the resulting list is
# identical to the one the loop produced.
text_covid_list = text_covid.split(' ')

In [None]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer

def get_sim_matrix_cosine(tokenized_sents, threshold=0.3, tfidf=None):
    """Build a binary sentence-adjacency matrix from TF-IDF cosine similarity.

    Problems fixed relative to the previous version:
      * it used a global ``tfidf`` that this notebook never defines;
      * it passed a token list to ``transform``, which treats every token as
        a separate document instead of producing one vector per sentence;
      * it handed the resulting sparse matrices to ``cosine_sim``, which
        needs plain equal-length vectors;
      * every sentence was re-vectorized inside the inner loop.

    Args:
        tokenized_sents: List of token lists, one per sentence.
        threshold: Minimum cosine similarity for two sentences to be linked.
        tfidf: Optional already-fitted vectorizer whose ``transform`` accepts
            a list of strings. When omitted, a ``TfidfVectorizer`` is fitted
            on the given sentences.

    Returns:
        An ``(n, n)`` float array with 1 where the similarity reaches the
        threshold and 0 elsewhere. Sentences whose vector is all zeros (no
        known tokens) are linked to nothing.
    """
    num_sents = len(tokenized_sents)
    sim_mat = np.zeros((num_sents, num_sents))

    # Vectorizers work on strings, so rebuild one document per sentence.
    docs = [' '.join(tokens) for tokens in tokenized_sents]
    if not any(doc.strip() for doc in docs):
        # No tokens at all: fitting would fail on an empty vocabulary.
        return sim_mat

    if tfidf is None:
        # r"\S+" keeps every pre-tokenized word, including one-character
        # tokens that the default pattern would drop.
        tfidf = TfidfVectorizer(token_pattern=r"\S+")
        vectors = tfidf.fit_transform(docs).toarray()
    else:
        vectors = tfidf.transform(docs).toarray()

    # Rows without any weight have no direction; skip them rather than ask
    # cosine_sim to divide by a zero norm.
    has_weight = vectors.any(axis=1)

    for s1_id in range(num_sents):
        if not has_weight[s1_id]:
            continue
        for s2_id in range(num_sents):
            if not has_weight[s2_id]:
                continue
            if cosine_sim(vectors[s1_id], vectors[s2_id]) >= threshold:
                sim_mat[s1_id, s2_id] = 1
    return sim_mat

#text = text.strip('\n')


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Module-level TF-IDF vectorizer with default settings; it is fitted on the
# Cambodia fragments and reused to transform the COVID fragments below.
vectorizer = TfidfVectorizer()

In [None]:
# Fit the vocabulary on the Cambodia text and vectorize it. Each list element
# is treated as its own document, and text_camb_list was produced by splitting
# on single spaces, so every "document" is one space-free fragment.
X = vectorizer.fit_transform(text_camb_list)

In [None]:
# Vectorize the COVID fragments with the vocabulary learned from the
# Cambodia text; words unseen during fitting contribute nothing.
query_vec = vectorizer.transform(text_covid_list)
# Pairwise cosine similarity between every Cambodia fragment and every COVID
# fragment, flattened to one long 1-D array.
results = cosine_similarity(X,query_vec).reshape((-1,))
print("Array is ,",results)
# Sum of all pairwise similarities.
# NOTE(review): because each fragment is a single word, this presumably
# counts matching word pairs rather than measuring document-level
# similarity - confirm this is the intended metric.
print("total is ",np.sum(results))

Array is , [0. 0. 0. ... 0. 0. 0.]
total is  2005.54510739295


HOW COVID SPREADS

using similarity_index

In [None]:
def text_list2(text):
  """Split ``text`` on every '.' character and return the pieces as a list.

  The pieces keep their surrounding whitespace and do not include the
  separating periods; an empty input yields ``['']``.
  """
  return [piece for piece in text.split('.')]

In [None]:
def get_freqsum_summary(text,query):
    """Return the segment of ``text`` that shares the most words with ``query``.

    ``text`` is cut into segments with ``text_list2`` (split on '.'). Each
    segment and the query are tokenized with ``get_tokenized_sents`` (lower-
    cased, stopwords and punctuation removed) and compared with
    ``word_overlap``.

    Changes from the previous version:
      * the raw strings were passed to ``word_overlap``, which therefore
        counted shared *characters*; tokens are compared now;
      * when no segment overlapped, the function raised
        ``UnboundLocalError``; it now returns an empty string;
      * the term-frequency scoring that was computed but never used has been
        removed.

    NOTE: this definition shares its name with the earlier one-argument
    ``get_freqsum_summary`` and replaces it once this cell has run.

    Args:
        text: Raw document string.
        query: Free-text query.

    Returns:
        The best-matching segment as it appears in ``text`` (without the
        separating period), or ``""`` if no segment shares a token with the
        query. On ties the earliest segment wins.
    """
    query_tokens = [w for sent in get_tokenized_sents(query)[1] for w in sent]

    best_segment = ""
    best_overlap = 0
    for segment in text_list2(text):
        segment_tokens = [w for sent in get_tokenized_sents(segment)[1] for w in sent]
        overlap = word_overlap(query_tokens, segment_tokens)
        # Strict comparison keeps the first segment among equal scores.
        if overlap > best_overlap:
            best_overlap = overlap
            best_segment = segment
    return best_segment

# Re-read the raw article: text_covid was overwritten with its cleaned,
# punctuation-free version earlier, and the query search below splits the
# text on '.' characters.
with open('/content/covid.txt') as f:
    text_covid = f.read()
# Two-argument version of get_freqsum_summary defined in this cell.
get_freqsum_summary(text_covid,"How covid spreads")

'Coronavirus disease 2019 (COVID-19) is a contagious disease caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). The first case was identified in Wuhan, China, in December 2019. It has since spread worldwide, leading to an ongoing pandemic.'