In [7]:
import numpy as np
import pandas as pd
import nltk 
nltk.download('punkt')
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Zehra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Read data
df = pd.read_csv('articles.csv',encoding='latin1')
df.head()

Unnamed: 0,article_id,article_title,article_text,source
0,1,How vaccinating monkeys could stop a pandemic,"Yellow fever kills some 15% of those infected,...",https://www.bbc.com/future/article/20210208-ye...
1,2,The disease-resistant patients exposing Covid-...,Some people are unusually resilient to the cor...,https://www.bbc.com/future/article/20210219-th...
2,3,Covid-19 paused climate emissions  but they'r...,The world's sudden launch into lockdown a year...,https://www.bbc.com/future/article/20210312-co...
3,4,What we know and don't know about Covid-19,"Over the past year, BBC Future has aimed to go...",https://www.bbc.com/future/article/20210224-th...
4,5,The remarkable power of Australian kelp,"Algae is a powerhouse for the climate, sending...",https://www.bbc.com/future/article/20210406-ho...


In [9]:
# Article example
df['article_text'][0]

"Yellow fever kills some 15% of those infected, but has an effective vaccine. Barriers to vaccinating people in potential hotspots means scientists are turning to a surprising alternative: vaccinating monkeys.\nOn a cloudy morning in October, a team of scientists set off into Brazil's Atlantic Forest, looking for monkeys. One man carried what looked like an old TV antenna and a machete. A woman beside him held a small metal cage \x96 a trap \x96 and two bags full of bananas.\n\nTheir mission: stop the next outbreak of yellow fever in monkeys before it spreads to humans.\n\nBrazil may be trying to cope with the second-highest rate of Covid-19 deaths in the world, after the United States. But the scientists fear this other, far more lethal disease is in danger of erupting in the South American country once again. Yellow fever infects some 200,000 people and kills 30,000 of them each year, more than terrorist attacks and plane crashes combined.\n\nCaused by a virus spread between humans a

In [10]:
# Splitting text into sentences
from nltk.tokenize import sent_tokenize
sentences = []
for s in df['article_text']:
    sentences.append(sent_tokenize(s))
    
sentences = [y for x in sentences for y in x]

In [11]:
# Sentence example
sentences[:3]

['Yellow fever kills some 15% of those infected, but has an effective vaccine.',
 'Barriers to vaccinating people in potential hotspots means scientists are turning to a surprising alternative: vaccinating monkeys.',
 "On a cloudy morning in October, a team of scientists set off into Brazil's Atlantic Forest, looking for monkeys."]

In [13]:
# Extracting word vectors
word_embeddings = {}
f = open('glove.6B.100d.txt', encoding = 'utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype = 'float32')
    word_embeddings[word] = coefs
f.close()

len(word_embeddings)

400000

In [14]:
# Removing punctuation, numbers, and special expressions
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

# Making all letters in lowercase
clean_sentences = [s.lower() for s in clean_sentences]

In [15]:
# Nltk-stopwords is downloaded to extract common words (is, am, the, of, in, etc.) in sentences.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zehra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# Removal of stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

In [18]:
# Creation of vectors for sentences
sentence_vectors = [] 
for i in clean_sentences:
    if len(i) != 0:
        v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
    else:
        v= np.zeros((100,))
    sentence_vectors.append(v)

In [19]:
# Similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])

In [20]:
# Cosine Similarity is used to calculate the similarity between two sentences.
from sklearn.metrics.pairwise import cosine_similarity

for i in range(len(sentences)):
    for j in range(len(sentences)):
        if i != j:
            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

In [21]:
# Implementing the PageRank Algorithm
import networkx as nx

nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [24]:
# To obtain the summary, the first 10 sentences in the sequence are printed.
sentences[:10]

['Yellow fever kills some 15% of those infected, but has an effective vaccine.',
 'Barriers to vaccinating people in potential hotspots means scientists are turning to a surprising alternative: vaccinating monkeys.',
 "On a cloudy morning in October, a team of scientists set off into Brazil's Atlantic Forest, looking for monkeys.",
 'One man carried what looked like an old TV antenna and a machete.',
 'A woman beside him held a small metal cage \x96 a trap \x96 and two bags full of bananas.',
 'Their mission: stop the next outbreak of yellow fever in monkeys before it spreads to humans.',
 'Brazil may be trying to cope with the second-highest rate of Covid-19 deaths in the world, after the United States.',
 'But the scientists fear this other, far more lethal disease is in danger of erupting in the South American country once again.',
 'Yellow fever infects some 200,000 people and kills 30,000 of them each year, more than terrorist attacks and plane crashes combined.',
 'Caused by a vi