## C1: TextRank

In [3]:
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords

# Download the NLTK stopword corpus only when it is not already installed.
# nltk.data.find raises LookupError when the resource cannot be located, so the
# downloader (and its log output) is only triggered on a machine that lacks it.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# English stopwords as returned by NLTK; used by remove_stopwords().
stop_words = stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\duyen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load the pre-trained GloVe vectors into a {word: vector} lookup table.
# Each non-empty line is parsed as a word followed by its float components.
word_embeddings = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

In [5]:
def remove_stopwords(sen):
    """Drop stopwords from a tokenised sentence and return the rest as a string.

    Args:
        sen: iterable of word tokens (strings).

    Returns:
        str: the tokens that are not stopwords, in their original order and
        original casing, joined by single spaces. Empty string when nothing
        is left.
    """
    # The stopword list is lowercase, so compare on the lowercased token;
    # otherwise capitalised stopwords such as a sentence-initial "The" slip through.
    return " ".join(token for token in sen if token.lower() not in stop_words)

In [12]:
def generate_text_rank_summary(text, per):
    """Build an extractive summary of ``text`` with TextRank over GloVe vectors.

    The text is split into sentences on '.', each sentence is represented by
    the (smoothed) mean of the embeddings of its non-stopword tokens, a graph
    weighted by pairwise cosine similarity is built, and PageRank scores are
    used to pick the top sentences.

    Args:
        text: the document to summarise.
        per: fraction of sentences to keep; ``int(len(sentences) * per)``
            sentences are selected, which can be zero for short texts.

    Returns:
        str: the selected sentences in descending score order, joined by a
        period followed by a newline (the last sentence carries no trailing
        period). Empty string when no sentence is selected.
    """
    # Local import keeps this cell independent of imports made in later cells.
    from string import punctuation

    sentences = [part.strip() for part in text.split('.') if part.strip()]
    if not sentences:
        return ''

    # Take the vector size from the loaded embeddings instead of hard-coding it;
    # fall back to 100 (the size of glove.6B.100d) when the table is empty.
    dim = len(next(iter(word_embeddings.values()))) if word_embeddings else 100

    sentence_vectors = []
    for sentence in sentences:
        # The glove.6B vocabulary is uncased and the raw whitespace tokens still
        # carry adjacent punctuation ("quality,"), so normalise before any lookup;
        # otherwise such tokens silently map to the zero vector.
        tokens = [w.lower().strip(punctuation) for w in sentence.split()]
        content_words = remove_stopwords([t for t in tokens if t]).split()
        if content_words:
            # +0.001 mirrors the original smoothing of the mean.
            v = sum(word_embeddings.get(w, np.zeros((dim,))) for w in content_words) / (len(content_words) + 0.001)
        else:
            v = np.zeros((dim,))
        sentence_vectors.append(v)

    # One pairwise call replaces the nested loop of 1x1 similarity computations.
    sim_mat = cosine_similarity(np.vstack(sentence_vectors))
    # A sentence must not vote for itself.
    np.fill_diagonal(sim_mat, 0.0)

    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)

    num_sentences = int(len(sentences) * per)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)[:num_sentences]
    summary = '.\n'.join([sent for score, sent in ranked_sentences])

    return summary


In [13]:
# Sample passage used to demonstrate the TextRank summariser.
text = """
The quality, type, and density of information conveyed via text varies from source to source. 
Textbooks tend to be low in density but high in quality, while academic articles are high in both quality and density. 
On the other hand, news articles can vary significantly from source to source. 
Regardless of where the text comes from the goal here is to minimize the time you spend reading. 
Thus, we will build a tool that can easily be adapted to any number of sources.
"""

# Keep 40% of the sentences (per=0.4).
summary = generate_text_rank_summary(text,0.4)
# Re-split on '. ' and rejoin with a period + newline so each sentence prints on its own line.
summary_sentences = summary.split('. ')
formatted_summary = '.\n'.join(summary_sentences)
print("summary :")
print(formatted_summary)

summary :
On the other hand, news articles can vary significantly from source to source.
Thus, we will build a tool that can easily be adapted to any number of sources


---

## C2: Tính độ quan trọng dựa vào tần suất từ

In [2]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

In [5]:
def summarize(text, per):
    """Build an extractive summary by scoring sentences on word frequency.

    Every non-stopword, non-punctuation, non-whitespace token is counted
    (case-insensitively), counts are normalised by the highest count, and each
    sentence is scored as the sum of the normalised frequencies of its tokens.

    Args:
        text: the document to summarise.
        per: fraction of sentences to keep; ``int(n_sentences * per)``
            sentences are selected.

    Returns:
        str: the highest-scoring sentences in descending score order, each
        stripped of surrounding whitespace and joined by a single space.
        Empty string when the text contains no countable word.
    """
    # Note: the spaCy model is loaded on every call.
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)

    # Dictionary holding the frequency of each word. Keys are lowercased so
    # they match the lowercased lookups done while scoring sentences below.
    word_frequencies = {}
    for token in doc:
        key = token.text.lower()
        if not key.strip():
            # Whitespace-only tokens (e.g. newlines) are not words.
            continue
        if key in STOP_WORDS:
            continue
        if all(ch in punctuation for ch in key):
            continue
        word_frequencies[key] = word_frequencies.get(key, 0) + 1

    if not word_frequencies:
        # Nothing to score; avoids max() on an empty sequence.
        return ''

    # Normalise each count by dividing by the maximum frequency.
    max_frequency = max(word_frequencies.values())
    word_frequencies = {w: count / max_frequency for w, count in word_frequencies.items()}

    sentence_tokens = list(doc.sents)

    # Sentence score = sum of the normalised frequencies of its words.
    sentence_scores = {}
    for sent in sentence_tokens:
        for token in sent:
            weight = word_frequencies.get(token.text.lower())
            if weight is not None:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + weight

    # Decide how many sentences to keep and pick the highest-scoring ones.
    select_length = int(len(sentence_tokens) * per)
    best_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get)

    # Join with a space so sentences do not run together when a sentence span
    # does not happen to end in a newline token.
    return ' '.join(sent.text.strip() for sent in best_sentences)

In [6]:
# Sample news passage used to demonstrate the frequency-based summariser.
text = '''
Yet in truth, the most useful headline for Kyiv should be how unutterably bleak the frontlines are for them now. 
In nearly every direction, the news is grim. Russian forces are hiving off parts of the eastern city of Avdiivka, yet another town Moscow seems content to throw thousands of lives at despite its minimal importance.
Along the Zaporizhzhia frontline, where the counteroffensive was focused but ultimately slow and unrewarding, Russian units have come back with renewed vigor and the defense is costly for Ukraine.
Ukraine has made a plucky (or foolhardy) dash across the Dnipro River, with some small progress into Russian lines.
The casualties have been immense, their supply lines are problematic, and their prospects dim.'''

# Keep 40% of the sentences (per=0.4).
summary = summarize(text, 0.4)

# Re-split on '. ' and rejoin with a period + newline so each sentence prints on its own line.
summary_sentences = summary.split('. ')
formatted_summary = '.\n'.join(summary_sentences)
print("summary :")
print(formatted_summary)

summary :
Russian forces are hiving off parts of the eastern city of Avdiivka, yet another town Moscow seems content to throw thousands of lives at despite its minimal importance.
Along the Zaporizhzhia frontline, where the counteroffensive was focused but ultimately slow and unrewarding, Russian units have come back with renewed vigor and the defense is costly for Ukraine.

