---

In [14]:
import pandas as pd
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\duyen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
# Extract word vectors
word_embeddings = {}
file = open('../../glove.6B.200d.txt', encoding='utf-8')
for line in file:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
file.close()
len(word_embeddings)

400000

In [37]:
# function to remove stopwords
def remove_stopwords(sen):
    stop_words = stopwords.words('english')
    
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

In [43]:
def sentence_vector_func (sentences_cleaned) : 
    sentence_vector = []
    for i in sentences_cleaned:
        if len(i) != 0:
            v = sum([word_embeddings.get(w, np.zeros((200,))) for w in i.split()])/(len(i.split())+0.001)
        else:
            v = np.zeros((200,))
        sentence_vector.append(v)
    
    return (sentence_vector)

In [39]:
def custom_from_numpy_array(similarity_matrix):
    num_nodes = similarity_matrix.shape[0]
    G = nx.Graph()
    G.add_nodes_from(range(num_nodes))

    for i in range(num_nodes):
        for j in range(i+1, num_nodes):
            weight = similarity_matrix[i, j]
            G.add_edge(i, j, weight=weight)

    return G

In [45]:
def custom_pagerank(graph, alpha=0.85, max_iter=10, tol=1.0e-6):
    num_nodes = graph.number_of_nodes()
    pagerank = {node: 1 / num_nodes for node in graph.nodes()}

    for _ in range(max_iter):
        new_pagerank = {node: (1 - alpha) / num_nodes for node in graph.nodes()}

        for node in graph.nodes():
            degree = graph.degree(node, weight='weight')
            if degree != 0:
                for neighbor in graph.neighbors(node):
                    new_pagerank[neighbor] += alpha * pagerank[node] / degree

        residual = np.linalg.norm(np.array(list(new_pagerank.values())) - np.array(list(pagerank.values())))
        pagerank = new_pagerank

        if residual < tol:
            break

    return pagerank


In [46]:
def summary_text(test_text, n=5):
    sentences = []

    # Token hóa văn bản
    sentences.extend(sent_tokenize(test_text))
    sentences = [y for x in sentences for y in x]  # Flatten list

    # Loại bỏ dấu câu, số và ký tự đặc biệt
    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z0-9]", " ")

    # Chuyển đổi chữ in thành chữ thường
    clean_sentences = [s.lower() for s in clean_sentences]

    # Loại bỏ stopwords từ các câu
    clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

    # Hàm không được cung cấp, bạn cần thay thế bằng hàm tạo vectơ câu của mình
    sentence_vectors = sentence_vector_func(clean_sentences)

    # Ma trận tương đồng
    sim_mat = cosine_similarity(sentence_vectors, sentence_vectors)

    # Biểu diễn đồ thị từ ma trận tương đồng
    custom_graph = custom_from_numpy_array(sim_mat)

    # Áp dụng thuật toán Pagerank tùy chỉnh để tính điểm quan trọng
    pagerank_values = custom_pagerank(custom_graph)

    # Sắp xếp câu dựa trên điểm quan trọng
    ranked_sentences = sorted(((pagerank_values[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Trích xuất câu để tạo tóm tắt
    summarised_string = ''
    for i in range(min(n, len(ranked_sentences))):
        summarised_string = summarised_string + str(ranked_sentences[i][1])

    return summarised_string

In [49]:
text = '''The quality, type, and density of information conveyed via text varies from source to source. Textbooks tend to be low in density but high in quality, while academic articles are high in both quality and density. On the other hand, news articles can vary significantly from source to source. Regardless of where the text comes from the goal here is to minimize the time you spend reading. Thus, we will build a tool that can easily be adapted to any number of sources.'''

summary = summary_text(text, 3)

summary_sentences = summary.split('. ')
formatted_summary = '.\n'.join(summary_sentences)

print(formatted_summary)

yyy


---

In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

In [82]:
def summarize(text, per):
    nlp = spacy.load('en_core_web_sm')
    doc= nlp(text)
    # tạo từ điển để lưu lại tần số các từ
    word_frequencies={}
    for word in doc:
        if word.text.lower() not in list(STOP_WORDS):
            if word.text.lower() not in punctuation:
                if word.text not in word_frequencies.keys():
                    word_frequencies[word.text] = 1
                else:
                    word_frequencies[word.text] += 1
    #chuẩn hóa từ bằng cách chia tần suất max
    max_frequency=max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word]=word_frequencies[word]/max_frequency
    sentence_tokens= [sent for sent in doc.sents]
    # tính điểm = tổng tần suất từ trong câu
    sentence_scores = {}
    for sent in sentence_tokens:
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if sent not in sentence_scores.keys():                            
                    sentence_scores[sent]=word_frequencies[word.text.lower()]
                else:
                    sentence_scores[sent]+=word_frequencies[word.text.lower()]
    # xác định số câu và in ra các câu có số điểm từ cao nhất
    select_length=int(len(sentence_tokens)*per)
    summary=nlargest(select_length, sentence_scores,key=sentence_scores.get)
    final_summary=[word.text for word in summary]
    summary=' '.join(final_summary)
    return summary

In [64]:
text = '''The quality, type, and density of information conveyed via text varies from source to source. Textbooks tend to be low in density but high in quality, while academic articles are high in both quality and density. On the other hand, news articles can vary significantly from source to source. Regardless of where the text comes from the goal here is to minimize the time you spend reading. Thus, we will build a tool that can easily be adapted to any number of sources.'''

summary = summarize(text, 0.4)

summary_sentences = summary.split('. ')
formatted_summary = '.\n'.join(summary_sentences)

print("summary :")
print(formatted_summary)

summary :
Textbooks tend to be low in density but high in quality, while academic articles are high in both quality and density.
The quality, type, and density of information conveyed via text varies from source to source.


In [44]:
len(summary)

464

In [45]:
len(text)

468