-
Notifications
You must be signed in to change notification settings - Fork 0
/
summary_word_freq.py
65 lines (44 loc) · 2.01 KB
/
summary_word_freq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import spacy
import re
from collections import Counter
import pprint
import pandas as pd
import nlp_helpers
def summary_word_freq(path: str):
#pprint.pprint("="*80)
# We need 3 forms of text
# a) normalized and lemmatized - to count number of occurance each words in whole text
# b) normalized and lemmatized where end of each sentence is broken by punkt - to count how many most important words
# occured in each sentance
# c) original text (out of '\n' and other edition signs) - to take most important sentence in original form
# 0. web scrapping
nlp = spacy.load("pl_core_news_sm")
#url = "https://wpolityce.pl/polityka/674881-sztuka-przejmowania-wladzy"
url = path
text = nlp_helpers.web_scrapping(url)
text_nlp = nlp(text)
text_c_form = text_nlp.doc
text_b_form = next(nlp_helpers.gen_spacy_lemma(text_c_form))
text_a_form_tmp = " ".join([word for word in text_b_form.split() if word not in nlp_helpers.stopwords]).lower()
pattern = '[^a-ź]'
text_a_form = re.sub(pattern, ' ', text_a_form_tmp)
# 1. analysis of frequency of occurrence words in whole text
words = Counter()
words.update(text_a_form.split())
# take words only appear more than 8
most_common_words = {i[0]: i[1] for i in words.items() if i[1]>8}
# 2. take sentences where most frequent word appear
summary = []
# znajdź zdania zawierające najwięcej punktów (najwięcej słów kluczowych zapisanych w zmiennej new)
for sentence in text_b_form.lower().split('.'):
summary.append(nlp_helpers.score_sentence(sentence, most_common_words))
idx = sorted(range(len(summary)), key=lambda k: summary[k], reverse=True)
nr_of_most_imp_sent = 5
idx = sorted(idx[:nr_of_most_imp_sent])
sents_in_list = list(text_c_form.sents)
summary_text = " ".join([str(sents_in_list[i]) for i in idx]).replace('\xa0', ' ')
#print(summary_text)
return summary_text
if __name__ == "__main__":
print(summary_word_freq("https://wpolityce.pl/polityka/674881-sztuka-przejmowania-wladzy"))
stop = 0