-
Notifications
You must be signed in to change notification settings - Fork 292
/
Copy pathdemo_time_plot.py
106 lines (90 loc) · 3.11 KB
/
demo_time_plot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import re
import scattertext as st
import spacy
# Blank English spaCy pipeline; the only component added here is the
# sentencizer.
nlp = spacy.blank('en')
nlp.add_pipe('sentencizer')

# Guardian headline sample data shipped with scattertext.
guardian_df = st.SampleCorpora.GuardianHeadlines.get_data()

# Add the parsed text plus three columns derived from each row's Date value.
# NOTE(review): `progress_apply` is provided by tqdm's pandas integration,
# which nothing in this file registers explicitly -- presumably importing
# scattertext enables it; confirm before relying on it.
headline_df = guardian_df.assign(
    Parse=lambda frame: frame['Text'].progress_apply(nlp),
    MonthNum=lambda frame: frame['Date'].apply(lambda date: date.month),
    Month=lambda frame: frame['Date'].apply(lambda date: date.strftime("%B-%Y")),
    DateStr=lambda frame: frame['Date'].apply(lambda date: date.strftime("%Y-%m-%d")),
)
#def exclude_ngrams_which_do_not_start_and_end_with_function_words(ngram: spacy.tokens.Span) -> bool:
# return len(set([ngram[0].pos_, ngram[-1].pos_]) \
# & {"DET", "AUX", "ADP", "AUX", "PRON", "PUNCT", 'CCONJ', 'PART'}) > 0
# Matches text consisting solely of ASCII letters, digits and spaces.
word_number_matcher = re.compile('^[A-Za-z0-9 ]+$')


def exclude_ngrams_which_do_not_start_and_end_with_function_words(ngram: spacy.tokens.Span) -> bool:
    """Return True when either boundary token of ``ngram`` is unsuitable.

    Only the first and last tokens are inspected. Each is lower-cased and
    stripped of surrounding whitespace, and the n-gram is flagged when that
    text either

    * appears in ``st.MY_ENGLISH_STOP_WORDS``, or
    * is not made up entirely of ASCII letters, digits and spaces (this
      includes text that is empty after stripping).

    NOTE(review): the function name suggests the opposite condition (keeping
    n-grams bounded by function words). The name is kept because it is the
    identifier passed as ``exclude_ngram_filter`` elsewhere in this script;
    presumably a True result means "drop this n-gram" -- confirm against
    ``FlexibleNGramFeatures``.

    :param ngram: candidate span; assumed to contain at least one token,
        since ``ngram[0]`` is read unconditionally.
    :return: True if the first or last token fails either check, else False.
    """
    # Normalise each boundary token once and stop at the first failing check,
    # rather than eagerly evaluating all four conditions into a list.
    for boundary_token in (ngram[0], ngram[-1]):
        text = boundary_token.lower_.strip()
        if text in st.MY_ENGLISH_STOP_WORDS:
            return True
        if word_number_matcher.match(text) is None:
            return True
    return False
# Candidate features: n-grams of one to five tokens, screened by the
# boundary-token filter defined earlier in this script.
ngram_features = st.FlexibleNGramFeatures(
    ngram_sizes=list(range(1, 6)),
    exclude_ngram_filter=exclude_ngrams_which_do_not_start_and_end_with_function_words,
)

# Build the corpus from the pre-parsed headlines, using the DateStr column
# as the category of each document.
corpus = st.OffsetCorpusFactory(
    headline_df,
    category_col='DateStr',
    parsed_col='Parse',
    feat_and_offset_getter=ngram_features,
).build()

# Successive pruning passes follow; every one is invoked with non_text=True.
corpus = corpus.compact(
    compactor=st.NPMICompactor(
        minimum_term_count=3,
        number_terms_per_length=2000,
    ),
    non_text=True,
)
corpus = corpus.compact(
    st.NgramPercentageCompactor(usage_portion=0.6),
    non_text=True,
)
# presumably each argument is a term string, so this removes terms that are
# a single character long -- TODO confirm against filter_out
corpus = corpus.filter_out(lambda term: len(term) == 1, non_text=True)
corpus = corpus.compact(
    compactor=st.AssociationCompactor(
        2000,
        scorer=st.DeltaJSDivergenceScorer,
        term_ranker=st.OncePerDocFrequencyRanker,
        use_non_text_features=True,
    ),
    non_text=True,
)
# Category labels of the corpus in ascending sorted order.
category_order = sorted(corpus.get_categories())

# Embedder handed to the grouper; scores terms with DeltaJSDivergenceScorer
# bound to this corpus.
rank_embedder = st.RankEmbedder(
    term_scorer=st.DeltaJSDivergenceScorer(corpus),
    rank_threshold=10,
)
grouper = st.CharacteristicGrouper(
    corpus,
    non_text=True,
    rank_embedder=rank_embedder,
    window_size=1,
    to_text=' to ',
)

# Ask the grouper for new document categories using five splits over the
# sorted category order; it returns the categories and their ordering.
heading_categories, heading_category_order = grouper.get_new_doc_categories(
    number_of_splits=5,
    category_order=category_order,
)
# Sorted category labels, computed the same way as `category_order`.
# NOTE(review): `order` is not referenced by any later statement in the
# visible part of this script.
order = sorted(corpus.get_categories())

# Settings for the trend plot: categories in sorted order, with the 'DA'
# dispersion metric scaled by dense rank.
trend_settings = st.TimePlotSettings(
    category_order=category_order,
    dispersion_metric='DA',
    dispersion_scaler=st.Scalers.dense_rank,
)

# Render the table visualisation to an HTML string.
html = st.produce_scattertext_table(
    corpus=corpus,
    heading_categories=heading_categories,
    heading_category_order=heading_category_order,
    category_order=category_order,
    all_category_scorer=lambda corp: st.AllCategoryTermScorer(
        corp, term_scorer=st.DeltaJSDivergenceScorer
    ),
    # Per-document metadata: the Date column rendered as strings.
    metadata=lambda corp: corp.get_df()['Date'].astype(str),
    ignore_categories=False,
    plot_width=1000,
    plot_height=400,
    top_terms_length=5,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    trend_plot_settings=trend_settings,
)
# Destination file for the rendered visualisation.
fn = 'demo_time_plot_da.html'

# Write with an explicit UTF-8 encoding: without it, open() falls back to the
# platform's locale encoding, which may be unable to represent every
# character in the generated page (or may produce a mis-encoded file).
with open(fn, 'w', encoding='utf-8') as of:
    of.write(html)

# Same message as before, now derived from `fn` so the two cannot drift apart.
print(f'Run ./open {fn}')