# 6_reduced_lexicon_lda.py
"""
Zachary Kimo Stine, 2021-01-19
Train LDA models on modified corpus. Closely follows the other LDA script, but with a few changes specific to the modified corpus.
"""
import csv
import datetime
import logging
import os
import random

import gensim
import numpy as np

import cons


class ReducedCorpusFromIndices(object):
    """
    Streams bag-of-words documents from the per-subreddit reduced corpora in the order given by
    corpus_sub_indices, a list of (subreddit, doc_index) tuples.
    """
    def __init__(self, subreddit_list, corpus_sub_indices):
        self.subreddit_list = subreddit_list
        self.corpus_sub_indices = corpus_sub_indices

        # Load each subreddit's reduced corpus once so that documents can be fetched by index during iteration.
        self.corpus_dict = {}
        for subreddit in subreddit_list:
            subreddit_corpus_fpath = os.path.join(cons.corpora_dir, subreddit, 'reduced_corpus_001',
                                                  'reduced_corpus_001')
            subreddit_corpus = gensim.corpora.MmCorpus(subreddit_corpus_fpath)
            self.corpus_dict[subreddit] = subreddit_corpus

    def __iter__(self):
        for (subreddit, doc_index) in self.corpus_sub_indices:
            yield self.corpus_dict[subreddit][doc_index]
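# Note on the indexing in __iter__ above: gensim's MmCorpus supports random access by document number
# (corpus[doc_index]) as long as the corpus was serialized alongside its .index file, which this script
# assumes is the case for each reduced corpus. A minimal sketch of that assumption, with a hypothetical path:
#
#     corpus = gensim.corpora.MmCorpus('corpora/some_subreddit/reduced_corpus_001/reduced_corpus_001')
#     bow_doc = corpus[42]  # the 43rd document as a list of (word_id, word_count) pairs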


def get_doc_len(bow_doc):
    """
    Returns document length (i.e., the total number of tokens) of a bag-of-words document.
    """
    doc_len = sum(word_count for (word_id, word_count) in bow_doc)
    return doc_len
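# Worked example with illustrative values: get_doc_len([(0, 2), (5, 3), (9, 1)]) returns 6, since the
# document contains two tokens of word 0, three of word 5, and one of word 9.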


def build_subreddit_document_list_reduced(subreddit_list, sample_subs, sample_size, min_len=50.0):
    """
    Generates the list of (subreddit, document index) pairs to be included in the training corpus and randomly
    shuffles it.
    """
    sub_index_list = []
    for subreddit in subreddit_list:
        # Only consider including a document if its reduced length is at least min_len tokens (50 by default).
        subreddit_reduced_metadata_path = os.path.join(cons.corpora_dir, subreddit, 'reduced_corpus_001',
                                                       'reduced_corpus_001_metadata.csv')
        subreddit_reduced_metadata = read_corpus_metadata(subreddit_reduced_metadata_path)
        subreddit_allowed_corpus_indices = [int(doc_data['corpus_index']) for doc_data in subreddit_reduced_metadata
                                            if float(doc_data['reduced_doc_length']) >= min_len]
        print(subreddit + ' has ' + str(len(subreddit_allowed_corpus_indices)) + ' docs that are >= the minimum length.')

        # Sample a smaller number of documents (set by sample_size) from subreddits in the sample_subs list.
        # Otherwise, keep all allowable documents.
        if subreddit in sample_subs:
            random_indices = np.random.choice(subreddit_allowed_corpus_indices, sample_size, replace=False)
            sub_index_list += [(subreddit, rand_i) for rand_i in random_indices]
        else:
            sub_index_list += [(subreddit, doc_i) for doc_i in subreddit_allowed_corpus_indices]

    # Randomize document order before model training.
    random.shuffle(sub_index_list)
    return sub_index_list
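# Illustrative return value (subreddit names and indices here are hypothetical):
#     [('christianity', 4821), ('buddhism', 17), ('christianity', 903), ...]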


def write_top_words(model, model_dir, k, num_words=500):
    """
    Writes highest-probability words and their probabilities within each topic to file.
    """
    for topic_index in range(k):
        # Make directory to store all topic-specific data
        topic_dir = os.path.join(model_dir, 'topic_' + f'{topic_index:02d}')
        cons.makedir(topic_dir)

        # Get the highest probability terms from this topic
        topic_words = model.show_topic(topic_index, num_words)

        # Path to CSV file that will hold the topic words and probabilities:
        wordlist_fname = os.path.join(topic_dir, 'topic_' + f'{topic_index:02d}' + '_word_list.csv')

        # Write top terms to csv file.
        with open(wordlist_fname, 'w', newline='', encoding='utf-8') as ofile:
            f_writer = csv.writer(ofile)
            f_writer.writerow(['word', 'probability'])
            for (word, prob) in topic_words:
                f_writer.writerow([word, prob])
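# gensim's LdaModel.show_topic(topic_index, topn) returns the topn most probable words in a topic as
# (word, probability) tuples, so each CSV row written above holds one word and its probability.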


def read_corpus_metadata(fpath):
    """
    Reads in corpus metadata.
    """
    data_list = []
    with open(fpath, 'r', encoding='utf-8') as infile:
        freader = csv.DictReader(infile)
        for row_dict in freader:
            data_list.append(row_dict)
    return data_list
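# Since csv.DictReader is used, each row comes back as a dict keyed by column name. The metadata CSV is
# assumed to contain at least the two columns used in build_subreddit_document_list_reduced above:
# 'corpus_index' and 'reduced_doc_length'.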


if __name__ == '__main__':
    subreddit_list = cons.subreddit_list
    k_list = [20, 100, 250]
    models_per_k = 10
    downsample_subs = ['christianity']
    downsample_size = 75000

    # LDA arguments.
    passes = 5
    iterations = 200
    chunks = 2000
    eval_every = None
    min_prob = 1e-10

    dictionary = gensim.corpora.Dictionary.load(os.path.join(cons.corpora_dir, 'dictionary.dict'))
    combo_models_dir = cons.makedir(os.path.join(cons.lda_dir, 'combined_subs_reduced_001'))

    # Set up a logger for all of the combined, reduced-corpus LDA models.
    log_fpath = os.path.join(combo_models_dir, 'reduced_001_combo_models.log')
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO,
                        handlers=[logging.FileHandler(log_fpath, 'w', 'utf-8')])
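
    # gensim emits its training diagnostics through the standard logging module at INFO level, so the
    # FileHandler above should capture per-pass progress for every model trained below.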

    # Since we are going to be sampling documents from a few subreddits, we will train models with the same # of
    # topics (k) on a few different samples to make sure results are not overly specific to a particular sample.
    # The number of samples is defined by models_per_k.
    for model_pass in range(models_per_k):
        sample_num = model_pass + 1
        sample_random_seed = np.random.choice(1000)

        current_time = datetime.datetime.now()
        print(str(current_time) + ' : starting combined corpus sample ' + str(sample_num))
        print('--------------------')

        sample_dir = cons.makedir(os.path.join(combo_models_dir, 'combo_' + str(sample_num)))

        # Create training corpus by sampling documents from the designated subreddits and using all from the others.
        # Then, randomize the order of the resulting documents for training.
        corpus_sub_indices = build_subreddit_document_list_reduced(subreddit_list,
                                                                   downsample_subs,
                                                                   downsample_size,
                                                                   min_len=50.0)
        print('sample ' + str(sample_num) + ' has ' + str(len(corpus_sub_indices)) + ' documents.')
        combo_corpus = ReducedCorpusFromIndices(subreddit_list, corpus_sub_indices)
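
        # Because passes > 1 below, gensim will iterate over the corpus several times, so combo_corpus must be
        # re-iterable rather than a one-shot generator; ReducedCorpusFromIndices satisfies this since each call
        # to __iter__ starts a fresh pass over corpus_sub_indices.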

        for k in k_list:
            print(str(datetime.datetime.now()) + ' starting model with ' + str(k) + ' topics.')
            model_name = 'combo_' + str(sample_num) + '-' + str(k)
            print(model_name)

            # Create model directory
            model_dir = cons.makedir(os.path.join(sample_dir, model_name))

            # Train and save LDA model. alpha='auto' lets gensim learn an asymmetric document-topic prior
            # from the data during training.
            logging.info('START ' + model_name)
            lda_model = gensim.models.LdaModel(combo_corpus, num_topics=k, id2word=dictionary, passes=passes,
                                               iterations=iterations, chunksize=chunks,
                                               eval_every=eval_every, minimum_probability=min_prob,
                                               alpha='auto', random_state=sample_random_seed)
            logging.info('END ' + model_name)
            lda_model.save(os.path.join(model_dir, 'model_files'))

            # Write summary information about the model to file.
            model_description = {'k': k,
                                 'random_state': sample_random_seed,
                                 'passes': passes,
                                 'iterations': iterations,
                                 'chunksize': chunks,
                                 'eval_every': eval_every}
            with open(os.path.join(model_dir, 'model_description.txt'), 'w') as outfile:
                outfile.write(str(model_description))

            # Write highest probability words to file for reference.
            write_top_words(lda_model, model_dir, k, num_words=100)

        print('-------------\n')