In [1]:
from tqdm import tqdm
import matplotlib.pyplot as plt
from adjustText import adjust_text
import numpy as np
from collections import Counter, OrderedDict
from math import log

In [2]:
# get theta data

with open('theta.tsv','r', encoding='utf-8') as f_in:
    theta_data = f_in.read()

# very hacky! should be cleaned in pre-training textual data (if isn't already)
theta_data = theta_data.replace('*','')
theta_data = theta_data.replace('=','')

theta_rows = theta_data.split('\n')
theta_rows.pop(-1); # blank final row
theta_rows.pop(0); # header row with topic abbreviations
theta_rows.pop(0); # useless "!ctsdata" second header row

# print("first theta data row (100-char preview):\n%s\n" % theta_rows[0][:100])

In [3]:
def mk_flt(s): return float(s) if s else 0

In [4]:
# handy general function for getting max values of list in descending order
def indices_of_top_N_elements(L, N):
    return sorted(range(len(L)), key=lambda x: L[x], reverse=True)[:N]

# characterize document by (pythonic index!) numbers (INT) for max_N topics over threshold and corresponding percentage (STRING)
# for topic plot caption
def get_top_topic_indices(doc_id, doc_thetas, max_N=5, threshold=3):
# return list of tuples of type (%d, %s)
    indices_of_dominant_N_topics = indices_of_top_N_elements(L=doc_thetas, N=max_N)
    qualifying_indices = [  i
                            for i in indices_of_dominant_N_topics
                            if doc_thetas[i] >= threshold
                            ]
    return qualifying_indices

In [5]:
# my solution for adjusting overlapping labels

def update_box_position_values(pos_x, pos_y, label):
    right, left = pos_x+len(label)/4, pos_x-len(label)/4
    bottom, top = pos_y, pos_y+8
    return right, left, bottom, top

def anticipate_obstruction(interrupter_points, left, bottom, top):
    for (x,y) in interrupter_points:
        if x > left and (y >= bottom and y <= top):
            return True
    else: 
        return False

In [6]:
# set plot size

size_x, size_y = 1000, 700
my_dpi=150

generate_plots = True
curr_text_name = ''
if generate_plots:
    for row in tqdm(theta_rows[19470:]):

        cols = row.split('\t')

        plt.clf()
        fig = plt.figure( figsize=(size_x/my_dpi, size_y/my_dpi), dpi=my_dpi )

        doc_id = cols[1]

        # for keeping track of output progress
        text_name = doc_id[:doc_id.find('_')]
        if text_name != curr_text_name:
            print(text_name)
            curr_text_name = text_name
#         plt.title("document: " + doc_id)
        
        plt.ylabel('% of doc ' + doc_id)
        plt.xlabel('topic #')

        axes = plt.gca()
        ymin,ymax = 0,100
        axes.set_ylim([ymin,ymax])

        fig.patch.set_facecolor('white')

        topic_nums = range(1,76)

        doc_thetas_str_list = cols[3:]
        doc_thetas = [ mk_flt(val)*100 for val in doc_thetas_str_list ]

        plt.bar(topic_nums, doc_thetas)

        dominant_topic_indices = get_top_topic_indices(doc_id, doc_thetas)
        
        interrupter_points = []

        for container in axes.containers:

            for i, rect in enumerate(container.get_children()):

                if i not in dominant_topic_indices: continue
                
                height = rect.get_height()
                width = rect.get_width()
                x,y = rect.get_xy()
                pos_x = x + width/2
                pos_y = height+1
                label = '#{}'.format(i+1) + '\n({:.1f}%)'.format(height)
                right, left, bottom, top = update_box_position_values(pos_x, pos_y, label)

                while anticipate_obstruction(interrupter_points, left, bottom, top) == True:
                    pos_y += 4
                    bottom, top = pos_y, pos_y+8
                    right, left, bottom, top = update_box_position_values(pos_x, pos_y, label)
                
                axes.annotate(label, (pos_x, pos_y), ha='center', va='bottom')
                
                interrupter_points += [(right,bottom), (right, top)]
        
        plt.savefig('png/' + doc_id + ".png")
        plt.close(fig)

  0%|          | 0/8911 [00:00<?, ?it/s]

MīmBh


 19%|█▉        | 1736/8911 [07:17<27:10,  4.40it/s]  

NPS


 20%|█▉        | 1759/8911 [07:23<24:54,  4.79it/s]

YD


 27%|██▋       | 2394/8911 [10:09<23:29,  4.62it/s]  

NyKand


 37%|███▋      | 3266/8911 [13:53<21:57,  4.28it/s]  

TriṃśBh


 38%|███▊      | 3400/8911 [14:29<33:54,  2.71it/s]

ŚVK


 58%|█████▊    | 5197/8911 [22:24<14:03,  4.40it/s]   

NV


 74%|███████▍  | 6582/8911 [29:33<09:05,  4.27it/s]   

ViṃśV


 74%|███████▍  | 6618/8911 [29:41<08:43,  4.38it/s]

NBh


 83%|████████▎ | 7363/8911 [32:41<06:25,  4.02it/s]

SŚP


 85%|████████▍ | 7530/8911 [33:19<04:51,  4.73it/s]

VyV


100%|██████████| 8911/8911 [40:50<00:00,  3.64it/s]   


<Figure size 432x288 with 0 Axes>

In [7]:
# hereafter is manually filtering out words to improve phi interpretation
# (from old explore_topic_top_words.py)

In [8]:
# get phi data

with open('phi.csv','r') as f_in:
    phi_data = f_in.read()

phi_data = phi_data.replace('"','') # I think this here but not for theta because of way theta TSV was re-exported

phi_rows = phi_data.split('\n')
phi_rows.pop(-1); # blank final row

# print("first phi data row (100-char preview):\n%s\n" % phi_rows[0][:100])
# print("second phi data row (100-char preview):\n%s\n" % phi_rows[1][:100])

naive_topic_labels = phi_rows[0].split(',')[1:]
K = len(naive_topic_labels)

phis = OrderedDict()
for row in phi_rows[1:]:
    cells = row.split(',')
    word, phi_values = cells[0], cells[1:]
    phis[word] = [ float(ph) for ph in phi_values ]
vocab = list(phis.keys())

In [9]:
# prep true lexicon stats from theta data
doc_fulltext = []
for row in tqdm(theta_rows):
    cols = row.split('\t')
    doc_fulltext.append(cols[2])

corpus_string = ' '.join( doc_fulltext )
corpus_string.replace('  ',' ');
corpus_tokens = corpus_string.split()

num_tokens = len(corpus_tokens)
freq_w = Counter(corpus_tokens)
def prob(w): return (freq_w[w] / num_tokens)

100%|██████████| 28381/28381 [00:00<00:00, 140940.23it/s]


In [10]:
# relevance factor lambda by which phi values will be adjusted during consideration
L = 0.8

# 82 additional words to be filtered out during consideration (not used as stopwords in topic modeling)
unwanted_words = ['a', 'sva', 'tvam', 'tathā', 'syāt', 'evam', 'āha', 'tva', 'tatra', 'asti', 'yadi', 'kim', 'tasya', 'yathā', 'sa', 'ced', 'yat', 'atas', 'etat', 'katham', 'ayam', 'bhavati', 'atra', 'tasmāt', 'vat', 'tā', 'uktam', 'tatas', 'atha', 'tvena', 'tve', 'asya', 'nanu', 'punar', 'idam', 'tadā', 'ucyate', 'tena', 'tayā', 'tāvat', 'yaḥ', 'sati', 'saḥ', 'sā', 'ādeḥ', 'tarhi', 'ādīnām', 'iva', 'ityādi', 'anena', 'ādayaḥ', 'kutas', 'yatas', 'te', 'iha', 'kaḥ', 'asau', 'kvacid', 'ādau', 'teṣām', 'yatra', 'kaścid', 'yena', 'ādiṣu', 'yasya', 'yadā', 'iyam', 'ukta', 'khalu', 'tām', 'tvasya', 'kiñcid', 'ādikam', 'astu', 'bhavet', 'eṣa', 'ete', 'kintu', 'tam', 'tayoḥ', 'yasmāt', 'ye']

# depth of how many words to consider showing and how many to actually show for each topic during consideration
max_consider = 500
max_show = 300

In [11]:
# adjust for L, find new top words
# i.e., set the LDAvis slider

class Topic:
    def __init__(self):
        self.num = 0
        self.naive_label = "" # top_7_words
        self.adjusted_phis = OrderedDict()
        self.top_words = []
        self.filtered_adjusted_phis = OrderedDict()

Ts = []
for i in range(K):

    T = Topic()
    T.num = i + 1
    T.naive_label = naive_topic_labels[i]
    T.adjusted_phis = { word: (L) * log(phis[word][i]) + (1 - L) * log(phis[word][i] / prob(word)) for word in vocab }
    sorted_results = sorted(T.adjusted_phis.items(), key=lambda item: item[1], reverse=True)
    sorted_relevance_dict = { res[0]: res[1] for res in sorted_results }
    T.top_words = list(sorted_relevance_dict.keys())[:max_consider] # consider only max_consider words for each topic

    Ts.append(T)

In [12]:
# sanity check
i=3
word='artham'
print("old phi: ", phis['artha'][i])
print("prob: ", prob('artha'))
print("manual adjusted phi: ", (L) * log(phis[word][i]) + (1 - L) * log(phis[word][i] / prob(word)))

print("stored adjusted phi: ", Ts[i].adjusted_phis[word])

old phi:  0.00584789242497561
prob:  0.0061816347958662645
manual adjusted phi:  -13.252594575196175
stored adjusted phi:  -13.252594575196175


In [13]:
print([len(T.top_words) for T in Ts])

[500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500]


In [14]:
# filter out unwanted words, retain only max_show words for each topic

for T in Ts:
    filtered_top_words = []
    for t_w in T.top_words:
        if t_w in unwanted_words: continue
        filtered_top_words.append(t_w)
    T.top_words = filtered_top_words[:max_show]

In [15]:
print(len(Ts))
print(max_show)
print([len(T.top_words) for T in Ts])

75
300
[300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300]


In [16]:
# output 
# for this typically want smaller max_words
# (or could use larger and then delete manually)

with open('top_words_%d_%d_%.1f.txt' % (max_consider, max_show, L), 'w') as f_out:
    f_out.write( '\n'.join([ str(T.num) + '\t' + ' '.join(T.top_words) for T in Ts ]) )


In [17]:
for T in Ts:
    T.filtered_adjusted_phis = { word: T.adjusted_phis[word] for word in T.top_words }

In [18]:
from wordcloud import WordCloud

wordcloud = WordCloud(
#     regexp=r"[a-zāīūṛṝḷṅñṭḍṇśṣḥṃ]+",
#     regexp=r"[\u1E00-\u1EFF]+",
    font_path='HelveticaNeue.ttc',
    max_words=500, 
    background_color="white", 
    contour_color='steelblue', 
    contour_width=3, 
    width=800, 
    height=400
)

for i, T in enumerate(tqdm(Ts)):
    wordcloud.generate_from_frequencies(frequencies=T.filtered_adjusted_phis)
    wordcloud.to_file('cloud_pngs/topic_{:02}_wordcloud.png'.format(i+1))

100%|██████████| 75/75 [01:41<00:00,  1.36s/it]
