In [1]:
import math
import tqdm
import numpy as np

from fastdist import fastdist

from get_size import get_size as mem_size

In [2]:
# get theta data

with open('theta.tsv','r') as f_in:
    theta_data = f_in.read()

theta_data = theta_data.replace('*','') # very hacky, should be cleaned in data itself

theta_rows = theta_data.split('\n')
theta_rows.pop(-1); # blank final row
theta_rows.pop(0); # header row with topic abbreviations
theta_rows.pop(0); # useless "!ctsdata" second header row

# print("first theta data row (100-char preview):\n%s\n" % theta_rows[0][:100])
print("mem_size of theta_rows: %s" % f"{ mem_size(theta_rows) :,d}")

mem_size of theta_rows: 44,560,027


In [3]:
# from theta data, get doc ids, doc full-text, and theta numbers

from collections import OrderedDict

doc_ids = []
doc_fulltext = OrderedDict() # e.g. doc_fulltext[DOC_ID]
thetas = {} # e.g. theta[DOC_ID]

for row in theta_rows:

    cells = row.split('\t') # must have been converted to TSV first!
    doc_id, doc_text, theta_values = cells[1], cells[2], cells[3:]
    # don't need cells[0] which would be doc_num
    
    doc_ids.append(doc_id)
    doc_fulltext[doc_id] = doc_text
    thetas[doc_id] = [ float(th) for th in theta_values ]

print("len(doc_ids): ", len(doc_ids))
# print("mem_size of doc_ids: %s" % f"{ mem_size(doc_ids) :,d}")
# print("mem_size of doc_fulltext: %s" % f"{ mem_size(doc_fulltext) :,d}")
print("mem_size of thetas: %s" % f"{ mem_size(thetas) :,d}")

len(doc_ids):  15938
mem_size of thetas: 29,161,950


In [4]:
# overall corpus string and list of all words

corpus_words_string = ' '.join( doc_fulltext.values() )
corpus_words_string.replace('  ',' ');

corpus_words_list = corpus_words_string.split()

corpus_vocab = list(set(corpus_words_list))
corpus_vocab.sort()

from copy import copy
corpus_vocab_reduced = copy(corpus_vocab)

# print("num of chars in corpus_words_string: %s" % f"{len(corpus_words_string):,d}")
# print("corpus_words_string preview (100-char):\n%s" % corpus_words_string[:100])
print("mem_size of corpus_words_string: %s" % f"{ mem_size(corpus_words_string) :,d}")
# print()
# print("num of words in corpus_words_list: %s" % f"{len(corpus_words_list):,d}")
# print("corpus_words_list preview (100-word):\n%s" % corpus_words_list[:100])
print("mem_size of corpus_words_list: %s" % f"{ mem_size(corpus_words_list) :,d}")
# print()
# print("num of words in corpus_vocab: %s" % f"{len(corpus_vocab):,d}")
# print("corpus_vocab preview (100-word):\n%s" % corpus_vocab[:100])
print("mem_size of corpus_vocab: %s" % f"{ mem_size(corpus_vocab) :,d}")

mem_size of corpus_words_string: 15,834,636
mem_size of corpus_words_list: 94,013,094
mem_size of corpus_vocab: 6,294,099


In [5]:
# create word frequency dictionary for words in entire corpus

from collections import Counter

freq_w = Counter(corpus_words_list)

# print("frequency of word “artha”: %s" % f"{freq_w['artha']:,d}")
# print("frequency of word “maṅgalena”: %s" %freq_w['maṅgalena'])
# print("frequency of string “:”: %s" % freq_w[':'])

# freq_w_first_100 = {k:freq_w[k] for k in list(freq_w.keys())[:100]}
# print("num of words in freq_w: %s" % f"{ len(freq_w) :,d}")
# print("freq_w preview (100-word):\n%s" % freq_w_first_100)

print("mem_size of freq_w: %s" % f"{ mem_size(freq_w) :,d}")

mem_size of freq_w: 8,398,551


In [6]:
# load phi data

with open('phi.csv','r') as f_in:
    phi_data = f_in.read()

phi_data = phi_data.replace('"','') # I think this here but not for theta because of way theta TSV was re-exported

phi_rows = phi_data.split('\n')
phi_rows.pop(-1); # blank final row
phi_rows.pop(0);

# print("first phi data row (100-char preview):\n%s\n" % phi_rows[0][:100])
print("mem_size of phi_rows: %s" % f"{ mem_size(phi_rows) :,d}")

mem_size of phi_rows: 42,971,614


In [7]:
# store phi data

phis = {} # e.g., phis[WORD][TOPIC_NUM-1] = P(w|t) conditional probability

for row in phi_rows:

    cells = row.split(',')
    word, phi_values = cells[0], cells[1:]

    phis[word] = [ float(ph) for ph in phi_values ]

# print("size of phis: %s" % f"{ get_size(phis) :,d}")
print("mem_size of phis: %s" % f"{ mem_size(phis) :,d}")

100%|██████████| 22123/22123 [00:00<00:00, 62122.63it/s]


mem_size of phis: 41,199,601


In [8]:
# note: words with freq_w < 3 were excluded from topic modeling and so will not be in phi_data, phis

# the same goes for stopwords
stopwords = ['iti', 'na', 'ca', 'api', 'eva', 'tad', 'tvāt', 'tat', 'hi', 'ādi', 'tu', 'vā'] # used in topic modeling

# and here are a few errors that need to be treated in the same way
error_words = [':', '*tat', 'eva*', '*atha', ')'] # fix in the data!

# however, all of these WILL be found when going through the full-text in the theta file, which > corpus_vocab
# so make another version that reflects this

corpus_vocab_reduced = [
    word 
    for word in corpus_vocab 
        if not (word in (stopwords + error_words) or freq_w[word] < 3)
]

# print(P_w_t['tattva'][27])
# print('maṅgalena' in P_w_t) # False because freq_w < 3
# print('tad' in P_w_t) # False because in stopwords
# print(':' in P_w_t) # False because in error_words

print("len(corpus_vocab): ", len(corpus_vocab))
print("len(corpus_vocab_reduced): ", len(corpus_vocab_reduced))
print("mem_size of corpus_vocab_reduced: %s" % f"{ mem_size(corpus_vocab_reduced) :,d}")

len(corpus_vocab):  67483
len(corpus_vocab_reduced):  22123
mem_size of corpus_vocab_reduced: 2,015,249


In [9]:
# 0) given a query document

query_id = "NBhū_104,6^1"
query_text = doc_fulltext[query_id]

In [19]:
# 1) compare by topic proportions (i.e., theta values)

N = 10 # number of closest docs to find

query_vector = np.array(thetas[query_id])
topic_similiarity_score = {} # e.g. topic_similiarity_score[DOC_ID] = FLOAT

topic_candidate_vectors = []
for doc_id in tqdm.tqdm(doc_ids):
    candidate_vector = np.array(thetas[doc_id]) # dimensionality = k, number of topics
    topic_candidate_vectors.append(candidate_vector)
    topic_similiarity_score[doc_id] = fastdist.cosine(query_vector, candidate_vector)
    
sorted_results = sorted(topic_similiarity_score.items(), key=lambda item: item[1], reverse=True)
ids_for_closest_N_docs_by_topics = [ res[0] for res in sorted_results[:N+1] ][1:] # omit first which is query itself

for id in ids_for_closest_N_docs_by_topics[:20]:
    print("score for %s: %f" % (id, topic_similiarity_score[id]))
print("mem_size of topic_candidate_vectors: %s" % f"{ mem_size(topic_candidate_vectors) :,d}")

100%|██████████| 15938/15938 [00:00<00:00, 153356.04it/s]


score for PVin_I,034,i: 0.971307
score for ViṃśV_87,i_89,ii: 0.968265
score for NV_478,04_478,05: 0.932906
score for NV_205,20^2: 0.930796
score for TriṃśBh_44,iii_44,vi: 0.927448
score for NBh_1046,i_1046,iii: 0.926241
score for NBh_1049,iii_1050,i: 0.926162
score for NBh_1047,i_1047,ii: 0.925879
score for NV_473,01_473,02: 0.923914
score for ViṃśV_93,i_95,i: 0.922694
mem_size of topic_candidate_vectors: 33,673,072


In [11]:
# 2) compare by tf.idf scores

# first calculate all IDF

IDF = {} # e.g. IDF[WORD] = FLOAT for each word in vocab
docs_containing = {} # e.g. docs_containing[WORD] = INT for each word in vocab

for doc_id in doc_ids:
    
    doc_text = doc_fulltext[doc_id]
    doc_words = doc_text.split()
    unique_doc_words = list(set(doc_words))

    for word in unique_doc_words:

        # increment docs_containing tally
        if word in docs_containing:
            docs_containing[word] += 1
        else:
            docs_containing[word] = 1

total_num_docs = len(doc_ids)

for word in corpus_vocab:
    IDF[word] = math.log(total_num_docs / docs_containing[word])

print("mem_size of IDF: %s" % f"{ mem_size(IDF) :,d}")

mem_size of IDF: 9,995,307


In [12]:
# all_freqs = sorted(freq_w.values())
# freq_counter = Counter(all_freqs)
# print("num of distinct freqs: ", len(freq_counter))
# print("freq_counter: ", freq_counter)
# keylist = list(freq_counter.keys())

# from collections import OrderedDict
# freq_counter_subset = OrderedDict()
# for key in keylist[:100]:
#     freq_counter_subset[key] = freq_counter[key]
# print("freq_counter_subset: ", freq_counter_subset)

# import matplotlib.pyplot as plt
# labels, values = zip(*freq_counter_subset.items())
# indexes = np.arange(len(labels))
# width = 1
# plt.figure(figsize=(15,4))
# plt.bar(indexes, values, width)
# plt.xticks(indexes + width * 0.5, labels);
# plt.show()

# # corpus_vocab_reduced = []

In [13]:
# also prepare function for calculating tf.idf vector for any given doc

def get_TF_IDF_vector(doc_id):
# returns numpy array
    
    doc_text = doc_fulltext[doc_id]
    doc_words = doc_text.split()
    unique_doc_words = list(set(doc_words))

    total_doc_word_len = len(doc_words)

    TF_IDF_dict = {} # e.g. TF_IDF_dict[WORD] = [FLOAT, FLOAT, FLOAT, FLOAT, ... FLOAT]
    for word in unique_doc_words:
        TF_d_w = doc_words.count(word) / total_doc_word_len
        TF_IDF_dict[word] = TF_d_w * IDF[word]
    
    TF_IDF_vector = np.zeros( len(corpus_vocab_reduced) )
    # e.g. TF_IDF_vector[WORD] = [0, 0, 0, ... FLOAT, 0, 0, ... FLOAT, 0, ... 0]
    
    for word in TF_IDF_dict.keys():
        if word in corpus_vocab_reduced:
            i = corpus_vocab_reduced.index(word) # alphabetical index
            TF_IDF_vector[i] = TF_IDF_dict[word]
    
    return TF_IDF_vector

In [14]:
# now do actual tf.idf vector comparison on docs already selected above in topic comparison
# relatively slow and expensive

query_vector = get_TF_IDF_vector(query_id)
TF_IDF_candidate_vectors = []
doc_ids_for_comparison = ids_for_closest_N_docs_by_topics
TF_IDF_comparison_scores = {} # e.g. tf_idf_score[DOC_ID] = FLOAT

for doc_id in tqdm.tqdm(doc_ids_for_comparison):
    candidate_vector = get_TF_IDF_vector(doc_id) 
    TF_IDF_candidate_vectors.append(candidate_vector)
    TF_IDF_comparison_scores[doc_id] = fastdist.cosine(query_vector, candidate_vector)

sorted_results = sorted(TF_IDF_comparison_scores.items(), key=lambda item: item[1], reverse=True)

print("mem_size of TF_IDF_candidate_vectors: %s" % f"{ mem_size(TF_IDF_candidate_vectors) :,d}")

100%|██████████| 10/10 [00:00<00:00, 18.73it/s]


mem_size of TF_IDF_candidate_vectors: 8,236,792


In [15]:
ids_for_closest_N_docs_by_TF_IDF = [ res[0] for res in sorted_results ]
for id in ids_for_closest_N_docs_by_topics[:20]:
    print("score for %s: %f" % (id, TF_IDF_comparison_scores[id]))

score for PVin_I,034,i: 0.674471
score for ViṃśV_87,i_89,ii: 0.020631
score for NV_478,04_478,05: 0.073989
score for NV_205,20^2: 0.125342
score for TriṃśBh_44,iii_44,vi: 0.022004
score for NBh_1046,i_1046,iii: 0.178364
score for NBh_1049,iii_1050,i: 0.265177
score for NBh_1047,i_1047,ii: 0.125379
score for NV_473,01_473,02: 0.086711
score for ViṃśV_93,i_95,i: 0.024244


In [16]:
# 3) compare by contextual IC score

# first prepare function for calculating IC vector for any given doc

def get_IC_vector(doc_id):
# returns numpy array
    
    doc_text = doc_fulltext[doc_id]
    doc_words = doc_text.split()
    unique_doc_words = list(set(doc_words))

    IC_dict = {} # e.g. IC_dict[WORD] = FLOAT

    for word in unique_doc_words: # should this be doc_words instead ??

        if (freq_w[word] < 3
            or word in stopwords
            or word in error_words
           ): # was excluded in modeling, so neither in phi_data nor in P_w_t
            continue

        if word not in phis:
            import pdb;pdb.set_trace()
            # stop and add to error_words
            
        IC_dict[word] = 0
        k = len(phis[word])
        for i in range(0,k):
            weight = thetas[doc_id][i]
            IC_for_word_given_topic = -( weight * math.log( phis[word][i] , 2) )
            IC_dict[word] += IC_for_word_given_topic
    
    IC_vector = np.zeros( len(corpus_vocab_reduced) )
    # e.g. IC_vector = [0, 0, 0, ... FLOAT, 0, 0, ... FLOAT, 0, ... 0]
    
    for word in IC_dict.keys():
        if word in corpus_vocab_reduced:
            i = corpus_vocab_reduced.index(word) # alphabetical index
            IC_vector[i] = IC_dict[word]
    
    return IC_vector

In [17]:
# now do actual IC vector comparison

query_vector = get_IC_vector(query_id)
IC_candidate_vectors = []
doc_ids_for_comparison = ids_for_closest_N_docs_by_topics
IC_comparison_scores = {} # e.g. IC_comparison_score[DOC_ID] = FLOAT

for doc_id in tqdm.tqdm(doc_ids_for_comparison):
    candidate_vector = get_IC_vector(doc_id) 
    IC_candidate_vectors.append(candidate_vector)
    IC_comparison_scores[doc_id] = fastdist.cosine(query_vector, candidate_vector)

sorted_results = sorted(IC_comparison_scores.items(), key=lambda item: item[1], reverse=True)
print(sorted_results[:20])

print("mem_size of IC_candidate_vectors: %s" % f"{ mem_size(IC_candidate_vectors) :,d}")

100%|██████████| 10/10 [00:00<00:00, 22.30it/s]


[('PVin_I,034,i', 0.539807773465966), ('NBh_1049,iii_1050,i', 0.14224065560007612), ('NV_205,20^2', 0.08552828001494603), ('NBh_1046,i_1046,iii', 0.08072412319612125), ('NV_478,04_478,05', 0.0695698144779478), ('NBh_1047,i_1047,ii', 0.06809293420775551), ('NV_473,01_473,02', 0.06443309267239383), ('ViṃśV_93,i_95,i', 0.04147626016279547), ('ViṃśV_87,i_89,ii', 0.035161954673979916), ('TriṃśBh_44,iii_44,vi', 0.032913577453843365)]
mem_size of IC_candidate_vectors: 8,850,392


In [18]:
# 4) compare by SequenceMatcher.ratio score

from difflib import SequenceMatcher
# SequenceMatcher.ratio()
#     returns a similarity score between input strings as a float in [0,1]
#     = 2.0*M / T
#     where M is the number of matches
#     and T is the total number of elements in both sequences
# <https://docs.python.org/3/library/difflib.html#difflib.SequenceMatcher.ratio>

query_fulltext = doc_fulltext[query_id]
SM_ratio_candidate_fulltexts = []
doc_ids_for_comparison = ids_for_closest_N_docs_by_topics
SM_ratio_comparison_scores = {} # e.g. SM_ratio_comparison_scores[DOC_ID] = FLOAT

for doc_id in tqdm.tqdm(doc_ids_for_comparison):
    candidate_fulltext = doc_fulltext[doc_id]
    SM_ratio_candidate_fulltexts.append(candidate_fulltext)
    SM_ratio_comparison_scores[doc_id] = SequenceMatcher(a=query_fulltext, b=candidate_fulltext).ratio()

sorted_results = sorted(SM_ratio_comparison_scores.items(), key=lambda item: item[1], reverse=True)
print(sorted_results[:100])

print("mem_size of SM_ratio_candidate_fulltexts: %s" % f"{ mem_size(SM_ratio_candidate_fulltexts) :,d}")

100%|██████████| 10/10 [00:00<00:00, 463.97it/s]

[('NBh_1047,i_1047,ii', 0.16742493175614195), ('TriṃśBh_44,iii_44,vi', 0.05390835579514825), ('PVin_I,034,i', 0.041189931350114416), ('NBh_1046,i_1046,iii', 0.038525963149078725), ('NV_473,01_473,02', 0.021078735275883446), ('NV_205,20^2', 0.014814814814814815), ('NV_478,04_478,05', 0.010484927916120577), ('ViṃśV_93,i_95,i', 0.010126582278481013), ('NBh_1049,iii_1050,i', 0.009389671361502348), ('ViṃśV_87,i_89,ii', 0.006329113924050633)]
mem_size of SM_ratio_candidate_fulltexts: 11,258





In [None]:
vectors = {
    
}
import json
vector_json_fn = "vectors.json"
with open(vector_json_fn,'w') as f_out:
    json_object = {}
    json_object["vectors"] = accepted_ngrams # updated
    json.dump(json_object, f_out, indent=4, ensure_ascii=False)
                
