In [1]:
# create virtual server here in notebook with access to all functions and variables
from IR_tools import *

In [2]:
# search settings
priority_texts = period_1_works + period_2_works # pre-NBhū
non_priority_texts = period_3_works + period_4_works # others

N_tf_idf_shallow = int( len(doc_ids) * 0.15)
N_sw_w_shallow = 200

N_tf_idf_deep = int( len(doc_ids) * 1.00)
N_sw_w_deep = 1000

In [3]:
# functions for assessing speed and summarizing results 

from datetime import datetime, date
from time import sleep

def calc_dur(start, end):
    delta = datetime.combine(date.today(), end) - datetime.combine(date.today(), start)
    duration_secs = delta.seconds + delta.microseconds / 1000000
    return duration_secs

def summarize_result(results, label, duration, num_comparisons, display_depth):
    print('{} ({:.3f} s, {} comparisons, {:.6f} s/comparison)'.format(
            label,
            duration, 
            num_comparisons,
            duration/num_comparisons
            )
    )
    for k,v in list(results.items())[:display_depth]:
        print(k, ": ", v)
    print()
    sleep(0.01)


In [4]:
# streamlined version of get_closest_docs() with no HTML rendering, for assessing speed of algorithm parts

def get_closest_docs_2(query_id, search_depth='shallow', prioritze=True, display_depth=10, mode='speed_summary'):
    
    if search_depth=='shallow':
        N_tf_idf = N_tf_idf_shallow
        N_sw_w = N_sw_w_shallow
    elif search_depth=='deep':
        N_tf_idf = N_tf_idf_deep
        N_sw_w = N_sw_w_deep
    
    # rank candidates by topic similarity

    start1 = datetime.now().time()
    all_topic_candidates = rank_all_candidates_by_topic_similarity(
        query_id
        )    
    end1 = datetime.now().time()
    topic_time = calc_dur(start1, end1)
    if mode=='speed_summary': summarize_result(
        results=all_topic_candidates,
        label='topics',
        duration=topic_time,
        num_comparisons=len(doc_ids),
        display_depth=display_depth
    )

    if prioritze:
        # prioritize candidates by text name, discard secondary candidates
            priority_candidate_ids, _ = divide_doc_id_list_by_work_priority(
                list(all_topic_candidates.keys()),
                priority_texts
                )
            priority_topic_candidates = { doc_id: all_topic_candidates[doc_id]
                for doc_id in priority_candidate_ids
                }
    else:
        # don't prioritize
        priority_topic_candidates = all_topic_candidates
        priority_topic_candidate_ids = list(all_topic_candidates.keys())
    
    # limit further computation to only top N_tf_idf of sorted candidates (minus query itself)
    pruned_priority_topic_candidates = { k:v
        for (k,v) in list(priority_topic_candidates.items())[:N_tf_idf-1]
        }

    # further rank candidates by tiny tf-idf
    start2 = datetime.now().time()
    tf_idf_candidates = rank_candidates_by_tiny_TF_IDF_similarity(
        query_id,
        list(pruned_priority_topic_candidates.keys())
        )
    end2 = datetime.now().time()
    tf_idf_time = calc_dur(start2, end2)
    if mode=='speed_summary': summarize_result(
        results=tf_idf_candidates,
        label='tf-idf',
        duration=tf_idf_time,
        num_comparisons=N_tf_idf,
        display_depth=display_depth
    )

    # limit further computation to only top N_sw_w of sorted candidates
    pruned_tf_idf_candidates = { k:v
        for (k,v) in list(tf_idf_candidates.items())[:N_sw_w-1]
        }

    # further rank candidates by sw_w
    start3 = datetime.now().time()
    sw_w_candidates = rank_candidates_by_sw_w_alignment_score(
        query_id,
        list(pruned_tf_idf_candidates.keys())
        )
    end3 = datetime.now().time()
    sw_w_time = calc_dur(start3, end3)

    # do not convert sw_w scores of 0.0 to empty string
    # do not add blank entries for other docs for which sw_w comparison not performed

    if mode=='speed_summary':
        summarize_result(
            results=sw_w_candidates, 
            label='sw_w', 
            duration=sw_w_time, 
            num_comparisons=N_sw_w, 
            display_depth=display_depth
        )
        return

    elif mode=='evaluate_pairs':
        return all_topic_candidates, priority_topic_candidates, tf_idf_candidates, sw_w_candidates


In [5]:
# test single 'speed_summary' run, shallow (no values returned)

get_closest_docs_2('NBhū_142,19', search_depth='shallow', display_depth=10, mode='speed_summary')

topics (0.713 s, 28381 comparisons, 0.000025 s/comparison)
NM_I,497,i :  0.9593123186380321
NBhū_316,25 :  0.9525708583701007
NBṬ_243,i_244,ii :  0.9517903101932932
VyV_I,143,9 :  0.9501246382716453
NM_II,623,ii :  0.9490189928244503
NB_3.126_3.130 :  0.9457072627896023
NBhū_345,16 :  0.9429301905834047
NM_I,570,i :  0.9401085148754629
VyV_II,149,25 :  0.9400783534753556
NyKand_568,ii^1 :  0.9389596636516778

tf-idf (1.463 s, 4257 comparisons, 0.000344 s/comparison)
PVin_I,091,i_I,092,i :  0.4476406323861196
PVSV_010,13_010,15 :  0.4366356177407236
PVin_I,092,ii_I,092,iii :  0.3154629129055202
PVSV_010,19_010,21 :  0.30638191389324454
PSṬ_II,115,ii_II,115,iii :  0.16877487866883265
PVSV_011,05_011,12 :  0.15478327326428523
PVSV_008,16_009,01 :  0.12926378368826538
NBṬ_197,ii_197,iii :  0.1272434754611184
PVA_607,iv_607,vii :  0.12628304328524773
PVin_II,111,ii_II,112,i :  0.12170302311234055

sw_w (0.861 s, 200 comparisons, 0.004305 s/comparison)
PVin_I,091,i_I,092,i :  93.2
PVSV_010,1

In [6]:
# run for 'speed_summary', deep (no values returned)

# get_closest_docs_2('NBhū_142,19', search_depth='deep', display_depth=10, mode='speed_summary')

In [7]:
# identify my NBhū docs

NBhu_doc_ids = [ di for di in doc_ids if parse_complex_doc_id(di)[0] == 'NBhū' ]
print(len(NBhu_doc_ids), NBhu_doc_ids[238], NBhu_doc_ids[377], 377-238+1)

doc_id_full_list = NBhu_doc_ids[238:377+1]
print(len(doc_id_full_list))

1765 NBhū_104,6^1 NBhū_154,15 140
140


In [8]:
# set up doc pairs to evaluate as expected from benchmark

doc_id_suspected_pair_list = [
    ('NBhū_104,6^1', 'PVin_I,034,i'),
    ('NBhū_104,6^2', 'PV_3.148ab_3.150cd'),
    ('NBhū_104,6^2', 'NS_4.2.8_4.2.14'),
    ('NBhū_106,3', 'NS_4.2.23_4.2.28'),
    ('NBhū_106,3', 'NV_487,02_487,04'),
    ('NBhū_106,11_107,1', 'ViṃśV_93,i_95,i'),
    ('NBhū_106,11_107,1', 'PVin_I,035,i_I,036,ii'),
    ('NBhū_106,11_107,1', 'NS_4.2.15_4.2.22'),
    ('NBhū_107,6_108,1', 'PVin_I,039,i_I,039,ii'),
    ('NBhū_108,4_108,6', 'PVin_I,040,i'),
    ('NBhū_108,10', 'PVin_I,041,i'),
    ('NBhū_108,10', 'PV_3.431cd_3.434cd'),
    ('NBhū_109,1', 'PV_3.431cd_3.434cd'),
    ('NBhū_109,7', 'PV_3.329ab_3.332ab'),
    ('NBhū_109,7', 'PVin_I,035,i_I,036,ii'),
    ('NBhū_115,1_115,4', 'PV_3.281ab_3.284ab'),
    ('NBhū_115,18', 'PVA_289,xiv_290,ii'),
    ('NBhū_115,18', 'PVA_290,iii'),
    ('NBhū_115,18', 'PVA_290,iv_290,vi'),
    ('NBhū_116,7', 'NS_4.1.35_4.1.42'),
    ('NBhū_116,7', 'NV_454,17_454,18'),
    ('NBhū_117,3^1', 'PVA_288,vii'),
    ('NBhū_117,3^2', 'PVA_288,vii'),
    ('NBhū_117,3^2', 'PV_3.208cd_3.211ab'),
    ('NBhū_121,2^2', 'PVin_I,046,i_I,046,iii'),
    ('NBhū_124,8^2', 'PV_2.066ab_2.069ab'),
    ('NBhū_125,15', 'ŚV_5,4.250ab_5,4.253ab'),
    ('NBhū_126,6^1', 'NS_4.2.8_4.2.14'),
    ('NBhū_126,6^1', 'NBh_1047,i_1047,ii'),
    ('NBhū_126,6^1', 'NS_2.1.33_2.1.39'),
    ('NBhū_126,6^3', 'NS_2.1.33_2.1.39'),
    ('NBhū_126,6^3', 'NS_4.2.8_4.2.14'),
    ('NBhū_131,11_131,17', 'ViṃśV_93,i_95,i'),
    ('NBhū_132,2', 'NS_4.2.15_4.2.22'),
    ('NBhū_132,11^1', 'TUS_ii,102,i_ii,102,ii'),
    ('NBhū_132,11^1', 'PV_3.386cd_3.389cd'),
    ('NBhū_138,9', 'PVA_353,x'),
    ('NBhū_138,9', 'PVA_353,xi_353,xii'),
    ('NBhū_139,1_139,3', 'PVA_353,xiii_353,xv'),
    ('NBhū_139,1_139,3', 'PVSV_022,06_022,20'),
    ('NBhū_139,1_139,3', 'PVin_I,086,ii^1'),
    ('NBhū_139,26_140,1', 'PV_3.326ab_3.328cd'),
    ('NBhū_140,21', 'PVin_I,046,i_I,046,iii'),
    ('NBhū_142,2', 'PV_3.329ab_3.332ab'),
    ('NBhū_142,12', 'PVA_387,xvii_387,xxii'),
    ('NBhū_142,12', 'PVA_359,iv_359,vi'),
    ('NBhū_142,19', 'PVA_361,xiii_361,xvi'),
    ('NBhū_142,19', 'PVSV_010,13_010,15'),
    ('NBhū_142,19', 'PVSV_010,19_010,21'),
    ('NBhū_142,19', 'PVin_I,091,i_I,092,i'),
    ('NBhū_142,19', 'PVin_I,092,ii_I,092,iii'),
    ('NBhū_144,20^1', 'PVA_361,xiii_361,xvi'),
    ('NBhū_145,15', 'PVA_360,ix'),
    ('NBhū_145,15', 'PVA_360,x'),
    ('NBhū_145,22', 'PVA_360,x'),
    ('NBhū_145,22', 'PVA_360,xi_361,i'),
    ('NBhū_146,7', 'PVA_360,xi_361,i'),
    ('NBhū_146,7', 'PVA_361,ii_361,iii'),
    ('NBhū_146,14_146,18', 'PVA_361,ii_361,iii'),
    ('NBhū_146,14_146,18', 'PVA_361,iv_361,vi'),
    ('NBhū_146,21', 'PVA_361,iv_361,vi'),
    ('NBhū_146,21', 'PVA_361,vii'),
    ('NBhū_147,3_147,6', 'PVA_361,x_361,xii'),
    ('NBhū_149,4_149,16', 'PVA_366,v_366,ix'),
    ('NBhū_149,19', 'PVA_366,iv'),
    ('NBhū_150,1', 'HB_3,1^1'),
    ('NBhū_150,1', 'PV_2.001ab_2.005cd'),
    ('NBhū_150,6^2', 'PVin_II,001,i_II,001,ii'),
    ('NBhū_150,6^2', 'NB_3.1_3.8'),
    ('NBhū_153,4_153,7', 'PVA_356,iv_356,vii'),
    ('NBhū_153,14', 'PVA_358,ix^4')
]
num_expected_pairs = len(doc_id_suspected_pair_list)

# turn into dict of lists
doc_id_suspected_pair_dict = {}
for doc_id_1, doc_id_2 in doc_id_suspected_pair_list:
    if doc_id_1 not in doc_id_suspected_pair_dict:
        doc_id_suspected_pair_dict[doc_id_1] = [doc_id_2]
    else:
        doc_id_suspected_pair_dict[doc_id_1].append(doc_id_2)

In [9]:
# function for describing doc_id's position in ranked results as well as giving absolute score

def format_score_summary(ranking_dict, doc_id):

    if doc_id in ranking_dict:

        ks = list(ranking_dict.keys())
        rank = ks.index(doc_id) + 1
        score = ranking_dict[doc_id]

        return "{} ({:.2f})".format(rank, score)

    else:
        return ""

In [11]:
# compare performance against benchmark
# i.e., loop over get_closest_docs_2() in mode='evaluate_pairs'
# outputs to tsv spreadsheet

from tqdm.notebook import tqdm

pbar = tqdm(total=len(doc_id_full_list))

output_buffer_1 = ""
output_buffer_2 = '\t'.join([
    "doc_id", "num_expected_pairs", "num_pairs_confirmed",
    "num_novel_pairs[50]", "num_novel_pairs[40]", "num_novel_pairs[30]"
    ]) + '\n'
i = 0
rank_threshold = 5 # this determines whether system "CONFIRMS" an expected pair
sw_w_min_threshold = 30 # this determines a "NOVEL PAIR"
sw_w_threshold_for_display = 30
for doc_id_1 in doc_id_full_list:

    # announce new doc_id_1
    
    print()
    print(doc_id_1)
    something_to_say = False # in case totally uneventful, will output "(none)"

    # perform full search of doc_id_1 with get_closest_docs_2() in 'evaluate_pairs' mode

    all_topic_candidates, priority_topic_candidates, tf_idf_candidates, sw_w_candidates = get_closest_docs_2(
        doc_id_1, search_depth='deep', mode='evaluate_pairs'
    )

    # get doc_id_2 pair suspects

    if doc_id_1 in doc_id_suspected_pair_dict:
        doc_id_2_list = doc_id_suspected_pair_dict[doc_id_1]
        num_pairs_confirmed = 0
    else:
        doc_id_2_list = []
        num_pairs_confirmed = ""

    # evaluate each supposed pair
    
    num_expected_pairs = len(doc_id_2_list)
    for doc_id_2 in doc_id_2_list:
        
        something_to_say = True

        # format four ranked scores as "rank (score)"
        topic_all_scores = format_score_summary(all_topic_candidates, doc_id_2)
        topic_priority_scores = format_score_summary(priority_topic_candidates, doc_id_2)
        tf_idf_scores = format_score_summary(tf_idf_candidates, doc_id_2)
        sw_w_scores = format_score_summary(sw_w_candidates, doc_id_2)

        # format results for output to spreadsheet
        output_buffer_1 += '\t'.join([
            str(i),
            doc_id_1,
            doc_id_2,
            topic_all_scores,
            topic_priority_scores,
            tf_idf_scores,
            sw_w_scores,
        ]) + '\n'
        
        # have msg ready to confirm in notebook significant scores for suspected pairs
        if sw_w_scores == "":
            rank = 0
            sw_w_abs_score = 0
        else:
            rank_str, sw_w_abs_score_str = sw_w_scores.split(' ', 1)
            rank = int(rank_str)
            sw_w_abs_score = float(sw_w_abs_score_str[1:-1])
        
        confirmation_msg = (0 < rank <= rank_threshold) * '(CONFIRMED)'
        num_pairs_confirmed += bool(confirmation_msg) * 1

        # output to notebook
        i += 1
        print("pair #{}/{} ({}, {}): {} {} {} {} {}".format(
            i, num_expected_pairs,
            doc_id_1, doc_id_2,
            topic_all_scores,
            topic_priority_scores,
            tf_idf_scores,
            sw_w_scores,
            confirmation_msg
            )
        )

    # also report novel findings
        
    num_novel_pairs = {30: 0, 40: 0, 50: 0}
    for k,v in sw_w_candidates.items():

        if v > sw_w_min_threshold and k not in doc_id_2_list:
            
            something_to_say = True

            for threshold in num_novel_pairs.keys():
                if v > threshold:
                    num_novel_pairs[threshold] += 1

            # format four ranked scores as "rank (score)"
            topic_all_scores = format_score_summary(all_topic_candidates, k)
            topic_priority_scores = format_score_summary(priority_topic_candidates, k)
            tf_idf_scores = format_score_summary(tf_idf_candidates, k)
            sw_w_scores = format_score_summary(sw_w_candidates, k)

            # format results for output to spreadsheet
            output_buffer_1 += '\t'.join([
                "no id",
                doc_id_1,
                k,
                topic_all_scores,
                topic_priority_scores,
                tf_idf_scores,
                sw_w_scores,
            ]) + '\n'

            # output to notebook
            print("NOVEL PAIR ({}, {}): {} {} {} {}".format(
                doc_id_1, k,
                topic_all_scores,
                topic_priority_scores,
                tf_idf_scores,
                sw_w_scores,
                )
            )
        
        # stop once scores too low
        elif (v < sw_w_min_threshold):
            break
    
    if num_expected_pairs > 0:
        print("expected pairs confirmed (@{}): {}/{}".format(rank_threshold, num_pairs_confirmed, num_expected_pairs))
    if num_novel_pairs[sw_w_threshold_for_display] > 0:
        print("novel pairs (@{}): {}".format(sw_w_min_threshold, num_novel_pairs[sw_w_threshold_for_display]))
        
    # give explicit negative output in notebook if there are neither suspected pairs nor novel findings
    if not something_to_say:
        print("(none)")

    # prepare final summary of doc_1 output to second file
    output_buffer_2 += "{}\t{}\t{}\t{}\t{}\t{}".format(
        doc_id_1, num_expected_pairs, num_pairs_confirmed,
        num_novel_pairs[50], num_novel_pairs[40], num_novel_pairs[30]
        ) + '\n'
        
    # update progress bar as last thing
    pbar.update()        

            
# finish up

with open('pairs.tsv','w') as f_out_1:
    f_out_1.write(output_buffer_1)

with open('doc_1_summaries.tsv','w') as f_out_2:
    f_out_2.write(output_buffer_2)
    
pbar.close()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=140.0), HTML(value='')))


NBhū_104,6^1
pair #1/1 (NBhū_104,6^1, PVin_I,034,i): 218 (0.96) 180 (0.96) 1 (0.61) 1 (356.20) (CONFIRMED)
NOVEL PAIR (NBhū_104,6^1, PVA_092,xv_093,ii): 858 (0.75) 585 (0.75) 23 (0.14) 2 (35.20)
NOVEL PAIR (NBhū_104,6^1, PV_2.083ab_2.085cd): 971 (0.72) 649 (0.72) 10 (0.18) 3 (31.60)
expected pairs confirmed (@5): 1/1
novel pairs (@30): 2

NBhū_104,6^2
pair #2/2 (NBhū_104,6^2, PV_3.148ab_3.150cd): 13662 (0.03) 6284 (0.03) 42 (0.11) 4 (24.00) (CONFIRMED)
pair #3/2 (NBhū_104,6^2, NS_4.2.8_4.2.14): 14 (0.98) 11 (0.98) 123 (0.08) 45 (14.60) 
expected pairs confirmed (@5): 1/2

NBhū_106,3
pair #4/2 (NBhū_106,3, NS_4.2.23_4.2.28): 84 (0.92) 65 (0.92) 351 (0.05) 26 (15.00) 
pair #5/2 (NBhū_106,3, NV_487,02_487,04): 122 (0.92) 96 (0.92) 4 (0.21) 1 (31.00) (CONFIRMED)
expected pairs confirmed (@5): 1/2

NBhū_106,11_107,1
pair #6/3 (NBhū_106,11_107,1, ViṃśV_93,i_95,i): 1679 (0.57) 783 (0.57) 15 (0.13) 397 (8.00) 
pair #7/3 (NBhū_106,11_107,1, PVin_I,035,i_I,036,ii): 12087 (0.01) 4899 (0.01) 3 (0

pair #31/2 (NBhū_126,6^3, NS_2.1.33_2.1.39): 685 (0.79) 484 (0.79) 166 (0.05) 6 (32.00) 
pair #32/2 (NBhū_126,6^3, NS_4.2.8_4.2.14): 167 (0.93) 146 (0.93) 15 (0.12) 4 (35.00) (CONFIRMED)
NOVEL PAIR (NBhū_126,6^3, NBh_1054,i_1054,iii): 80 (0.94) 74 (0.94) 7 (0.14) 1 (48.80)
NOVEL PAIR (NBhū_126,6^3, NV_477,04_477,05): 149 (0.93) 132 (0.93) 8 (0.14) 2 (45.60)
NOVEL PAIR (NBhū_126,6^3, NV_227,13_227,14): 601 (0.82) 430 (0.82) 62 (0.07) 3 (36.00)
NOVEL PAIR (NBhū_126,6^3, NBh_0497,i_0497,ii): 311 (0.90) 250 (0.90) 115 (0.06) 5 (32.00)
expected pairs confirmed (@5): 1/2
novel pairs (@30): 4

NBhū_126,6^4
NOVEL PAIR (NBhū_126,6^4, NV_364,17_365,03): 95 (0.96) 78 (0.96) 61 (0.08) 1 (34.40)
novel pairs (@30): 1

NBhū_127,11
(none)

NBhū_127,17
(none)

NBhū_127,26^1
(none)

NBhū_127,26^2
(none)

NBhū_127,26^3
NOVEL PAIR (NBhū_127,26^3, NV_229,16_229,17): 3458 (0.33) 1634 (0.33) 10 (0.13) 1 (34.00)
NOVEL PAIR (NBhū_127,26^3, NBh_0500,ii_0500,iii): 2893 (0.38) 1333 (0.38) 33 (0.11) 2 (34.00)
NOVE

pair #55/2 (NBhū_145,22, PVA_360,x): 3 (0.93) 3 (0.93) 2 (0.22) 2 (129.20) (CONFIRMED)
pair #56/2 (NBhū_145,22, PVA_360,xi_361,i): 297 (0.80) 275 (0.80) 1 (0.40) 1 (187.40) (CONFIRMED)
expected pairs confirmed (@5): 2/2

NBhū_146,7
pair #57/2 (NBhū_146,7, PVA_360,xi_361,i): 1773 (0.52) 929 (0.52) 2 (0.32) 2 (181.20) (CONFIRMED)
pair #58/2 (NBhū_146,7, PVA_361,ii_361,iii): 6 (0.95) 6 (0.95) 1 (0.55) 1 (269.80) (CONFIRMED)
expected pairs confirmed (@5): 2/2

NBhū_146,14_146,18
pair #59/2 (NBhū_146,14_146,18, PVA_361,ii_361,iii): 21 (0.91) 19 (0.91) 2 (0.36) 2 (109.60) (CONFIRMED)
pair #60/2 (NBhū_146,14_146,18, PVA_361,iv_361,vi): 105 (0.85) 95 (0.85) 1 (0.64) 1 (232.60) (CONFIRMED)
expected pairs confirmed (@5): 2/2

NBhū_146,21
pair #61/2 (NBhū_146,21, PVA_361,iv_361,vi): 275 (0.92) 270 (0.92) 3 (0.15) 2 (51.00) (CONFIRMED)
pair #62/2 (NBhū_146,21, PVA_361,vii): 2 (0.99) 2 (0.99) 1 (0.80) 1 (273.80) (CONFIRMED)
expected pairs confirmed (@5): 2/2

NBhū_147,3_147,6
pair #63/1 (NBhū_147,3