In [1]:
# create virtual server here in notebook with access to all functions and variables
from IR_tools import *

In [54]:
# Search configuration, grouped by pipeline stage.

# Corpus split by period: periods 1-2 form the priority texts, periods 3-4 the rest.
non_priority_texts = period_3_works + period_4_works
priority_texts = period_1_works + period_2_works

# N passed to the topic-similarity ranking (whose output feeds the TF-IDF stage):
# 15% of the corpus for a shallow search, the whole corpus for a deep one.
N_tf_idf_deep = int(1.00 * len(doc_ids))
N_tf_idf_shallow = int(0.15 * len(doc_ids))

# N passed to the sw_w alignment ranking for each search depth.
N_sw_w_deep = 400
N_sw_w_shallow = 50

In [55]:
# function for assessing speed

from datetime import datetime, date
from time import sleep

def calc_dur(start, end):
    """Return the number of seconds between two time-of-day values.

    Args:
        start: `datetime.time` marking the beginning of the interval.
        end: `datetime.time` marking the end of the interval.

    Returns:
        float: elapsed seconds, including the fractional microsecond part.

    Both values are pinned to the same calendar day before subtraction.
    Only the `seconds` and `microseconds` fields of the resulting timedelta
    are used (not `days`), so an `end` that is earlier than `start` is
    measured as if it fell on the following day.
    """
    anchor_day = date.today()
    elapsed = datetime.combine(anchor_day, end) - datetime.combine(anchor_day, start)
    whole_secs = elapsed.seconds
    fractional_secs = elapsed.microseconds / 1000000
    return whole_secs + fractional_secs

In [74]:
# function for simulating essential search aspects of get_closest_docs() with focus on assessing speed

def _print_stage_report(label, elapsed_secs, n_comparisons, ranking, display_depth):
    """Print one stage's timing summary followed by its top-ranked entries."""
    # Guard the average: an empty candidate list would otherwise raise
    # ZeroDivisionError and abort the whole timing run.
    if n_comparisons:
        secs_per_comparison = elapsed_secs / n_comparisons
    else:
        secs_per_comparison = float('nan')
    print('{} ({:.3f} s, {} comparisons, {:.6f} s/comparison)'.format(
            label, elapsed_secs, n_comparisons, secs_per_comparison
            )
    )
    for k, v in list(ranking.items())[:display_depth]:
        print(k, ": ", v)
    print()


def get_closest_docs_2(query_id, search_depth='shallow', display_depth=10):
    """Run the three ranking stages for one query and print timing for each.

    The stages are run in sequence, each feeding the next: topic similarity,
    then TF-IDF similarity, then sw_w alignment score. After each stage a
    summary line (elapsed seconds, comparison count, seconds per comparison)
    and the first `display_depth` entries of that stage's ranking are printed.

    Args:
        query_id: id of the query document.
        search_depth: 'shallow' or 'deep'; selects the N_tf_idf_* / N_sw_w_*
            settings used as N for the topic and sw_w rankings.
        display_depth: how many ranked entries to print per stage.

    Returns:
        None. Results are printed only.

    Raises:
        ValueError: if `search_depth` is neither 'shallow' nor 'deep'.
    """
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring short durations; imported locally so this cell is self-contained.
    from time import perf_counter, sleep

    if search_depth == 'shallow':
        N1, N2 = N_tf_idf_shallow, N_sw_w_shallow
    elif search_depth == 'deep':
        N1, N2 = N_tf_idf_deep, N_sw_w_deep
    else:
        # Previously an unknown value left N1/N2 unbound and failed later
        # with an unrelated-looking error.
        raise ValueError(
            "search_depth must be 'shallow' or 'deep', got {!r}".format(search_depth)
        )

    # rank candidates by topic similarity
    stage_start = perf_counter()
    preliminary_N_topic_candidates = rank_N_candidates_by_topic_similarity(query_id, N=N1)
    topic_time = perf_counter() - stage_start
    _print_stage_report(
        'topics', topic_time, len(doc_ids),
        preliminary_N_topic_candidates, display_depth
    )
    # Short pause kept from the original; presumably lets notebook output
    # flush in order before the next stage prints -- TODO confirm.
    sleep(0.01)

    # don't prioritize
    priority_topic_candidates = preliminary_N_topic_candidates
    priority_topic_candidate_ids = list(preliminary_N_topic_candidates.keys())

    # prioritize for further processing (disabled alternative to the two lines above)
#     priority_topic_candidate_ids, secondary_topic_candidate_ids = divide_doc_id_list_by_work_priority(
#         list(preliminary_N_topic_candidates.keys()),
#         priority_texts
#     )
#     priority_topic_candidates =  { doc_id: preliminary_N_topic_candidates[doc_id] 
#                                      for doc_id in priority_topic_candidate_ids
#                                  }
#     secondary_topic_candidates = { doc_id: preliminary_N_topic_candidates[doc_id] 
#                                     for doc_id in secondary_topic_candidate_ids
#                                  }

    # rank candidates by tf-idf similarity
    stage_start = perf_counter()
    tf_idf_candidates = rank_candidates_by_tiny_TF_IDF_similarity(query_id, priority_topic_candidates)
    tf_idf_time = perf_counter() - stage_start
    _print_stage_report(
        'tf-idf', tf_idf_time, len(priority_topic_candidate_ids),
        tf_idf_candidates, display_depth
    )
    sleep(0.01)

    # rank candidates by sw_w alignment score
    tf_idf_candidate_ids = list(tf_idf_candidates.keys())
    stage_start = perf_counter()
    sw_w_candidates = rank_N_candidates_by_sw_w_alignment_score(
        query_id, tf_idf_candidate_ids,
        N = N2
    )
    sw_w_time = perf_counter() - stage_start
    # NOTE(review): the reported count is the requested N2; if fewer
    # candidates reach this stage the real number may be lower -- confirm
    # against rank_N_candidates_by_sw_w_alignment_score.
    _print_stage_report('sw_w', sw_w_time, N2, sw_w_candidates, display_depth)


In [77]:
get_closest_docs_2('NBhū_142,19', search_depth='shallow', display_depth=10)

topics (0.150 s, 21996 comparisons, 0.000007 s/comparison)
NV_294,15_294,16 :  0.993042855183746
NBhū_303,5 :  0.9853636453344283
NBṬ_214,iii_214,iv :  0.9605000968502141
PVA_235,vi_235,vii :  0.9577571536886973
PVin_II,101,ii :  0.9573596537497747
NBhū_212,10 :  0.940371619686815
NBhū_323,15^1 :  0.9392688117169197
NBhū_311,10^1 :  0.938656600230677
HB_4,9 :  0.9288634943426238
NM_II,620,iii_II,620,v :  0.9221844556357494

tf-idf (1.038 s, 3299 comparisons, 0.000315 s/comparison)
PVin_I,091,i_I,092,i :  0.45002730640064087
PVSV_010,13_010,15 :  0.43703819766490937
PVin_I,092,ii_I,092,iii :  0.31472398638108234
PVSV_010,19_010,21 :  0.3061827137895738
PVSV_008,16_009,01 :  0.1290082881136469
NBhū_190,16 :  0.12647319392641787
PVA_607,iv_607,vii :  0.125155587589472
NBṬ_197,ii_197,iii :  0.12388302208574226
PVin_II,111,ii_II,112,i :  0.12327121938505628
PVSV_011,06_011,12 :  0.1111266017311386

sw_w (0.226 s, 50 comparisons, 0.004513 s/comparison)
PVin_I,091,i_I,092,i :  93.2
PVSV_010,1

In [78]:
get_closest_docs_2('NBhū_142,19', search_depth='deep', display_depth=50)

topics (0.145 s, 21996 comparisons, 0.000007 s/comparison)
NV_294,15_294,16 :  0.993042855183746
NBhū_303,5 :  0.9853636453344283
NBṬ_214,iii_214,iv :  0.9605000968502141
PVA_235,vi_235,vii :  0.9577571536886973
PVin_II,101,ii :  0.9573596537497747
NBhū_212,10 :  0.940371619686815
NBhū_323,15^1 :  0.9392688117169197
NBhū_311,10^1 :  0.938656600230677
HB_4,9 :  0.9288634943426238
NM_II,620,iii_II,620,v :  0.9221844556357494
NM_II,630,iv_II,630,v :  0.9155056559016219
VyV_II,151,15^1 :  0.9154931557966699
NBhū_228,5^1 :  0.9148913737645905
NBhū_227,7^2 :  0.9142426635324465
NM_I,578,ii_I,579,i :  0.9050520603703426
NBhū_334,8 :  0.8987973434987502
NM_II,625,i_II,625,ii :  0.8937945719517516
VyV_II,209,8_II,209,10 :  0.8915378545241482
PVin_I,093,i :  0.8913449615287311
NM_II,575,i :  0.88963418660733
NBhū_510,24^1 :  0.8881183418541806
NB_3.92_3.96 :  0.8880199554598937
NM_II,623,iv :  0.8874002213859872
NBṬ_189,iv_190,i :  0.8785347117190527
NBhū_307,2^1 :  0.8726543884690531
PVSV_164,1

In [24]:
# more set up specifically for evaluating pairs

def format_score_summary(ranking_dict, doc_id):
    """Summarise where `doc_id` sits in a ranking as "rank (score)".

    Args:
        ranking_dict: mapping of doc id -> score; its key order is taken
            as the ranking order.
        doc_id: the document to look up.

    Returns:
        str: "<1-based rank> (<score to two decimals>)" when `doc_id` is a
        key of `ranking_dict`, with an empty-string score shown as 0.00;
        the literal "0 (0)" when it is absent.
    """
    # Absent documents get a fixed placeholder.
    if doc_id not in ranking_dict:
        return "0 (0)"

    ordered_ids = list(ranking_dict.keys())
    position = ordered_ids.index(doc_id) + 1

    raw_score = ranking_dict[doc_id]
    # An empty string stands in for "no score"; show it as zero.
    shown_score = 0.0 if raw_score == "" else raw_score
    return "{} ({:.2f})".format(position, shown_score)

In [None]:
# Evaluate each (doc_id_1, doc_id_2) pair: search around doc_id_1 with the
# three-stage pipeline, print the strongest sw_w alignments, and record where
# doc_id_2 landed in each stage's ranking as one TSV row.
#
# NOTE(review): doc_id_pair_list, topic_toggle_value, doc_fulltext and
# output_buffer are not defined in the cells visible here; they are assumed
# to be set up earlier -- TODO confirm. Re-running this cell appends to
# output_buffer again.

TOPIC_SHORTLIST_FRACTION = 0.15   # share of corpus kept when topic filtering is on
SW_W_MAX_CANDIDATES = 1000        # N passed to the sw_w alignment ranking
SW_W_DISPLAY_DEPTH = 25           # how many top sw_w results are inspected for printing
SW_W_DISPLAY_THRESHOLD = 30       # only alignment scores above this are printed

for i, (doc_id_1, doc_id_2) in enumerate(doc_id_pair_list):

    # center search on doc_id_1

    # rank candidates by topic similarity
    start = datetime.now().time()
    if topic_toggle_value:
        N = int( len(doc_ids) * TOPIC_SHORTLIST_FRACTION)
    else:
        N = len(doc_ids) # i.e., do not discard any of ranked list
    preliminary_N_topic_candidates = rank_N_candidates_by_topic_similarity(doc_id_1, N)

    # prioritize for further processing
    priority_topic_candidate_ids, secondary_topic_candidate_ids = divide_doc_id_list_by_work_priority(
        list(preliminary_N_topic_candidates.keys()),
        priority_texts
    )
    priority_topic_candidates =  { doc_id: preliminary_N_topic_candidates[doc_id]
                                     for doc_id in priority_topic_candidate_ids
                                 }
    secondary_topic_candidates = { doc_id: preliminary_N_topic_candidates[doc_id]
                                    for doc_id in secondary_topic_candidate_ids
                                 }
    end = datetime.now().time(); topic_time = calc_dur(start, end)

    # rank candidates by tf-idf similarity
    start = datetime.now().time()
    tf_idf_candidates = rank_candidates_by_tiny_TF_IDF_similarity(doc_id_1, priority_topic_candidates)
    end = datetime.now().time(); tf_idf_time = calc_dur(start, end)

    # rank candidates by sw_w alignment score
    start = datetime.now().time()
    sw_w_candidates = rank_N_candidates_by_sw_w_alignment_score(
        doc_id_1, list(tf_idf_candidates.keys()),
        N = SW_W_MAX_CANDIDATES
    )
    end = datetime.now().time(); sw_w_time = calc_dur(start, end)

    # optional debug output: top of each stage's ranking
#         print(doc_id_1, "vs. ", doc_id_2)
#         print('topic')
#         for k,v in list(priority_topic_candidates.items())[:25]:
#             print(k, ": ", v)
#         print('tf-idf')
#         for k,v in list(tf_idf_candidates.items())[:25]:
#             print(k, ": ", v)
#         print('sw_w')
#         for k,v in list(sw_w_candidates.items())[:25]:
#             print(k, ": ", v)
#         print()

    print("pair #{}/{} ({}, {})".format(i+1, len(doc_id_pair_list), doc_id_1, doc_id_2))
#         print("topic_time: ", topic_time)
#         print("tf_idf_time: ", tf_idf_time)
#         print("sw_w_time: ", sw_w_time)

    # Collect the strong alignments first so "(none)" is printed only when
    # there really are none. (A for/else runs its else branch whenever the
    # loop finishes without `break`, i.e. always here.)
    # String scores are skipped: format_score_summary treats "" as a possible
    # placeholder score, and comparing a str with an int raises TypeError.
    strong_alignments = [
        (k, v) for k, v in list(sw_w_candidates.items())[:SW_W_DISPLAY_DEPTH]
        if not isinstance(v, str) and v > SW_W_DISPLAY_THRESHOLD
    ]
    if strong_alignments:
        for k, v in strong_alignments:
            print(k, ": ", v)
    else:
        print("(none)")
    print()

    topic_score = format_score_summary(priority_topic_candidates, doc_id_2)
    tf_idf_score = format_score_summary(tf_idf_candidates, doc_id_2)
    sw_w_align_score = format_score_summary(sw_w_candidates, doc_id_2)

    output_buffer += '\t'.join([
        '',
        doc_id_1, doc_fulltext[doc_id_1],
        doc_id_2, doc_fulltext[doc_id_2],
        '',
        topic_score, tf_idf_score, sw_w_align_score,
    ]) + '\n'

# Doc ids (and presumably the full texts) contain non-ASCII characters, so
# pin the encoding instead of relying on the platform default.
with open('multi_IR_run.tsv', 'w', encoding='utf-8') as f_out:
    f_out.write(output_buffer)

In [8]:


# Time one full-corpus topic-similarity ranking for a single query document
# and report the total and per-candidate cost.
query_id = "NBhū_119,19"
N = len(doc_ids)   # N = corpus size, i.e. do not discard any of the ranked list

from datetime import datetime, date   # no longer used here; kept in case other cells rely on these names
from time import perf_counter         # monotonic clock meant for measuring short durations

starting_time = perf_counter()
preliminary_N_topic_candidates = rank_N_candidates_by_topic_similarity(query_id, N)
ending_time = perf_counter()
duration_secs = ending_time - starting_time

n_compared = len(preliminary_N_topic_candidates)
# Guard the average so an empty result reports 'nan' instead of raising ZeroDivisionError.
secs_per_candidate = duration_secs / n_compared if n_compared else float('nan')
print("comparing %d candidates using K-dimensional topic vector took %.5f secs (%.7f secs / candidate)"
      % (n_compared, duration_secs, secs_per_candidate)
     )


comparing 21995 candidates using K-dimensional topic vector took 0.14913 secs (0.0000068 secs / candidate)
