In [1]:
# set working directory
import os
os.chdir("/home/aiops/zhuty/tinyllama")
import json

In [2]:
from processing.graphs.utils import read_trec_results

def calculate_jaccard_at_k(list1, list2, k):
    set1 = set(item for item in list1[:k])
    set2 = set(item for item in list2[:k])
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0

def calculate_precision_at_k(list1, list2, k):
    set1 = set(item for item in list1[:k])
    set2 = set(item for item in list2[:k])
    intersection = len(set1.intersection(set2))
    return intersection / k

def compare_search_results_at_k(list1, list2, ks=[5, 10, 20]):
    results = {}
    for k in ks:
        jaccard_similarity = calculate_jaccard_at_k(list1, list2, k)
        precision_at_k = calculate_precision_at_k(list1, list2, k)
        results[k] = {
            "Jaccard Similarity at k": jaccard_similarity,
            "Precision at k": precision_at_k
        }
    return results

# # Example usage
# list1 = [...]  # Your first list
# list2 = [...]  # Your second list
#
# comparison_metrics = compare_search_results_at_k(list1, list2, ks=[5, 10, 20])
# for k, metrics in comparison_metrics.items():
#     print(f"Metrics at k={k}: {metrics}")

def calculate_average_metrics_at_k(all_results, ks=[5, 10, 20]):
    # Initialize dictionaries to store total metrics
    total_jaccard_at_k = {k: 0 for k in ks}
    total_precision_at_k = {k: 0 for k in ks}

    # Number of queries
    num_queries = len(all_results)

    for list1, list2 in all_results:
        for k in ks:
            jaccard_at_k = calculate_jaccard_at_k(list1, list2, k)
            precision_at_k = calculate_precision_at_k(list1, list2, k)

            total_jaccard_at_k[k] += jaccard_at_k
            total_precision_at_k[k] += precision_at_k

    # Calculate averages
    avg_jaccard_at_k = {k: total_jaccard_at_k[k] / num_queries for k in ks}
    avg_precision_at_k = {k: total_precision_at_k[k] / num_queries for k in ks}

    return avg_jaccard_at_k, avg_precision_at_k

def read_jsonl_adj_lst(file):
    result = {}
    num_lines = 0
    with open(file, 'r') as f:
        for line in f:
            data = json.loads(line)
            result[data['query_id']] =[x[0] for x in  data['docs']]
    return result

# Example usage
# all_results is a list of tuples, each containing two lists (list1, list2) for each query
# all_results = [([...], [...]), ([...], [...]), ...]  # Replace [...] with actual lists for each query
def compare_two_results(path1, path2):

    # new_result = read_trec_results(path1)
    # old_result = read_trec_results(path2)
    new_result = read_jsonl_adj_lst(path1)
    old_result = read_jsonl_adj_lst(path2)
    print("Finished reading results")

    all_results = [(new_result[i], old_result[i]) for i in new_result.keys()]

    k_list = [1, 3, 5, 10, 20, 100]
    avg_jaccard, avg_precision = calculate_average_metrics_at_k(all_results, ks=k_list)

    for k in k_list:
        print(f"Average Jaccard Similarity at k={k}: {avg_jaccard[k]:.4f}")
        print(f"Average Precision at k={k}: {avg_precision[k]:.4f}")


In [3]:
for chunk_num in [0, 1,4,23, 59, 58, 48, 51, 29]:

    # result_paths = [
    #     f"/home/aiops/zhuty/ret_pretraining_data/id_added/cc/dense_search_results/keep/chunk_{chunk_num}.result.txt" , f"/home/aiops/zhuty/ret_pretraining_data/id_added/cc/bm25_search_results/first/chunk_{chunk_num}.result.txt" ,
    # ]
    result_paths = [
        # f"/home/aiops/zhuty/ret_pretraining_data/id_added/cc/dense_search_results/keep/adj_lists/result_{chunk_num}.jsonl" ,
        f"/home/aiops/zhuty/ret_pretraining_data/id_added/cc/bm25_search_results/first/adj_lists/result_{chunk_num}.jsonl" ,
        # f"/home/aiops/zhuty/ret_pretraining_data/id_added/cc/el_search_results/last_120m/adj_lists/result_{chunk_num}.jsonl" ,
        f"/home/aiops/zhuty/ret_pretraining_data/id_added/cc/el_search_results/first/adj_lists/result_{chunk_num}.jsonl" ,
        # f"/home/aiops/zhuty/ret_pretraining_data/id_added/cc/el_search_results/last/adj_lists/result_{chunk_num}.jsonl" ,


    ]
    print(f'Comparing chunk {chunk_num}')
    compare_two_results(result_paths[0], result_paths[1])

Comparing chunk 0
Finished reading results
Average Jaccard Similarity at k=1: 0.4746
Average Precision at k=1: 0.4746
Average Jaccard Similarity at k=3: 0.2548
Average Precision at k=3: 0.3244
Average Jaccard Similarity at k=5: 0.2191
Average Precision at k=5: 0.2881
Average Jaccard Similarity at k=10: 0.1937
Average Precision at k=10: 0.2586
Average Jaccard Similarity at k=20: 0.1810
Average Precision at k=20: 0.2428
Average Jaccard Similarity at k=100: 0.1738
Average Precision at k=100: 0.2337
Comparing chunk 1


FileNotFoundError: [Errno 2] No such file or directory: '/home/aiops/zhuty/ret_pretraining_data/id_added/cc/el_search_results/first/adj_lists/result_1.jsonl'

In [43]:
# calculate the percentage that itself is retrieved in top k
def calculate_self_retrieval_percentage(result_path, k):
    result = read_jsonl_adj_lst(result_path)
    num_queries = len(result)
    num_self_retrieved = 0
    for query_id, retrieved_docs in result.items():
        if query_id in retrieved_docs[:k]:
            num_self_retrieved += 1
    return num_self_retrieved / num_queries


chunk_num = 0
# Example usage

result_path =  f"/home/aiops/zhuty/ret_pretraining_data/id_added/cc/dense_search_results/keep/adj_lists/result_{chunk_num}.jsonl"
# result_path = f"/home/aiops/zhuty/ret_pretraining_data/id_added/cc/bm25_search_results/first/adj_lists/result_{chunk_num}.jsonl"
result_path = f"/home/aiops/zhuty/ret_pretraining_data/id_added/cc/el_search_results/last_120m/adj_lists/result_{chunk_num}.jsonl"

for k in [1, 3, 5, 10, 20, 100]:
    self_retrieval_percentage = calculate_self_retrieval_percentage(result_path, k)
    print(f"Percentage of queries that retrieve themselves in top {k}: {self_retrieval_percentage*100:.2f}%")

Percentage of queries that retrieve themselves in top 3: 9.32%
Percentage of queries that retrieve themselves in top 5: 10.38%
Percentage of queries that retrieve themselves in top 10: 11.83%
Percentage of queries that retrieve themselves in top 20: 13.42%
Percentage of queries that retrieve themselves in top 100: 17.64%
Percentage of queries that retrieve themselves in top 1: 7.07%
Percentage of queries that retrieve themselves in top 3: 9.32%
Percentage of queries that retrieve themselves in top 5: 10.38%
Percentage of queries that retrieve themselves in top 10: 11.83%
Percentage of queries that retrieve themselves in top 20: 13.42%
Percentage of queries that retrieve themselves in top 100: 17.64%


In [None]:
result_paths = [ "/home/aiops/zhuty/ret_pretraining_data/id_added/redpajama_2b/bm25_search_results/chunk_0_gen1k.result.txt" , "/home/aiops/zhuty/ret_pretraining_data/id_added/redpajama_2b/dense_search_results/chunk_0.result.txt" ,
 "/home/aiops/zhuty/ret_pretraining_data/id_added/redpajama_2b/bm25_search_results/chunk_0.result.txt"]

In [12]:
# compare pair-wise
compare_two_results(result_paths[0], result_paths[1])
compare_two_results(result_paths[1], result_paths[2])
compare_two_results(result_paths[0], result_paths[2])

FileNotFoundError: [Errno 2] No such file or directory: '/home/aiops/zhuty/ret_pretraining_data/id_added/cc/dense_search_results/keep/chunk_0.result.txt'

In [9]:
# path1 = "/home/aiops/zhuty/ret_pretraining_data/id_added/c4_news/dense_search_results/flat_search_chunk_10.result.txt"
# path2 = "/home/aiops/zhuty/ret_pretraining_data/id_added/c4_news/dense_search_results/chunk_10.result.txt"
path1 = "/home/aiops/zhuty/ret_pretraining_data/id_added/c4_news/dense_search_results/last/chunk_0.result.txt"
path2 = "/home/aiops/zhuty/ret_pretraining_data/id_added/c4_news/bm25_search_results/chunk_0.result.txt"

### Case Study of Retrieval Results:

In [7]:
import json

In [8]:
def get_doc_contents(doc_id):
    chunk_num,line_num = doc_id.split('_')
    file = f'/home/aiops/zhuty/ret_pretraining_data/id_added/cc/train/chunk_{chunk_num}.jsonl'
    # go to the line_num line
    with open(file, 'r') as f:
        for i, line in enumerate(f):
            if i == int(line_num):
                result = line
    data = json.loads(result)
    return data

In [9]:
doc_id = '0_100000'
get_doc_contents(doc_id)

{'meta': {'redpajama_set_name': 'RedPajamaCommonCrawl'},
 'id': '0_100000',
 'contents': 'Popster Collin Clowes captures the moments that matter in "Polaroid"\nBy Frederica R. Gibney on\t December 10, 2021 Polish Artist\nCanadian singer-songwriter Collin Clowes captures the moments that matter in his evocative new single, "Polaroid" – available now.\nThe new single from the debut EP, Sleepy, the seven song release in 2021 is a collection of heartwarming songs that feature delicate instrumental compositions over meaningful lyrics to create a sound that\'s as familiar as it is new. This musical body is the singular vision of Clowes, who not only plays each instrument, but also works through a painstaking effort of production, mixing and mastering.\nPersistence, along with a steadfast commitment to the music creation process, sets artist Collin Clowes in a league of their own. The production value and rhythmic polish of the EP is more typically associated with high-value teams of musician

In [25]:
search_type = 'dense'
query_type='keep'
search_type, query_type = 'bm25', 'first'
TOP_K=5
query_id = '0_20'

query_doc_id, query_line_id = query_id.split('_')
file = f"/home/aiops/zhuty/ret_pretraining_data/id_added/cc/{search_type}_search_results/{query_type}/adj_lists/result_{query_doc_id}.jsonl"
print("Input documents", get_doc_contents(query_id))
for item in open(file, 'r'):
    line_dict = json.loads(item)
    curr_query_id = line_dict['query_id']
    if curr_query_id != query_id:
        continue
    docs = line_dict['docs']
    for i, (doc, score) in enumerate(docs[:TOP_K]):
        doc_content = get_doc_contents(doc)
        print("#"*20)
        print(f"Retrieved top {i} document ID {doc}, Score: ",score )
        print(repr(doc_content['contents'])[:200], "......")

Input documents {'meta': {'redpajama_set_name': 'RedPajamaCommonCrawl'}, 'id': '0_20', 'contents': 'FIFA Women\'s World Cup 2019 final, USA VS Netherlands: Where to watch it live\nBy Ishan Ghosh\nJuly 7, 2019 18:43 +08\nUSA and Netherlands are going to face each other in the final Twitter\nThe United States team will face Netherlands in the final of the FIFA Women\'s World Cup 2019 on July 7 at the Lon Olympic stadium in France. USA came into the final defeating England 2-1 whereas Netherlands entered the final courtesy a 1-0 win over Sweden.\nPreview, prediction, schedule\nThe three-time world champions USA are coming into the final riding high on a hard fought win against England. They have been in magnificent form throughout the tournament and have not lost a single match. However, their star player and probably the player of the tournament Megan Rapinoe was out of the semi-final clash due to a hamstring injury. But the USA team management have announced that she will be fit for the