This notebook analyzes the search results and the predicted (traversed) path results.

# 1. Search results

1.1 analyze the completeness of the search results

In [None]:
# each query should get 100 results
import os
# change the working directory to the root of the project
# NOTE(review): presumably this is what makes the local `utils` module importable below — confirm.
os.chdir("/home/aiops/zhuty/tinyllama/processing/graphs")
import tqdm
import argparse
from utils import read_trec_results
# Dataset version suffix; only used to build `result_dir` below.
version = "20b"
# Directory holding the per-chunk search result files read by the next cell.
result_dir = f"/home/aiops/zhuty/ret_pretraining_data/redpajama_{version}_id_added/bm25_search_results/"

In [None]:
TEST = False
# Result files to read: the numbered chunks 0..88 plus the two re-run batches.
chunk_names = list(range(0, 89)) + ['search_fail_queries_added'] + ['missing_queries']

# Merge every chunk's results into one dict; later chunks overwrite earlier entries.
result_dict = {}
for i in tqdm.tqdm(chunk_names):
    # In TEST mode only the first 11 numbered chunks (0..10) are loaded.
    if TEST and i > 10:
        break
    result_dict.update(read_trec_results(os.path.join(result_dir, f"chunk_{i}.result.txt")))

In [None]:
# Preview: for the first 11 queries, print their 10 best-ranked documents.
count = 0
for count, (query_id, docs) in enumerate(result_dict.items(), start=1):
    print(f"Query {query_id}:")
    for doc in docs[:10]:
        print(f"  Doc ID: {doc['doc_id']}, Score: {doc['score']}, Rank: {doc['rank']}")
    if count > 10:
        break

#### Testing the completeness of the search results, each query should get 100 results

In [None]:
from collections import Counter

# Completeness check over every query in `result_dict`:
#   * count  - queries whose best-ranked document is the query itself
#   * result_lengths - number of results per query (summarised with a Counter)
#   * problematic_queries - queries that received fewer than 100 results
count = 0
result_lengths = []
problematic_queries = []
for query_id, docs in result_dict.items():
    # `docs and ...`: a query with an empty result list must be reported as
    # problematic below instead of crashing on docs[0].
    if docs and docs[0]['doc_id'] == query_id:
        count += 1
    result_lengths.append(len(docs))
    if len(docs) < 100:
        problematic_queries.append(query_id)

# max(..., 1) keeps the percentages defined (0.00%) when no results were loaded.
num_queries = max(len(result_dict), 1)
print("Number of queries that have itself as the top1 neighbor:", count, f"percentage: {count/num_queries *100:.2f}%")
print(Counter(result_lengths))
print("Number of queries that have less than 100 results:", len(problematic_queries), f"percentage: {len(problematic_queries)/num_queries *100:.2f}%")
problematic_queries[:10]


In [None]:
# Every id that should exist: 89 chunks x 10000 sequences, formatted "<chunk>_<seq>".
sequence_set = {f"{i}_{j}" for i in range(0, 89) for j in range(0, 10000)}

# Expected ids that have no entry at all in the loaded search results.
missing_queries = [query_id for query_id in sequence_set if query_id not in result_dict]
# missing queries is problematic ==> it means that it cannot even search itself ?

In [None]:
# Report the missing ids, then combine them with the under-filled queries.
missing_pct = len(missing_queries)/len(sequence_set) *100
print("Number of queries that are missing:", len(missing_queries), f"percentage: {missing_pct:.2f}%")
print("missing queries:", missing_queries[:10])
newly_added_search_fail_queries = set(problematic_queries).union(missing_queries)
print("Number of newly added search fail queries:", len(newly_added_search_fail_queries))

In [None]:
import json
import os
from itertools import islice


def get_content(query_id, corpus_path):
    """Return the JSON record for `query_id` from the chunked corpus.

    `query_id` has the form "<chunk_id>_<seq_id>"; the record is read from
    line `seq_id` (0-based) of `<corpus_path>/chunk_<chunk_id>.jsonl`.

    The file is streamed up to the requested line instead of being loaded
    completely, so a lookup never holds more than one line in memory.

    Raises:
        IndexError: `seq_id` is negative or beyond the last line of the file.
        AssertionError: the record on that line carries a different "id".
    """
    chunk_id, seq_id = query_id.split("_")
    line_no = int(seq_id)
    if line_no < 0:
        raise IndexError("negative sequence index in query id {!r}".format(query_id))

    jsonl_file = os.path.join(corpus_path, "chunk_{}.jsonl".format(chunk_id))
    with open(jsonl_file, "r") as f:
        # Skip straight to the requested line; None means the file is too short.
        line = next(islice(f, line_no, None), None)
    if line is None:
        raise IndexError("{} has no line {}".format(jsonl_file, line_no))

    data = json.loads(line)
    # Sanity check that the line position really matches the stored id.
    assert data["id"] == query_id, "expected id {!r}, found {!r}".format(query_id, data["id"])
    return data



In [None]:
# Named `corpus_split` rather than `type`: assigning to `type` at notebook level
# shadows the Python builtin for every cell executed afterwards.
corpus_split = "queries/first" # or "train"
# dataset_name = "rpwiki_en"
# dataset_name = "redpajama_2b"
dataset_name = "c4_news"
corpus_path = f"/home/aiops/zhuty/ret_pretraining_data/id_added/{dataset_name}/{corpus_split}"
# Spot-check a single record from the selected corpus split.
get_content("10_0", corpus_path)

In [None]:
# Named `corpus_split` rather than `type`: assigning to `type` at notebook level
# shadows the Python builtin for every cell executed afterwards.
corpus_split = "train" # or "queries/first"
# dataset_name = "redpajama_2b"
# dataset_name = "redpajama_20b"
# dataset_name = "rpwiki_en"
dataset_name = "c4_news"
corpus_path = f"/home/aiops/zhuty/ret_pretraining_data/id_added/{dataset_name}/{corpus_split}"
# Spot-check a single record from the selected corpus split.
get_content("365_10689", corpus_path)

In [None]:
# if the newly added queries are not empty, then we need to rerun the search for them

import json

# Defined before the loop: the output path below needs it even when the loop
# body never runs (previously this raised NameError with no missing queries).
# NOTE(review): this reads from redpajama_2b while the search results above were
# loaded for version "20b" — confirm the two are meant to differ.
base_path = "/home/aiops/zhuty/ret_pretraining_data/redpajama_2b_id_added/queries"

to_write_data = []
not_searched_query_ids = sorted(missing_queries)

# Sorting keeps ids of the same chunk adjacent (they share the "<chunk>_" prefix),
# so caching the most recently read chunk means each file is read only once.
cached_chunk_id, cached_lines = None, []
for docid in tqdm.tqdm(not_searched_query_ids, total=len(not_searched_query_ids)):
    chunk_id, seq_id = docid.split("_")
    if chunk_id != cached_chunk_id:
        jsonl_file = os.path.join(base_path, "chunk_{}.jsonl".format(chunk_id))
        with open(jsonl_file, "r") as f:
            cached_lines = f.readlines()
        cached_chunk_id = chunk_id

    # directly go the line
    data = json.loads(cached_lines[int(seq_id)])
    assert data["id"] == docid

    # Over-long query text is cut down to its last 1500 characters before re-searching.
    if len(data['title']) > 1500:
        print("problematic docid", docid)
        print(data['title'])
        print(len(data['title'].split()))
        data['title'] = data['title'][-1500:]
    to_write_data.append(data)

# write to jsonl (skipped when nothing is missing, so an existing file is left untouched)
if to_write_data:
    with open(os.path.join(base_path, "chunk_missing_queries.jsonl"), "w") as f:
        for data in to_write_data:
            f.write(json.dumps(data) + "\n")
else:
    print("No missing queries; nothing to write.")

In [None]:
# Histogram of the rank-1 score against the rank-2 score of every query.
import matplotlib.pyplot as plt
import numpy as np

# Scores are capped at 300 so a few outliers do not stretch the x-axis.
top_1_scores = [min(300, ranked[0]['score']) for ranked in result_dict.values()] # add min to avoid outliers
others_scores = [min(300, ranked[1]['score']) for ranked in result_dict.values() if len(ranked) > 1]

for scores, label in ((top_1_scores, 'top1'), (others_scores, 'others')):
    plt.hist(scores, bins=100, alpha=0.5, label=label)
plt.legend(loc='upper right')
plt.title("Distribution of the scores of the top1 neighbors, compared to the scores of top2 neighbors")
plt.show()


In [None]:
# Sort the (capped) top-1 scores from high to low, keep at most the first
# 1,000,000 of them, and display the 10 lowest scores within that slice.
sorted(top_1_scores, reverse=True)[:1000000][-10:]

### Section 2: analyze the traversed path results

In [None]:
# NOTE(review): this looks like a pasted log line ("Saving result to ..."); `path_files`
# is not read anywhere else in the visible notebook — confirm it can be dropped.
path_files = 'Saving result to /home/aiops/zhuty/ret_pretraining_data/redpajama_2b_id_added/traversal_paths/result_path_adj_lst_top_100_all_degree_min_degree_selection_20240103_130057.json'
import os
import re
from datetime import datetime
import json

def find_latest_file(directory, prefix, directed):
    """Return the name of the most recent timestamped file in `directory`.

    A file is a candidate when its name starts with `prefix`, contains a
    YYYYMMDD_HHMMSS timestamp, and matches the requested graph kind:
    names containing 'undirected' are used only when `directed` is falsy,
    all other names only when `directed` is truthy.

    Returns the bare file name (not the full path), or None if nothing matches.
    """
    timestamp_re = re.compile(r'(\d{8}_\d{6})')
    want_undirected = not directed

    candidates = []
    for name in os.listdir(directory):
        if ('undirected' in name) != want_undirected:
            continue
        if not name.startswith(prefix):
            continue
        found = timestamp_re.search(name)
        if found:
            candidates.append((name, found.group(1)))

    if not candidates:
        return None

    # Compare by parsed timestamp; the newest candidate wins.
    newest = max(candidates, key=lambda pair: datetime.strptime(pair[1], '%Y%m%d_%H%M%S'))
    return newest[0]
from collections import Counter
import numpy as np
import pandas as pd
def get_path_length_statistics(paths):
    """Print and return summary statistics of the path lengths.

    Args:
        paths: a sequence of paths, each a sized collection of document ids.

    Returns:
        A dict with keys "num_paths", "avg_path_length", "max_path_length",
        "min_path_length" and "std_path_length". For an empty `paths` the four
        length statistics are NaN instead of raising, so the row can still be
        placed in a DataFrame.
    """
    # print the length of the paths
    print("Number of paths:", len(paths))
    path_lengths = [len(path) for path in paths]
    print(Counter(path_lengths))

    # Compute each statistic once and reuse it for both the printout and the result.
    if path_lengths:
        avg_length = sum(path_lengths) / len(path_lengths)
        max_length = max(path_lengths)
        min_length = min(path_lengths)
        std_length = np.std(path_lengths)
    else:
        avg_length = max_length = min_length = std_length = float("nan")

    print("Average path length:", avg_length)
    print("Max path length:", max_length)
    print("Min path length:", min_length)
    print("std path length:", std_length)
    return {
        "num_paths": len(paths),
        "avg_path_length": avg_length,
        "max_path_length": max_length,
        "min_path_length": min_length,
        "std_path_length": std_length,
    }



row_lst = []
paths_dir = "/home/aiops/zhuty/ret_pretraining_data/redpajama_2b_id_added/traversal_paths"
# Sweep every (k, node selection, degree measure, directedness) combination and
# collect the path-length statistics of the latest matching result file.
for k in [1,3,5,10,20,100]:
    for node_selection in ["random", "min_degree", "max_degree"]:
        for degree_measure in ["in" ,"out" ,"all" ]:
            for directed in [True, False]:
                prefix = f"result_path_adj_lst_top_{k}_{degree_measure}_degree_{node_selection}_selection"
                latest_path_file = find_latest_file(directory = paths_dir,
                                                    prefix = prefix,
                                                    directed=directed)
                # find_latest_file returns None when no file matches; skip that
                # combination instead of failing in os.path.join.
                if latest_path_file is None:
                    print("No path file found for prefix:", prefix, "directed:", directed)
                    continue
                print(latest_path_file)
                with open(os.path.join(paths_dir, latest_path_file)) as f:
                    paths = json.load(f)
                stats_dict = get_path_length_statistics(paths)
                stats_dict['name'] = latest_path_file
                row_lst.append(stats_dict)

# create a dataframe
df = pd.DataFrame(row_lst)
print(df)

In [None]:
# List every result file with its average path length, shortest average first.
name_avg_length = sorted(
    zip(df['name'], df['avg_path_length']),
    key=lambda pair: pair[1],
)
for name, avg_length in name_avg_length:
    print(name, avg_length)

In [None]:
# adj_lst = json.load(open("/home/aiops/zhuty/ret_pretraining_data/redpajama_2b_id_added/adj_lists/adj_lst_top_1.json", "r"))
# adj_lst = json.load(open("/home/aiops/zhuty/ret_pretraining_data/redpajama_2b_id_added/adj_lists/adj_lst_top_3.json", "r"))
# adj_lst = json.load(open("/home/aiops/zhuty/ret_pretraining_data/redpajama_2b_id_added/adj_lists/adj_lst_top_5.json", "r"))
# path_file = "/home/aiops/zhuty/ret_pretraining_data/redpajama_2b_id_added/traversal_paths/result_path_adj_lst_top_1_20240103_074215.json"
# path_file = "/home/aiops/zhuty/ret_pretraining_data/redpajama_2b_id_added/traversal_paths/result_path_adj_lst_top_3_20240103_081822.json"
# path_file = "/home/aiops/zhuty/ret_pretraining_data/redpajama_2b_id_added/traversal_paths/result_path_adj_lst_top_5_20240103_082740.json"
# (the top_5 assignment above was immediately overwritten by the one below, so it is kept only as an alternative)
path_file = os.path.join("/home/aiops/zhuty/ret_pretraining_data/id_added/c4_news/traversal_paths/dense_first",
                         "result_path_adj_lists_top10_all_degree_random_selection_undirected_20240117_172931.json")
# Load the selected traversal paths; the context manager closes the file handle.
with open(path_file, "r") as f:
    paths = json.load(f)


In [None]:
from collections import defaultdict

# NOTE(review): no earlier cell assigns `adj_lst` (the loads above are commented
# out); in this notebook it is first loaded further down — run that cell first.

# In-degree of a node = number of adjacency entries that name it as a neighbor.
# Each neighbor entry is indexed with [0] to obtain the neighbor id.
in_degree = defaultdict(int)
for query_id, neighbors in tqdm.tqdm(adj_lst.items()):
    for neighbor in neighbors:
        # defaultdict(int) starts unseen ids at 0, so a plain increment suffices.
        in_degree[neighbor[0]] += 1

In [None]:
# Concatenate all paths, in order, into one flat list of document ids.
flattened_path = [doc_id for path_docs in paths for doc_id in path_docs]
# 890000 == 89 * 10000, the id range enumerated for the completeness check.
assert len(flattened_path) == 890000

In [None]:
# Position of two sample documents in the flattened traversal order.
for sample_doc_id in ('10_1832', '87_1340'):
    print(flattened_path.index(sample_doc_id))

In [None]:
# For 10 consecutive paths, show the path together with its first node's
# adjacency list, in-degree and number of adjacency entries.
start_idx = 23998
for path in paths[start_idx:start_idx+10]:
    head = path[0]
    head_neighbors = adj_lst[head]
    print(path, head_neighbors, in_degree[head], len(head_neighbors))

In [None]:
# Display the first 10 paths.
paths[:10]

In [None]:
# Length of the path at index 6.
len(paths[6])

In [None]:
# Display the last path.
paths[-1]

In [None]:
# Distribution of path lengths: exact counts first, then a 100-bin histogram.
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np

path_lengths = list(map(len, paths))
print(Counter(path_lengths))

plt.hist(path_lengths, bins=100)
plt.show()


#### Testing whether we should merge

In [None]:
import json
# NOTE(review): this rebinds the name `tqdm` from the module (imported at the top
# of the notebook and used there as `tqdm.tqdm(...)`) to the class. Cells below
# call `tqdm(...)`; re-running an earlier `tqdm.tqdm(...)` cell after this one
# is expected to fail — consider using a single import style throughout.
from tqdm import tqdm
import os

In [None]:
# Load the traversal paths that the merge experiment starts from; the context
# manager closes the file handle once the JSON has been parsed.
with open("/home/aiops/zhuty/ret_pretraining_data/id_added/redpajama_2b/traversal_paths/dense/result_path_adj_lst_top_100_all_degree_min_degree_selection_undirected_20240115_132632.json") as f:
    orig_paths = json.load(f)

In [None]:
def read_jsonl_files(jsonl_dir):
    """Build an adjacency dict from every JSONL file directly inside `jsonl_dir`.

    Each non-blank line must be a JSON object with the keys "query_id" and
    "docs"; the result maps query_id -> docs. Files are read in sorted name
    order so that, if a query_id occurs more than once, the surviving entry is
    deterministic (the last one in that order). Sub-directories and blank
    lines are skipped.

    Returns:
        dict mapping each query_id to its "docs" value.
    """
    adj_list = {}
    # List the directory once; os.listdir order is arbitrary, hence the sort.
    candidate_paths = (os.path.join(jsonl_dir, file_name) for file_name in sorted(os.listdir(jsonl_dir)))
    file_paths = [file_path for file_path in candidate_paths if os.path.isfile(file_path)]
    for file_path in tqdm(file_paths):
        with open(file_path) as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                record = json.loads(line)
                adj_list[record['query_id']] = record['docs']
    print("Read", len(adj_list), "queries from", jsonl_dir)
    print("Read {} files from {}".format(len(file_paths), jsonl_dir))
    return adj_list



In [None]:
# adj_lst = read_jsonl_files("/home/aiops/zhuty/ret_pretraining_data/id_added/c4_news/dense_search_results/first/adj_lists")
# Load the top-100 dense adjacency list; the context manager closes the file handle.
with open("/home/aiops/zhuty/ret_pretraining_data/id_added/redpajama_2b/adj_lists/dense/adj_lst_top_100.json") as f:
    adj_lst = json.load(f)

In [None]:
# First 10 adjacency entries stored for document '0_0'.
adj_lst['0_0'][:10]

In [None]:
# Number of keys (documents with an adjacency entry) in adj_lst.
len(adj_lst)

In [None]:
def _first_neighbor_outside_cluster(neighbors, cluster, cluster2docs, doc2cluster):
    """Return (neighbor, neighbor_cluster) for the first neighbor living in another existing cluster, else (None, None)."""
    for neighbor, _score in neighbors:
        # Neighbors without a cluster assignment cannot be used as an anchor.
        if neighbor not in doc2cluster:
            continue
        neighbor_cluster = doc2cluster[neighbor]
        if neighbor_cluster == cluster or neighbor_cluster not in cluster2docs:
            continue
        return neighbor, neighbor_cluster
    return None, None


def merge(cluster2docs, doc2cluster, cluster_size = 21, top_k = 10, adj_list = None):
    """Dissolve every cluster smaller than `cluster_size` into its neighbors' clusters.

    Each document of a small cluster is inserted directly before its first
    neighbor (among its `top_k` adjacency entries) that belongs to a different
    cluster. A document with no adjacency entry, or whose `top_k` neighbors all
    sit in its own cluster, is appended to the catch-all cluster keyed `None`.
    The dissolved cluster is then removed.

    Args:
        cluster2docs: dict cluster id -> list of doc ids. Mutated in place.
        doc2cluster: dict doc id -> cluster id. Mutated in place.
        cluster_size: clusters with fewer documents than this are dissolved.
        top_k: number of adjacency entries inspected per document.
        adj_list: dict doc id -> sequence of (neighbor id, score) pairs.
            Defaults to the notebook-level `adj_lst`.

    Returns:
        The same (cluster2docs, doc2cluster) objects that were passed in.
    """
    if adj_list is None:
        adj_list = adj_lst

    merged_clusters_num = 0
    # Iterate over a shallow copy: keys may be added/removed during the loop, while
    # the list objects are shared, so docs merged into a not-yet-visited small
    # cluster are seen (and re-merged) when that cluster is reached.
    for cluster, cluster_docs in tqdm(cluster2docs.copy().items()):
        # The catch-all `None` cluster is never dissolved: its docs would be
        # appended to the very list that is about to be deleted.
        if cluster is None or len(cluster_docs) >= cluster_size:
            continue
        merged_clusters_num += 1

        # Snapshot the docs so the iteration is unaffected by any list mutation.
        for doc in list(cluster_docs):
            neighbors = adj_list[doc][:top_k] if doc in adj_list else []
            # Yields (None, None) when no out-of-cluster neighbor exists. The old
            # for/break fell through with an in-cluster neighbor instead, which
            # re-inserted the doc into the list being iterated and then deleted it.
            neighbor, neighbor_cluster = _first_neighbor_outside_cluster(
                neighbors, cluster, cluster2docs, doc2cluster)

            # setdefault creates the `None` bucket on demand, so a plain dict
            # works as well as a defaultdict(list).
            target_docs = cluster2docs.setdefault(neighbor_cluster, [])
            if neighbor is None:
                target_docs.append(doc)
            else:
                target_docs.insert(target_docs.index(neighbor), doc)

            # update the cluster
            doc2cluster[doc] = neighbor_cluster
        del cluster2docs[cluster]
    print(merged_clusters_num)
    return cluster2docs, doc2cluster

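Note: at first glance this looks slightly problematic.
Suppose A is initially alone in cluster C21, and cluster C22 contains B, C and D.
If A is merged into C22, one might expect that when the loop later reaches C22 its `cluster_docs` would not include A, so A would be lost when C22 is deleted.
This is not an issue: `cluster2docs.copy()` is a shallow copy, so `cluster_docs` is the same list object that A was inserted into, and A is processed together with the rest of C22.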

In [None]:
from collections import defaultdict

# Every original path becomes one cluster, keyed by its position in orig_paths.
# The path lists themselves are stored (not copied).
cluster2docs = defaultdict(list, enumerate(orig_paths))
# Reverse lookup: doc id -> index of the path that contains it (last one wins).
doc2cluster = {
    doc: cluster_idx
    for cluster_idx, path_docs in enumerate(orig_paths)
    for doc in path_docs
}

In [None]:
# Number of clusters currently in cluster2docs.
print(len(cluster2docs))

In [None]:
# Length of the first original path.
len(orig_paths[0])

In [None]:
from collections import defaultdict
# Scratch check: indexing a defaultdict(list) with the key None creates and
# returns an empty list. merge() indexes cluster2docs with None for documents
# that have no adjacency entry.
a = defaultdict(list)
a[None]

In [None]:
from collections import defaultdict

# merge() mutates and returns the very objects it is given. Hand it copies so that
# cluster2docs / doc2cluster (and the orig_paths lists they share) keep the
# pre-merge state; otherwise the old-vs-new comparisons below compare an object
# with itself and re-running this cell merges already-merged data.
new_cluster2docs, new_doc2cluster = merge(
    defaultdict(list, {cluster: list(docs) for cluster, docs in cluster2docs.items()}),
    dict(doc2cluster),
    cluster_size=21,
)

In [None]:
# Number of documents that have a cluster assignment after merging.
len(new_doc2cluster)

In [None]:
# Size of cluster 0 after merging (0 if it was dissolved). `.get` is used because
# indexing a defaultdict would silently re-create cluster 0 as an empty list,
# which would then be written out as an empty path by the dump below.
len(new_cluster2docs.get(0, []))

In [None]:
# convert to path file: one path per remaining cluster, in dict order
new_paths = list(new_cluster2docs.values())
print(len(new_paths))

# The context manager guarantees the output is flushed and closed.
with open("/home/aiops/zhuty/ret_pretraining_data/id_added/redpajama_2b/traversal_paths/dense/result_path_adj_lst_top_100_all_degree_min_degree_selection_undirected_20240115_132632.json.merged", "w") as f:
    json.dump(new_paths, f)

In [None]:
# Number of clusters remaining after merging.
print(len(new_cluster2docs))

In [None]:
# Size of cluster 0 in cluster2docs (0 if absent). `.get` avoids the defaultdict
# side effect of inserting an empty cluster just by looking at it.
len(cluster2docs.get(0, []))

In [None]:
# Size of cluster 1 after merging (0 if it was dissolved). `.get` avoids the
# defaultdict side effect of inserting an empty cluster, which would distort
# the cluster count and average size computed below.
len(new_cluster2docs.get(1, []))

In [None]:
# Print the new and the old cluster id for the first 1001 documents.
count = 0
for count, (docid, cluster) in enumerate(new_doc2cluster.items(), start=1):
    print("New cluster:", cluster, "Old cluster:", doc2cluster[docid])
    if count > 1000:
        break

In [None]:
# Cluster count and mean cluster size after merging.
cluster_size = [len(docs) for docs in new_cluster2docs.values()]
print("Number of clusters:", len(new_cluster2docs))
print("Average cluster size:", sum(cluster_size) / len(cluster_size))

In [None]:
# Cluster id recorded in doc2cluster for document '10_0'.
doc2cluster['10_0']

In [None]:
# get the distribution of the cluster size
# (`cluster_size` here is the list of per-cluster sizes built in the previous cell)
from collections import Counter
print(Counter(cluster_size))

In [None]:
# Largest cluster size after merging.
max(cluster_size)

In [None]:
import json
import os
os.chdir("/home/aiops/zhuty/tinyllama/processing/graphs")
from utils import get_path_stats


In [None]:
basic_dir = "/home/aiops/zhuty/ret_pretraining_data/id_added/c4_news/traversal_paths/dense_first"
path_file = "result_path_adj_lists_top100_max21_all_degree_random_selection_undirected_20240119_033720.json"
path_file = os.path.join(basic_dir, path_file)
# Load the paths with a context manager so the file handle is closed after parsing.
with open(path_file, "r") as f:
    paths = json.load(f)
# Summarise the loaded paths with the project helper imported from `utils`.
get_path_stats(paths)

Number of paths: 12498589
Average length: 1.0825469979051234 standard deviation: 0.9460474568199976
Maximum length: 21
Number of paths with length 1: 12253662
% of paths with length 1: 98.04036279615242 %
Top 10 paths length: [21, 21, 21, 21, 21, 21, 21, 21, 21, 21]
Bottom 10 paths length: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]