In [6]:
import csv
import gzip
import json
import jsonlines
import os
import pickle
import pandas as pd
from collections import Counter, defaultdict
from tqdm import tqdm

from document_preprocessor import RegexTokenizer
from indexing import Indexer, IndexType
from ranker import Ranker, BM25, PersonalizedBM25
from network_features import NetworkFeatures
from l2r import CrossEncoderScorer, L2RFeatureExtractor, L2RRanker

In [7]:
# Root directories for input data and cached artifacts
DRIVE_PATH = '../data/'
CACHE_PATH = '../cache/'

# Input files, all located under DRIVE_PATH
STOPWORD_PATH = f'{DRIVE_PATH}stopwords.txt'
DATASET_PATH = f'{DRIVE_PATH}wikipedia_200k_dataset.jsonl.gz'
EDGELIST_PATH = f'{DRIVE_PATH}edgelist.csv.gz'
NETWORK_STATS_PATH = f'{DRIVE_PATH}network_stats.csv'
DOC2QUERY_PATH = f'{DRIVE_PATH}doc2query.csv'

# Names for the main-text and title indexes
MAIN_INDEX = 'main_index_augmented'
TITLE_INDEX = 'title_index'

RELEVANCE_TRAIN_DATA = f'{DRIVE_PATH}hw4_relevance.train.csv'
ENCODED_DOCUMENT_EMBEDDINGS_NPY_DATA = (
    f'{DRIVE_PATH}wiki-200k-vecs.msmarco-MiniLM-L12-cos-v5.npy'
)

# Personalization inputs
PERSON_ATTRIBUTES = f'{DRIVE_PATH}person-attributes.csv'
DOCUMENT_IDS = f'{DRIVE_PATH}document-ids.txt'
PERSONALIZATION_PATH = f'{DRIVE_PATH}personalization.jsonl'

In [8]:
# Load the user profiles (one JSON object per line). The file is opened in its
# own `with` because jsonlines.Reader does not close a file object that was
# handed to it.
with open(PERSONALIZATION_PATH, 'r', encoding='utf-8') as personalization_file:
    with jsonlines.Reader(personalization_file) as reader:
        personalization = [line for line in reader]

# Document ids, one per line, in file order. The list is split on '\n' as-is,
# so a trailing newline in the file yields a trailing '' entry.
with open(DOCUMENT_IDS, 'r', encoding='utf-8') as docids_file:
    docids_in_order = docids_file.read().split('\n')

# Seed document ids for the first two profiles
pagerank1_ids = [doc['docid'] for doc in personalization[0]['seed_docs']]
pagerank2_ids = [doc['docid'] for doc in personalization[1]['seed_docs']]

In [9]:
# Compute one network-statistics CSV per user profile, skipping profiles
# whose CSV already exists in the cache directory.

# docid string -> first position in docids_in_order. Gives the same answer as
# list.index() without a linear scan for every seed document.
docid_position = {}
for position, known_docid in enumerate(docids_in_order):
    docid_position.setdefault(known_docid, position)

for person_idx, person in enumerate(personalization):
    stats_path = CACHE_PATH + 'network_stats_ps' + str(person_idx + 1) + '.csv'
    if os.path.exists(stats_path):
        continue

    # Resolve each seed document to its position; unknown documents are
    # reported and left out instead of raising ValueError in the lookup.
    seed_positions = []
    for doc in person['seed_docs']:
        position = docid_position.get(str(doc['docid']))
        if position is None:
            print('doc not found')
            continue
        seed_positions.append(position)

    # Every resolved seed gets the same weight, 1 / (number of resolved seeds).
    # With no resolved seeds the dict is empty and no division happens.
    num_docs = len(seed_positions)
    weights = {position: 1 / num_docs for position in seed_positions}

    nf = NetworkFeatures()
    print('loading network')
    graph = nf.load_network(EDGELIST_PATH, total_edges=92650947)
    print('getting stats')
    # presumably `weights` personalizes the statistics towards the seed
    # documents -- TODO confirm against NetworkFeatures
    net_feats_df = nf.get_all_network_statistics(graph, weights=weights)
    graph = None  # drop this reference to the graph before saving
    print('Saving')
    net_feats_df.to_csv(stats_path, index=False)

# The profiles are not needed by later cells; note that re-running this cell
# afterwards requires re-running the loading cell first.
del personalization

In [10]:
# Read the stopword list: one entry per line, surrounding whitespace removed

stopwords = set()
with open(STOPWORD_PATH, 'r', encoding='utf-8') as file:
    stopwords.update(stopword.strip() for stopword in file)
f'Stopwords collected {len(stopwords)}'

'Stopwords collected 543'

In [11]:
# Get the list of categories for each page (either compute it or load the pre-computed list)
file_path = CACHE_PATH + 'docid_to_categories.pkl'
if not os.path.exists(file_path):
    docid_to_categories = {}
    # One JSON document per line; decode explicitly as UTF-8 rather than
    # relying on the platform default encoding.
    with gzip.open(DATASET_PATH, 'rt', encoding='utf-8') as file:
        for line in tqdm(file, total=200_000):
            document = json.loads(line)
            docid_to_categories[document['docid']] = document['categories']
    # Close the cache file deterministically so the pickle is fully flushed.
    with open(file_path, 'wb') as cache_file:
        pickle.dump(docid_to_categories, cache_file)
else:
    # NOTE: pickle can execute code on load; only read cache files written
    # by this notebook.
    with open(file_path, 'rb') as cache_file:
        docid_to_categories = pickle.load(cache_file)
'Document categories collected'

'Document categories collected'

In [12]:
# Count how often each category occurs and keep the categories that appear at
# least the minimum number of times (specified in the homework)
category_counts = Counter()
for cats in tqdm(docid_to_categories.values(), total=len(docid_to_categories)):
    category_counts.update(cats)
recognized_categories = {
    cat for cat, count in category_counts.items() if count >= 1000}
print("saw %d categories" % len(recognized_categories))

file_path = CACHE_PATH + 'doc_category_info.pkl'
if not os.path.exists(file_path):
    # Map each document to the smaller set of categories that occur frequently
    doc_category_info = {
        docid: [c for c in cats if c in recognized_categories]
        for docid, cats in tqdm(docid_to_categories.items(),
                                total=len(docid_to_categories))
    }
    # Close the cache file deterministically so the pickle is fully flushed.
    with open(file_path, 'wb') as cache_file:
        pickle.dump(doc_category_info, cache_file)
else:
    # NOTE: pickle can execute code on load; only read cache files written
    # by this notebook.
    with open(file_path, 'rb') as cache_file:
        doc_category_info = pickle.load(cache_file)

  0%|          | 0/200000 [00:00<?, ?it/s]

100%|██████████| 200000/200000 [00:00<00:00, 548148.21it/s]


saw 118 categories


In [13]:
# Collect the doc2query rows per document: the first column is used as the
# integer docid and the third column as the text appended to that document's
# list. The first row of the file is a header and is skipped.
doc_augment_dict = defaultdict(list)
with open(DOC2QUERY_PATH, 'r', encoding='utf-8') as file:
    dataset = csv.reader(file)
    next(dataset, None)  # skip the header so the progress total matches the data rows
    for row in tqdm(dataset, total=600_000):
        doc_augment_dict[int(row[0])].append(row[2])

600001it [00:00, 876669.14it/s]                             


In [14]:
# Build inverted indices for the documents' main text and titles
#
# Estimated times:
#    Document text token counting: 4 minutes
#    Document text indexing: 5 minutes
#    Title text indexing: 30 seconds

# Raw string: '\w' in a normal string literal is an invalid escape sequence.
preprocessor = RegexTokenizer(r'\w+')

# Creating and saving the index

# Main-text index, augmented with the doc2query text.
# (50 and 2 are passed positionally to create_index; presumably a minimum
# word-frequency threshold -- TODO confirm against Indexer.create_index)
# main_index_path = CACHE_PATH + MAIN_INDEX
main_index = Indexer.create_index(
    IndexType.InvertedIndex, DATASET_PATH, preprocessor,
    stopwords, 50, doc_augment_dict=doc_augment_dict)
# main_index.save(main_index_path)

# Title index, built from each document's 'title' field.
# title_index_path = CACHE_PATH + TITLE_INDEX
title_index = Indexer.create_index(
    IndexType.InvertedIndex, DATASET_PATH, preprocessor,
    stopwords, 2, text_key='title')
# title_index.save(title_index_path)

Indexing...


100%|██████████| 200000/200000 [02:51<00:00, 1165.13it/s]


Indexing...


100%|██████████| 200000/200000 [00:16<00:00, 12028.06it/s]


In [15]:
# Collect the distinct queries (first column) and distinct doc ids (third
# column, kept as strings) of the training relevance file, in order of first
# appearance. Dict keys give ordered de-duplication without rescanning a list
# for every row.
with open(RELEVANCE_TRAIN_DATA, 'r', encoding='utf-8') as file:
    data = csv.reader(file)
    next(data, None)  # skip the header row
    seen_queries = {}
    seen_docs = {}
    for row in tqdm(data):
        seen_queries.setdefault(row[0], None)
        seen_docs.setdefault(row[2], None)
    train_queries = list(seen_queries)
    train_docs = list(seen_docs)

9605it [00:00, 50647.76it/s]


In [16]:
# Create the raw text dictionary by going through the wiki dataset (or load it
# from the cache). Only documents listed in train_docs are kept, and only the
# first 500 characters of each document's raw text are stored.

file_path = CACHE_PATH + 'raw_text_dict_train.pkl'
if not os.path.exists(file_path):
    raw_text_dict = {}
    # train_docs holds docid strings; a set makes each membership test O(1)
    # instead of a scan of the whole list for every document.
    train_docid_set = set(train_docs)
    # jsonlines.Reader does not close a file object it was handed, so the
    # gzip handle is managed by its own `with`.
    with gzip.open(DATASET_PATH, 'rt', encoding='utf-8') as file:
        with jsonlines.Reader(file) as reader:
            for _ in tqdm(range(200_000)):
                try:
                    data = reader.read()
                except EOFError:
                    # Fewer documents than expected: stop at end of file.
                    break
                if str(data['docid']) in train_docid_set:
                    raw_text_dict[int(data['docid'])] = data['text'][:500]
    with open(file_path, 'wb') as cache_file:
        pickle.dump(raw_text_dict, cache_file)
else:
    # NOTE: pickle can execute code on load; only read cache files written
    # by this notebook.
    with open(file_path, 'rb') as cache_file:
        raw_text_dict = pickle.load(cache_file)

In [17]:
# Get or load the network statistics for the Wikipedia link network.
# network_features maps an integer docid to a dict with the keys 'pagerank',
# 'hub_score' and 'authority_score'.
network_features = {}

if not os.path.exists(NETWORK_STATS_PATH):
    nf = NetworkFeatures()
    print('loading network')
    graph = nf.load_network(EDGELIST_PATH, total_edges=92650947)
    print('getting stats')
    net_feats_df = nf.get_all_network_statistics(graph)
    graph = None  # drop this reference to the graph before saving
    print('Saving')
    net_feats_df.to_csv(NETWORK_STATS_PATH, index=False)

    print("converting to dict format")
    network_features = defaultdict(dict)
    for i, row in tqdm(net_feats_df.iterrows(), total=len(net_feats_df)):
        # iterrows() yields each row as a single Series, which can upcast an
        # integer docid to float; cast back so the keys are ints, as in the
        # CSV-loading branch below.
        docid = int(row['docid'])
        for col in ['pagerank', 'hub_score', 'authority_score']:
            network_features[docid][col] = row[col]
    net_feats_df = None
else:
    with open(NETWORK_STATS_PATH, 'r', encoding='utf-8') as file:
        for idx, line in enumerate(file):
            if idx == 0:
                # header line
                continue
            else:
                # Columns are read by position, so the indexes may change
                # depending on your CSV.
                splits = line.strip().split(',')
                network_features[int(splits[0])] = {
                    'pagerank': float(splits[1]),
                    'authority_score': float(splits[2]),
                    'hub_score': float(splits[3])
                }
f'Network stats collection {len(network_features)}'

'Network stats collection 999841'

In [18]:
def get_network_features(path):
    """Load per-document network statistics from a CSV file.

    The first line is treated as a header and skipped. Columns are read by
    position (docid, pagerank, authority_score, hub_score), so the indexes
    may need adjusting for a CSV with a different column layout. Blank lines
    are ignored.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A defaultdict(dict) mapping each integer docid to a dict with float
        values under the keys 'pagerank', 'authority_score' and 'hub_score'.

    Raises:
        ValueError: If a non-blank row contains a non-numeric field.
        IndexError: If a non-blank row has fewer than four columns.
    """
    network_features = defaultdict(dict)
    # newline='' is the csv module's recommended way to open its input.
    with open(path, 'r', encoding='utf-8', newline='') as file:
        rows = csv.reader(file)
        next(rows, None)  # skip the header line (if the file is not empty)
        for row in rows:
            # A blank or whitespace-only line (e.g. at the end of the file)
            # carries no data and would otherwise fail in int().
            if not row or not row[0].strip():
                continue
            network_features[int(row[0])] = {
                'pagerank': float(row[1]),
                'authority_score': float(row[2]),
                'hub_score': float(row[3])
            }
    return network_features

In [77]:
# Create the feature extractor. This will be used by both pipelines
# The cross-encoder scorer is constructed from the truncated raw document text.
cescorer = CrossEncoderScorer(raw_text_dict)
# The extractor receives both indexes, the per-document category lists, the
# tokenizer and stopwords, the recognized categories, the network statistics
# and the cross-encoder scorer.
fe = L2RFeatureExtractor(main_index, title_index, doc_category_info,
                          preprocessor, stopwords, recognized_categories,
                          network_features, cescorer)

In [79]:
# Base ranker: a BM25 scorer over the main index, wrapped in a Ranker.
bm25 = BM25(main_index)
ranker = Ranker(main_index, preprocessor, stopwords, bm25, raw_text_dict)

# Learning-to-rank pipeline built on that ranker and the feature extractor,
# then trained on the training relevance file.
# NOTE(review): the meaning of the second train() argument ('model') is not
# visible here -- confirm against L2RRanker.train.
pipeline = L2RRanker(main_index, title_index,
                       preprocessor, stopwords, ranker, fe)
pipeline.train(RELEVANCE_TRAIN_DATA, 'model')

Loading training data...
Training model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001250 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2641
[LightGBM] [Info] Number of data points in the train set: 9604, number of used features: 123


In [21]:
# Build one seed-document index per user profile with identical settings.
# The generator is unpacked in order, so the first profile's index is built
# before the second's.
relevant_doc_index_1, relevant_doc_index_2 = (
    Indexer.create_rel_index(
        IndexType.InvertedIndex, DATASET_PATH, preprocessor,
        stopwords, 50, doc_augment_dict=doc_augment_dict,
        rel_ids=seed_docids)
    for seed_docids in (pagerank1_ids, pagerank2_ids)
)

Indexing...


100%|██████████| 200000/200000 [00:15<00:00, 12867.30it/s]


Indexing...


100%|██████████| 200000/200000 [00:15<00:00, 12849.80it/s]


In [72]:
# One PersonalizedBM25 scorer per user profile, each built from the main index
# and that profile's seed-document index.
personalizedBM25_1 = PersonalizedBM25(main_index, relevant_doc_index_1)
personalizedBM25_2 = PersonalizedBM25(main_index, relevant_doc_index_2)

# Loading the per-profile network statistics is currently disabled:
# network_features_1 = get_network_features(CACHE_PATH + 'network_stats_ps1.csv')
# network_features_2 = get_network_features(CACHE_PATH + 'network_stats_ps2.csv')

In [88]:
# Switch the pipeline's ranker to the first profile's personalized scorer.
# This mutates the shared pipeline in place, so later queries depend on which
# scorer was assigned most recently.
pipeline.ranker.scorer = personalizedBM25_1
# pipeline.feature_extractor.network_features = network_features_1

In [78]:
# Example queries to run through the pipeline
queries = ['python', 'java', 'bug']

In [95]:
# Run every query through the pipeline (second argument 10, as before);
# results[i] belongs to queries[i].
results = [pipeline.query(query, 10) for query in queries]

In [96]:
# Print each query followed by its first ten results. Queries and results are
# paired by position: results.index(result) returns the first *equal* entry,
# which labels a result with the wrong query when two queries return
# identical lists.
for query, result in zip(queries, results):
    print(query)
    print(result[:10])

python
[(819149, 0.12848279598091011), (19701, 0.12848279598091011), (17920, 0.12848279598091011), (23862, 0.12838779279166723), (18942, 0.12838779279166723), (3596573, 0.12029291607111543), (5087621, 0.12029291607111543), (794677, 0.12029291607111543), (23372115, 0.11890082165899062), (1441500, 0.09105333267692566)]
java
[(17521476, 0.137569245627738), (24920873, 0.12987489039303493), (417018, 0.12987489039303493), (5712491, 0.12987489039303493), (7771171, 0.12848279598091011), (269441, 0.12848279598091011), (42871, 0.12848279598091011), (4718446, 0.12838779279166723), (561875, 0.12798727130581852), (12540957, 0.12078844074620701)]
bug
[(2047790, 0.12987489039303493), (1028992, 0.12987489039303493), (605702, 0.12987489039303493), (18203680, 0.12987489039303493), (12170044, 0.12987489039303493), (952459, 0.12848279598091011), (273456, 0.12798727130581852), (1012051, 0.12078844074620701), (898290, 0.12078844074620701), (802481, 0.12029291607111543)]


: 

In [94]:
# Document ids (first element of each result entry) of the first ten hits for
# the first query
docids = [item[0] for item in results[0][:10]]
docids[:10]

[23862,
 794677,
 5087621,
 3596573,
 23372115,
 29040606,
 390263,
 1441500,
 2137644,
 7018181]

In [92]:
# Print the docid and title of every dataset document whose docid is in
# `docids`, in dataset order. The gzip file gets its own `with` because
# jsonlines.Reader does not close a file object that was handed to it.
with gzip.open(DATASET_PATH, 'rt', encoding='utf-8') as dataset_file:
    with jsonlines.Reader(dataset_file) as reader:
        for _ in range(200_000):
            try:
                data = reader.read()
            except EOFError:
                # Fewer documents than expected: stop at end of file.
                break
            if data['docid'] in docids:
                print(data['docid'])
                print(data['title'])

23862
Python (programming language)
29040606
Flask (web framework)
23372115
Monty Python's Flying Circus
7018181
Anonymous function
5087621
Python License
3596573
Python Software Foundation License
2137644
Quantum programming
1441500
Web Server Gateway Interface
390263
Jython
794677
Mod python


In [93]:
# Switch the pipeline's ranker to the second profile's personalized scorer.
# NOTE(review): the outputs recorded in this notebook are identical for both
# profiles; the cells were run out of order, so they may be stale -- verify
# after a fresh Restart & Run All that pipeline.query() actually uses
# pipeline.ranker.scorer.
pipeline.ranker.scorer = personalizedBM25_2
# pipeline.feature_extractor.network_features = network_features_2

In [64]:
# Re-run every query with the currently assigned scorer; results[i] belongs
# to queries[i].
results = [pipeline.query(query, 10) for query in queries]

# Pair queries and results by position: results.index(result) returns the
# first *equal* entry and would mislabel queries with identical result lists.
for query, result in zip(queries, results):
    print(query)
    print(result[:10])

python
[(819149, 0.12848279598091011), (19701, 0.12848279598091011), (17920, 0.12848279598091011), (23862, 0.12838779279166723), (18942, 0.12838779279166723), (3596573, 0.12029291607111543), (5087621, 0.12029291607111543), (794677, 0.12029291607111543), (23372115, 0.11890082165899062), (1441500, 0.09105333267692566)]
java
[(17521476, 0.137569245627738), (24920873, 0.12987489039303493), (417018, 0.12987489039303493), (5712491, 0.12987489039303493), (7771171, 0.12848279598091011), (269441, 0.12848279598091011), (42871, 0.12848279598091011), (4718446, 0.12838779279166723), (561875, 0.12798727130581852), (12540957, 0.12078844074620701)]
bug
[(2047790, 0.12987489039303493), (1028992, 0.12987489039303493), (605702, 0.12987489039303493), (18203680, 0.12987489039303493), (12170044, 0.12987489039303493), (952459, 0.12848279598091011), (273456, 0.12798727130581852), (1012051, 0.12078844074620701), (898290, 0.12078844074620701), (802481, 0.12029291607111543)]
