In [1]:
import os
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import numpy as np
no_num_clean_p = re.compile(r'[^\w\s]+|\d+', re.UNICODE)

document_folder = "/home/tfink/data/kodicare/longeval/longeval-train-v2/publish/English/Documents/Json"
queries_file = "/home/tfink/data/kodicare/longeval/longeval-train-v2/publish/English/Queries/train.tsv"
qrels_file = "/home/tfink/data/kodicare/longeval/longeval-train-v2/publish/French/Qrels/train.txt"

# triplet generation process
* preprocess text
* create tf idf vectors
    * dictionary
    * tf-idf model
    * convert to vector? (sparse)
* store vectors in faiss (with document id)
* store vectors somewhere else because sparse
* get positive document (PD) vectors 
* get approximate nearest neighbors of PD to create negatives

In [2]:
def document_text_iterator():
    for file in os.listdir(document_folder)[80:100]:
        with open(os.path.join(document_folder, file), "r") as fp:
            data = json.load(fp)
            for doc in data:
                doc_id = doc['id']
                contents = doc['contents']
                yield contents


def document_iterator():
    for file in os.listdir(document_folder)[80:100]:
        with open(os.path.join(document_folder, file), "r") as fp:
            data = json.load(fp)
            for doc in data:
                doc_id = doc['id']
                contents = doc['contents']
                yield doc_id, contents


def get_document_ids():
    doc_ids = []
    doc_ids_inv = {}
    for file in os.listdir(document_folder)[80:100]:
        with open(os.path.join(document_folder, file), "r") as fp:
            data = json.load(fp)
            for doc in data:
                doc_id = doc['id']
                doc_ids_inv[doc_id] = len(doc_ids)
                doc_ids.append(doc_id)
    return doc_ids, doc_ids_inv

In [3]:
def read_qrels():
    relevance_judgements = {}
    with open(qrels_file, "r") as fp:
        for qrel_line in fp:
            q_id, _, doc_id, relevance = qrel_line.strip().split()
            relevance = int(relevance)
            if relevance == 1:
                if q_id not in relevance_judgements:
                    relevance_judgements[q_id] = set()
                relevance_judgements[q_id].add(doc_id)
    return relevance_judgements


def read_queries():
    queries = {}
    with open(queries_file, "r") as fp:
        for queries_line in fp:
            q_id, q_text = queries_line.strip().split(sep="\t")
            queries[q_id] = q_text
    return queries

In [4]:
tfidf_vect = TfidfVectorizer(max_df=0.75, min_df=10)
tfidf_vect.fit(document_text_iterator())

In [5]:
x = tfidf_vect.transform(document_text_iterator())

In [6]:
doc_ids, doc_ids_inv = get_document_ids()
relevance_judgements = read_qrels()

In [81]:
def get_top_k_similar(relevant_doc_vector, doc_vecs, relevant_doc_ids, top_k=10, upper_bound=1.0, lower_bound=0.70):
    # calculate similarity
    sim = cosine_similarity(relevant_doc_vector, doc_vecs)
    # set all known relevant documents to -inf
    relevant_doc_idx = [doc_ids_inv[doc_id] for doc_id in relevant_doc_ids if doc_id in doc_ids_inv]
    sim[0,relevant_doc_idx] = -np.inf
    # set all documents outside the boundaries to -inf
    sim = np.where(np.logical_and(sim <= upper_bound, sim >= lower_bound), sim, -np.inf)
    # get the top k document ids with the highest similarity, filtering out -inf
    top_sims_idx = np.argpartition(sim[0,:], -top_k)[-top_k:]
    sims_with_idx = [(doc_ids[idx], s) for idx, s in zip(top_sims_idx, sim[0,:][top_sims_idx]) 
                     if s != -np.inf]
    return sims_with_idx

In [82]:
triplets = []
for q_id, relevant_docs in relevance_judgements.items():
    for pos_doc_id in relevant_docs:
        if pos_doc_id not in doc_ids_inv:
            continue
        relevant_doc_vector = x[doc_ids_inv[pos_doc_id]]
        sims_with_idx = get_top_k_similar(relevant_doc_vector=relevant_doc_vector, doc_vecs=x, relevant_doc_ids=relevant_docs, lower_bound=0.40)
        sims_with_idx = sorted(sims_with_idx, key=lambda x:x[1], reverse=True)
        for neg_doc_id, s in sims_with_idx:
            triplets.extend(((q_id, pos_doc_id, neg_doc_id), s))

In [83]:
# 9706
len(triplets)

10772

In [84]:
triplets[:10]

[('q06223196', 'doc062200205493', 'doc062201708464'),
 0.5656752642847067,
 ('q06223196', 'doc062200205493', 'doc062200206319'),
 0.4693844337219547,
 ('q062287', 'doc062200209981', 'doc062200201728'),
 0.7384276989851657,
 ('q062287', 'doc062200209981', 'doc062200203904'),
 0.7233468421353527,
 ('q062287', 'doc062200209981', 'doc062200202239'),
 0.7110544130693836]

In [9]:
skipped = 0
for q_id, relevant_docs in relevance_judgements.items():
    sims_with_idx = None
    for rel_doc_id in relevant_docs:
        if rel_doc_id not in doc_ids_inv:
            skipped += 1
            continue
print("skipped", skipped)

skipped 1142


In [87]:
def get_documents():
    documents = {}
    for file in os.listdir(document_folder)[80:100]:
        with open(os.path.join(document_folder, file), "r") as fp:
            data = json.load(fp)
            for doc in data:
                doc_id = doc['id']
                documents[doc_id] = doc['contents']
    return documents

In [93]:
documents = get_documents()
queries = read_queries()

In [98]:
print(queries["q06223196"])
print("===== POS =====")
print([documents['doc062200205493']])
print("===== NEG =====")
print([documents['doc062201708464']])
print([documents['doc062200206319']])

Car shelter
===== POS =====
['Car shelter, campsite, terrace …\nProtec Car\nThis shelter car stable, lands on all sorts of terrain even elevated (No need for building permits)\nNews 04\nMar 2019\nTerrace Shelter\nParatou (Click here to enlarge)\nCar shelter / terrace with deported foot for more space under the shelter.\nShelter, carport...\n28 Fév 2019 LE NOUVEAU CATALOGUE\nCAR 2019\n!! \xa0 The new 2019 brochure is online, Check it out!!\nYou will find all our...\n28 Feb 2019\nGarantie\nProteccar\nFrance\nSTRUCTURE/OSSATURE SPECIFICATIONS:\nStructure:\nHot galvanized steel inside/outside (line... WHO ARE WE?\nSpecialist in car shelters, car camps, terraces, boats and many other applications.\nA simple, practical and economical solution that combines FULLY, SECURITY, CONFORT AND PRESTIGE.\nCONTACT US\nAbris\nProtecCar\nContact:\n03 44 84 82 11 Fax: 03 44 84 48 68 100% French manufacturing (Made in France ) 20 years of experience in the manufacture of shelters No requirement to attach t

In [97]:
print(queries["q062287"])
print("===== POS =====")
print([documents['doc062200209981']])
print("===== NEG =====")
print([documents['doc062200201728']])
print([documents['doc062200203904']])

antivirus comparison
===== POS =====
["Best antivirus 2022 (comparative): Secure PC, Mac and smartphone Share Tweet\nThe years pass and the threat of virus coming from the net continues to exponentially believe in the great lady of users.\nAnti-virus software publishers block thousands of attacks every day: malware, exploit zero\n-day, ransomware, crypto-jacking and other hacks have become commonplace.\nNo one is spared, and this concerns both individuals and the largest companies.\nThe hackers realized that the personal data had a very high value.\nThis is why\nthat they no longer hesitate to take them hostage or steal them in order to monetize their actions.\nIt is then possible to sell this data on the dark net with an increased risk of subsequent hacks.\nIn today’s world, having one of the best antiviruses on the market is essential.\nThis will allow you to be protected at any time, without noticing. World No.1 (500 million customers)\nDetects viruses, malware, phishing\nFastest on