# Natural Language Processing - Information Retrieval

### Ziv Attias


In [3]:
import os

from transformers import AutoTokenizer, TFAutoModel

# Load the multilingual DistilBERT tokenizer and encoder from a local
# checkpoint directory (path is relative to the working directory).

LOCAL_DISTILBERT = os.path.join(
    "model",
    "information-retrieval",
    "distilbert-base-multilingual-cased",
)

tokenizer, model = (
    AutoTokenizer.from_pretrained(LOCAL_DISTILBERT),
    TFAutoModel.from_pretrained(LOCAL_DISTILBERT),
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [4]:
import tensorflow as tf


def compute_embedding(text):
    """Embed text with the notebook's DistilBERT model using masked mean pooling.

    Args:
        text: Input passed straight to the tokenizer (a single string in this
            notebook; a list of strings is tokenized as one padded batch).
            Inputs longer than 512 tokens are truncated.

    Returns:
        A NumPy array with one row per input text, each row being the average
        of the model's last-layer token vectors for that text.
    """
    encoded_input = tokenizer(
        text, return_tensors="tf", padding=True, truncation=True, max_length=512
    )
    outputs = model(**encoded_input)
    token_embeddings = outputs.last_hidden_state

    # Weight every position by its attention mask so that pad tokens (present
    # when a batch holds texts of different lengths) do not dilute the mean.
    # For a single text the mask is all ones and this equals a plain mean.
    mask = tf.cast(encoded_input["attention_mask"], token_embeddings.dtype)
    mask = tf.expand_dims(mask, axis=-1)

    summed = tf.reduce_sum(token_embeddings * mask, axis=1)
    # Guard against division by zero should a row ever have no real tokens.
    token_counts = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)

    embeddings = summed / token_counts
    return embeddings.numpy()

In [35]:
from datasets import load_dataset

# Read one parquet shard of the Wikipedia embeddings dataset from disk and
# preview it as a pandas DataFrame.

BASE_DATASET_PATH = os.path.join("data", "wikipedia-22-12-en-embeddings")

ds = load_dataset(
    path="parquet",
    data_files=os.path.join(
        BASE_DATASET_PATH,
        "train-00000-of-00253-8d3dffb4e6ef0304.parquet",
    ),
)

df = ds["train"].to_pandas()
df.head()

Unnamed: 0,id,title,text,url,wiki_id,views,paragraph_id,langs,emb
0,0,Deaths in 2022,The following notable deaths occurred in 2022....,https://en.wikipedia.org/wiki?curid=69407798,69407798,5674.449219,0,38,"[0.28656968, -0.031816833, 0.06668472, 0.03292..."
1,1,YouTube,YouTube is a global online video sharing and s...,https://en.wikipedia.org/wiki?curid=3524766,3524766,5409.561035,0,184,"[-0.09689382, 0.16192119, -0.097957894, 0.1022..."
2,2,YouTube,"In October 2006, YouTube was bought by Google ...",https://en.wikipedia.org/wiki?curid=3524766,3524766,5409.561035,1,184,"[0.13020493, 0.26573685, 0.40181553, -0.407551..."
3,3,YouTube,"Since its purchase by Google, YouTube has expa...",https://en.wikipedia.org/wiki?curid=3524766,3524766,5409.561035,2,184,"[-0.09791257, 0.13586107, -0.015414367, -0.140..."
4,4,YouTube,YouTube has had an unprecedented social impact...,https://en.wikipedia.org/wiki?curid=3524766,3524766,5409.561035,3,184,"[-0.2641527, 0.069682166, -0.14558455, 0.37028..."


In [36]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm


# ========Exercise 3.1 ===========
# Fill in the following code
# ===============================
def find_most_relevant_article(query_embedding, dataset, max_num_of_articles=None):
    """Linearly scan ``dataset["train"]`` for the article closest to a query.

    Every scanned article's ``"text"`` is embedded with ``compute_embedding``
    and compared to ``query_embedding`` by cosine similarity.

    Args:
        query_embedding: 2-D array holding the query embedding in its first
            row (as returned by ``compute_embedding``).
        dataset: Mapping with a ``"train"`` split whose records expose
            ``"text"`` and ``"id"`` fields.
        max_num_of_articles: Scan at most this many articles from the start of
            the split. ``None`` (the default) scans the whole split; ``0``
            scans nothing.

    Returns:
        A tuple ``(text, id, similarity)`` for the best-matching article, or
        ``(None, None, -1)`` when no article was scanned.
    """
    most_relevant_article_text = None
    most_relevant_article_id = None
    max_similarity = -1
    found_any = False

    for i, article in tqdm(enumerate(dataset["train"])):
        # Compare against None explicitly: a plain truthiness test would treat
        # a limit of 0 as "no limit" and scan the entire split.
        if max_num_of_articles is not None and i >= max_num_of_articles:
            break

        # Compute the embedding for the current article
        article_embedding = compute_embedding(article["text"])

        # Calculate cosine similarity
        similarity = cosine_similarity(query_embedding, article_embedding)[0][0]

        # Always accept the first candidate so a score equal to the -1
        # sentinel is not silently discarded; afterwards keep the strict max.
        if not found_any or similarity > max_similarity:
            found_any = True
            max_similarity = similarity
            most_relevant_article_text = article["text"]
            most_relevant_article_id = article["id"]

    return most_relevant_article_text, most_relevant_article_id, max_similarity


# ========End Exercise 3.1 ===========

In [37]:
# Example input string
input_text = "Deep Learning"

# Compute the embedding for the input text
input_embedding = compute_embedding(input_text)
print(input_embedding.shape)

# Find the most relevant article.
# To reduce the runtime, look at only the first N articles.
# The result is bound to `article_id` (not `id`) so the builtin id() is not
# shadowed for the rest of the kernel session.
article, article_id, similarity = find_most_relevant_article(
    input_embedding, ds, 1000
)
print("Most Relevant Article:", article)
print("Article ID:", article_id)
print("Similarity Score:", similarity)

(1, 768)


1000it [01:55,  8.65it/s]

Most Relevant Article: SharePoint, a web collaboration platform codenamed as Office Server, has integration and compatibility with Office 2003 and so on.
Article ID: 860
Similarity Score: 0.56057405





In [38]:
# Run the retrieval for several sample queries and report the best match.
queries = ["Leonardo DiCaprio", "France", "Python", "Deep Learning"]

for query in queries:
    print(f"Query: {query}")
    query_embedding = compute_embedding(query)
    print(f"Embedding computed successfully. Embedding: {query_embedding}")

    # NOTE: find_most_relevant_article embeds each scanned article on every
    # call, so the encoding pass over the first 1000 articles is repeated for
    # each query.
    # `article_id` is used instead of `id` to avoid shadowing the builtin id().
    article, article_id, similarity = find_most_relevant_article(
        query_embedding, ds, 1000
    )

    print(f"Most Relevant Article: {article}")
    print("Article ID:", article_id)
    print(f"Similarity Score: {similarity}", end="\n\n")

Query: Leonardo DiCaprio
Embedding computed successfully. Embedding: [[-9.59519967e-02  7.45836049e-02  6.77049756e-01  2.40724474e-01
   4.61874962e-01 -3.45661908e-01 -3.03949028e-01 -6.71341345e-02
  -1.73854023e-01  2.93704838e-01  4.64068115e-01  4.69575554e-01
  -8.87735561e-02 -3.38958979e-01 -2.81628937e-01 -2.21938938e-01
   9.94865075e-02  4.78033453e-01  2.39752159e-01  1.17325187e-01
  -3.02006751e-01 -3.74641418e-02 -9.69213620e-02  2.47045770e-01
   1.96393400e-01  3.63059729e-01 -4.63838398e-01  3.68772775e-01
   3.19366902e-01 -2.10952535e-01  3.88392985e-01  5.25062494e-02
  -3.84739749e-02  2.54368216e-01 -1.06025815e-01 -2.52615273e-01
  -3.20317835e-01  5.48730671e-01 -2.69383758e-01 -8.64390843e-03
  -2.50041746e-02 -5.83599389e-01 -1.54916167e-01 -1.05179735e-01
  -5.13592780e-01 -7.06235832e-03 -2.79554039e-01 -1.91702455e-01
  -4.23847735e-02 -3.01972777e-01 -3.37759793e-01  7.29808658e-02
   1.31848589e-01 -4.39206690e-01 -6.96096420e-01  5.26870549e-01
   3.24

1000it [02:06,  7.89it/s]


Most Relevant Article: Elizabeth was portrayed in a variety of media by many notable artists, including painters Pietro Annigoni, Peter Blake, Chinwe Chukwuogo-Roy, Terence Cuneo, Lucian Freud, Rolf Harris, Damien Hirst, Juliet Pannett and Tai-Shan Schierenberg. Notable photographers of Elizabeth included Cecil Beaton, Yousuf Karsh, Anwar Hussein, Annie Leibovitz, Lord Lichfield, Terry O'Neill, John Swannell and Dorothy Wilding. The first official portrait photograph of Elizabeth was taken by Marcus Adams in 1926.
Article ID: 430
Similarity Score: 0.536819338798523

Query: France
Embedding computed successfully. Embedding: [[ 4.60443906e-02  3.09329685e-02  7.71312416e-01  6.71014711e-02
   6.37139529e-02 -1.39061799e-02 -1.99432686e-01  3.49264652e-01
  -1.52028814e-01  1.44522175e-01  2.24077851e-02 -8.19406379e-03
  -1.91598728e-01  1.43503383e-01 -1.39692739e-01 -3.53699714e-01
  -1.82728410e-01  5.69853187e-01 -3.86621624e-01  3.89882118e-01
   8.43310952e-02  2.42281277e-02 -1.29

1000it [02:06,  7.93it/s]


Most Relevant Article: In May 2022, FIFA announced the list of 36 referees, 69 assistant referees, and 24 video assistant referees for the tournament. Of the 36 referees, FIFA included two each from Argentina, Brazil, England, and France.
Article ID: 620
Similarity Score: 0.36249446868896484

Query: Python
Embedding computed successfully. Embedding: [[ 3.83164883e-01 -4.75612879e-01  6.08251750e-01  4.60513204e-01
  -1.78723216e-01 -3.77886258e-02  1.23519808e-01  2.12811545e-01
  -1.15663528e-01 -1.45645335e-01  1.63524702e-01 -2.11859524e-01
   4.69711833e-02 -2.09794287e-02 -1.71504244e-01 -2.75040865e-01
  -1.14274472e-01  1.95351258e-01 -4.77621317e-01  4.16695565e-01
   2.92886019e-01 -8.98475051e-02 -1.66130528e-01 -2.81221718e-01
   5.24618149e-01 -6.46402299e-01 -5.13503551e-01  3.39638501e-01
   1.20092995e-01  6.20646894e-01  3.74220282e-01  4.81431969e-02
  -1.23465806e-03  3.08863014e-01  4.03591543e-02  2.53654599e-01
  -4.98392969e-01  1.48067608e-01 -2.37882614e-01 -7.5

1000it [02:06,  7.91it/s]


Most Relevant Article: SharePoint, a web collaboration platform codenamed as Office Server, has integration and compatibility with Office 2003 and so on.
Article ID: 860
Similarity Score: 0.5572224259376526

Query: Deep Learning
Embedding computed successfully. Embedding: [[ 1.87335446e-01 -2.07780629e-01  6.53357983e-01  2.46475823e-02
  -1.97819591e-01 -1.86817311e-02 -2.79832035e-02  1.37425661e-01
   1.67555109e-01 -2.03151122e-01  1.94314823e-01  1.39369890e-01
   1.72604695e-01 -1.40898190e-02  1.25575081e-01 -1.58772990e-01
  -4.35108662e-01  4.21629816e-01  7.01710731e-02  6.39143109e-01
  -1.17417708e-01  2.80907843e-02 -3.52639735e-01  7.20019452e-03
   3.30647379e-01 -4.04433072e-01 -4.50802505e-01  5.02763391e-01
   1.28057703e-01  1.48254782e-01  4.07883108e-01  2.17219323e-01
  -4.10110503e-02 -2.60563314e-01  1.92807436e-01  2.74912477e-01
  -3.40661556e-01 -3.71794283e-01 -1.40324950e-01 -2.75714755e-01
   5.17266512e-01 -3.67767513e-01  2.53014475e-01  6.11164887e-03
 

1000it [02:06,  7.89it/s]

Most Relevant Article: SharePoint, a web collaboration platform codenamed as Office Server, has integration and compatibility with Office 2003 and so on.
Article ID: 860
Similarity Score: 0.5605740547180176




