In [None]:
!pip install datasets

In [None]:
!pip install tqdm

In [None]:
from datasets import load_dataset

In [None]:
dataset = load_dataset("pubmed_qa", "pqa_labeled")

In [None]:
dataset

In [None]:
type(dataset)

In [None]:
dataset["train"][0]

In [None]:
len(dataset["train"])

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer

In [None]:
syllable_tokenizer = AutoTokenizer.from_pretrained("vinai/bartpho-syllable")
bartpho_syllable = AutoModel.from_pretrained("vinai/bartpho-syllable")

In [None]:
syllable_tokenizer

In [None]:
bartpho_syllable

In [None]:
TXT = dataset["train"][0]['question']

In [None]:
input_ids = syllable_tokenizer(TXT, return_tensors='pt')['input_ids']
features = bartpho_syllable(input_ids)

In [None]:
features

In [None]:
word_tokenizer = AutoTokenizer.from_pretrained("vinai/bartpho-word")
bartpho_word = AutoModel.from_pretrained("vinai/bartpho-word")

In [None]:
TXT = dataset["train"][0]['question']
input_ids = word_tokenizer(TXT, return_tensors='pt')['input_ids']
features = bartpho_word(input_ids)

In [None]:
features

In [None]:
# Function to create embeddings
def get_embedding(text):
    inputs = word_tokenizer(text, return_tensors="pt")['input_ids']
    with torch.no_grad():
        outputs = bartpho_word(inputs)
    # Use the last hidden state of the first token as the sentence embedding
    embedding = outputs.last_hidden_state[:, 0, :]
    return embedding.squeeze().numpy()

In [None]:
embedding = get_embedding(TXT)

In [None]:
type(embedding)

In [None]:
len(embedding)

In [8]:
from qdrant_client import QdrantClient
from dotenv import load_dotenv
from qdrant_client.models import Distance, VectorParams
from qdrant_client.models import PointStruct
from qdrant_client.models import Filter, FieldCondition, MatchValue
import os

In [9]:
load_dotenv()

qdrant_url = os.getenv('QDRANT_URL')

client = QdrantClient(url=qdrant_url)


In [None]:
dataset["train"][0]

In [None]:
import numpy as np
import tqdm

In [None]:
embeddings = []
for i in  tqdm.tqdm(range(len(dataset["train"]))):
    vector = get_embedding(dataset["train"][0]['long_answer'])
    embeddings.append(vector)

vector = [
    list(map(float, vector))
    for vector in embeddings
]

vector

In [None]:
len(vector)

In [None]:
len(vector[0])  

In [None]:
ids = [entry["pubid"] for entry in dataset["train"]]

In [None]:
len(ids)

In [11]:
from qdrant_client.http import models as rest

In [None]:
client.create_collection(
    collection_name="pubmed_qa",
    vectors_config=VectorParams(size=1024, distance=Distance.DOT),
)

In [None]:
client.upsert(
    collection_name="pubmed_qa", 
    points=rest.Batch(
        ids=ids,
        vectors=vector,
        payloads=list(dataset["train"]),
    )
)

In [None]:
question_response = []
for i in  tqdm.tqdm(range(len(dataset["train"]))):
    vector = get_embedding(dataset["train"][0]['question'])
    question_response.append(vector)

In [None]:
zip(question_response, ids)

In [None]:
# Finding the position at which Qdrant provided the expected answer for each question. 
# That allows to calculate accuracy@k for different values of k.
k_max = 10
answer_positions = []
for embedding, pubid in tqdm.tqdm(zip(question_response, ids)):
    response = client.search(
        collection_name="pubmed_qa",
        query_vector=embedding,
        limit=k_max,
    )

    answer_ids = [record.id for record in response]
    if pubid in answer_ids:
        answer_positions.append(answer_ids.index(pubid))
    else:
        answer_positions.append(-1)

In [None]:
c = 0
for i in answer_positions:
    if i != -1:
        c += 1
c

## bert-large-cased

In [None]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
model = BertModel.from_pretrained("bert-large-cased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

In [None]:
output

In [None]:
# Function to create embeddings
def get_embedding_common(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt")['input_ids']
    with torch.no_grad():
        outputs = model(inputs)
    # Use the last hidden state of the first token as the sentence embedding
    embedding = outputs.last_hidden_state[:, 0, :]
    return embedding.squeeze().numpy()

In [None]:
answer_embeddings = []
for i in  tqdm.tqdm(range(len(dataset["train"]))):
    vector = get_embedding_common(dataset["train"][0]['long_answer'], tokenizer, model)
    answer_embeddings.append(vector)

question_response = []
for i in  tqdm.tqdm(range(len(dataset["train"]))):
    ebd_vt = get_embedding_common(dataset["train"][0]['question'], tokenizer, model)
    question_response.append(ebd_vt)

vector = [
    list(map(float, vector))
    for vector in embeddings
]

In [None]:
from qdrant_client import QdrantClient, models

client.create_collection(
    collection_name="pubmed_qa_bert_cosin",
    vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
)

In [None]:
client.upsert(
    collection_name="pubmed_qa_bert_cosin", 
    points=rest.Batch(
        ids=ids,
        vectors=vector,
        payloads=list(dataset["train"]),
    )
)

In [None]:
# Finding the position at which Qdrant provided the expected answer for each question. 
# That allows to calculate accuracy@k for different values of k.
k_max = 10
answer_positions = []
for embedding, pubid in tqdm.tqdm(zip(question_response, ids)):
    response = client.search(
        collection_name="pubmed_qa_bert_cosin",
        query_vector=embedding,
        limit=k_max,
    )

    answer_ids = [record.id for record in response]
    if pubid in answer_ids:
        answer_positions.append(answer_ids.index(pubid))
    else:
        answer_positions.append(-1)

In [None]:
# Prepared answer positions are being used to calculate different values of accuracy@k
for k in range(1, k_max + 1):
    correct_answers = len(
        list(
            filter(lambda x: 0 <= x < k, answer_positions)
        )
    )
    print(f"accuracy@{k} =", correct_answers / len(dataset["train"]))

## Cohere

In [None]:
!pip install cohere==4.21

In [3]:
import cohere

COHERE_API_KEY = "KqApPE9kQ9TSKjc8OkD0X9W2YEJlCkQBmg1Xb1tv"
cohere_client = cohere.Client(COHERE_API_KEY)

In [4]:
# Generating the embeddings with Cohere client library
embeddings = cohere_client.embed(
    texts=["A test sentence"],
    model="large",
)
vector_size = len(embeddings.embeddings[0])
print(vector_size) # output: 4096

unknown field: parameter compress is not a valid field


4096


In [5]:
answer_response = cohere_client.embed(
    texts=dataset["train"]["long_answer"],
    model="large",
)
vectors = [
    # Conversion to float is required for Qdrant
    list(map(float, vector)) 
    for vector in answer_response.embeddings
]
ids = [entry["pubid"] for entry in dataset["train"]]

unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field


unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field


In [20]:
client.create_collection(
    collection_name="pubmed_qa_cohere_4096",
    vectors_config=VectorParams(size=4096, distance=Distance.COSINE),
)

True

In [36]:
client.upsert(
    collection_name="pubmed_qa_cohere_4096", 
    points=rest.Batch(
        ids=ids[0:100],
        vectors=vectors[0:100],
        payloads=list(dataset["train"])[:100],
    )
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [13]:
question_response = cohere_client.embed(
    texts=dataset["train"]["question"],
    model="large",
)

unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field
unknown field: parameter compress is not a valid field


In [14]:
type(question_response)

cohere.responses.embeddings.Embeddings

In [18]:
len(question_response.embeddings[0])

4096

In [39]:
import tqdm

# Finding the position at which Qdrant provided the expected answer for each question. 
# That allows to calculate accuracy@k for different values of k.
k_max = 10
answer_positions = []
for embedding, pubid in tqdm.tqdm(zip(question_response.embeddings[:100], ids)):
    response = client.search(
        collection_name="pubmed_qa_cohere_4096",
        query_vector=embedding,
        limit=k_max,
    )

    answer_ids = [record.id for record in response]
    if pubid in answer_ids:
        answer_positions.append(answer_ids.index(pubid))
    else:
        answer_positions.append(-1)

100it [00:00, 237.01it/s]


In [40]:
# Prepared answer positions are being used to calculate different values of accuracy@k
for k in range(1, k_max + 1):
    correct_answers = len(
        list(
            filter(lambda x: 0 <= x < k, answer_positions)
        )
    )
    print(f"accuracy@{k} =", correct_answers / len(dataset["train"]))

accuracy@1 = 0.094
accuracy@2 = 0.095
accuracy@3 = 0.095
accuracy@4 = 0.097
accuracy@5 = 0.098
accuracy@6 = 0.098
accuracy@7 = 0.098
accuracy@8 = 0.099
accuracy@9 = 0.099
accuracy@10 = 0.099
