In [1]:
import torch
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer, util
from evaluate import load
from huggingface_hub import login

In [2]:
login(token=os.getenv('HF_TOKEN'))

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
wikipedia_text_corpus = load_dataset('rag-datasets/rag-mini-wikipedia', 'text-corpus')
wikipedia_question_answer = load_dataset('rag-datasets/rag-mini-wikipedia', 'question-answer')

README.md:   0%|          | 0.00/719 [00:00<?, ?B/s]

data/passages.parquet/part.0.parquet:   0%|          | 0.00/797k [00:00<?, ?B/s]

Generating passages split:   0%|          | 0/3200 [00:00<?, ? examples/s]

data/test.parquet/part.0.parquet:   0%|          | 0.00/54.4k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/918 [00:00<?, ? examples/s]

In [4]:
documents = wikipedia_text_corpus['passages']
passages = [doc['passage'] for doc in documents]

In [5]:
embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')
document_embeddings = embeddings_model.encode(passages, batch_size=64, show_progress_bar=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/50 [00:00<?, ?it/s]

In [6]:
model_name = 'meta-llama/Llama-2-7b-hf'
bitsandbytes_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16,
                                         bnb_4bit_quant_type='nf4')

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             quantization_config=bitsandbytes_config,
                                             device_map='cuda:0')

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [8]:
bleu = load('bleu')
bertscore = load('bertscore')

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [9]:
# Експеримент 1: RAG со 5 документи
sample_qa = wikipedia_question_answer['test'][0]
question = sample_qa['question']
question

'Was Abraham Lincoln the sixteenth President of the United States?'

In [10]:
answer = sample_qa['answer']
answer

'yes'

In [11]:
question_embedding = embeddings_model.encode(question, batch_size=64, show_progress_bar=True)
context_results = util.semantic_search(question_embedding, document_embeddings, top_k=5)
context_results

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[[{'corpus_id': 288, 'score': 0.7095187902450562},
  {'corpus_id': 278, 'score': 0.5840359330177307},
  {'corpus_id': 697, 'score': 0.5568779706954956},
  {'corpus_id': 2227, 'score': 0.5566985011100769},
  {'corpus_id': 319, 'score': 0.5500737428665161}]]

In [12]:
doc_ids = [c['corpus_id'] for c in context_results[0]]
doc_ids

[288, 278, 697, 2227, 319]

In [13]:
docs = [passages[d] for d in doc_ids]
docs

['Young Abraham Lincoln',
 'Abraham Lincoln (February 12, 1809 â\x80\x93 April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion of slavery in the United States, "[I]n his short autobiography written for the 1860 presidential campaign, Lincoln would describe his protest in the Illinois legislature as one that \'briefly defined his position on the slavery question, and so far as it goes, it was then the same that it is now." This was in reference to the anti-expansion sentiments he had then expressed. Doris Kearns Goodwin, Team of Rivals: The Political Genius of Abraham Lincoln (2005) p. 91.  Holzer pg. 232.  Writing of the Cooper Union  speech, Holzer notes, "Cooper Union proved a unique confluence of political culture, rhetorical opportunity, technological innovation, and human genius, and it brought Abraham Lincoln to the center stage of American politics at precisely the right tim

In [14]:
context = f'Context:\nDocument 1: {docs[0]}\nDocument 2: {docs[1]}\nDocument 3: {docs[2]}\nDocument 4: {docs[3]}\nDocument 5: {docs[4]}'
context

'Context:\nDocument 1: Young Abraham Lincoln\nDocument 2: Abraham Lincoln (February 12, 1809 â\x80\x93 April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion of slavery in the United States, "[I]n his short autobiography written for the 1860 presidential campaign, Lincoln would describe his protest in the Illinois legislature as one that \'briefly defined his position on the slavery question, and so far as it goes, it was then the same that it is now." This was in reference to the anti-expansion sentiments he had then expressed. Doris Kearns Goodwin, Team of Rivals: The Political Genius of Abraham Lincoln (2005) p. 91.  Holzer pg. 232.  Writing of the Cooper Union  speech, Holzer notes, "Cooper Union proved a unique confluence of political culture, rhetorical opportunity, technological innovation, and human genius, and it brought Abraham Lincoln to the center stage of American polit

In [15]:
prompt = f'{context}\n\nAnswer the following question: {question}\n\nAnswer: '
prompt

'Context:\nDocument 1: Young Abraham Lincoln\nDocument 2: Abraham Lincoln (February 12, 1809 â\x80\x93 April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion of slavery in the United States, "[I]n his short autobiography written for the 1860 presidential campaign, Lincoln would describe his protest in the Illinois legislature as one that \'briefly defined his position on the slavery question, and so far as it goes, it was then the same that it is now." This was in reference to the anti-expansion sentiments he had then expressed. Doris Kearns Goodwin, Team of Rivals: The Political Genius of Abraham Lincoln (2005) p. 91.  Holzer pg. 232.  Writing of the Cooper Union  speech, Holzer notes, "Cooper Union proved a unique confluence of political culture, rhetorical opportunity, technological innovation, and human genius, and it brought Abraham Lincoln to the center stage of American polit

In [16]:
tokens = tokenizer(prompt, return_tensors='pt').to('cuda:0')
tokens

{'input_ids': tensor([[    1, 15228, 29901,    13,  6268, 29871, 29896, 29901, 10443, 24763,
         17274,    13,  6268, 29871, 29906, 29901, 24763, 17274,   313, 29943,
          3205,   653, 29871, 29896, 29906, 29892, 29871, 29896, 29947, 29900,
         29929, 19406, 30751, 30344,  3786, 29871, 29896, 29945, 29892, 29871,
         29896, 29947, 29953, 29945, 29897,   471,   278,  4832, 19839,  7178,
           310,   278,  3303,  3900, 29892, 16330,   515,  4779, 29871, 29946,
         29892, 29871, 29896, 29947, 29953, 29896,  2745,   670, 20105,  3381,
         29889,  1094,   385,   714,  1028,  4476, 23995,   296,   310,   278,
         13184,   310,  8370,  1201,   297,   278,  3303,  3900, 29892, 14704,
         29902, 29962, 29876,   670,  3273,  1120, 15647,  5275,  3971,   363,
           278, 29871, 29896, 29947, 29953, 29900,  6673,   616, 11531, 29892,
         17274,   723,  8453,   670, 10021,   297,   278, 17066, 13332,  1535,
           408,   697,   393,   525,  

In [17]:
output_ids = model.generate(tokens.input_ids, max_new_tokens=50)
output_ids

tensor([[    1, 15228, 29901,    13,  6268, 29871, 29896, 29901, 10443, 24763,
         17274,    13,  6268, 29871, 29906, 29901, 24763, 17274,   313, 29943,
          3205,   653, 29871, 29896, 29906, 29892, 29871, 29896, 29947, 29900,
         29929, 19406, 30751, 30344,  3786, 29871, 29896, 29945, 29892, 29871,
         29896, 29947, 29953, 29945, 29897,   471,   278,  4832, 19839,  7178,
           310,   278,  3303,  3900, 29892, 16330,   515,  4779, 29871, 29946,
         29892, 29871, 29896, 29947, 29953, 29896,  2745,   670, 20105,  3381,
         29889,  1094,   385,   714,  1028,  4476, 23995,   296,   310,   278,
         13184,   310,  8370,  1201,   297,   278,  3303,  3900, 29892, 14704,
         29902, 29962, 29876,   670,  3273,  1120, 15647,  5275,  3971,   363,
           278, 29871, 29896, 29947, 29953, 29900,  6673,   616, 11531, 29892,
         17274,   723,  8453,   670, 10021,   297,   278, 17066, 13332,  1535,
           408,   697,   393,   525,  1182,  2575,  

In [18]:
tokenizer.decode(output_ids[0], skip_special_tokens=True)

'Context:\nDocument 1: Young Abraham Lincoln\nDocument 2: Abraham Lincoln (February 12, 1809 â\x80\x93 April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion of slavery in the United States, "[I]n his short autobiography written for the 1860 presidential campaign, Lincoln would describe his protest in the Illinois legislature as one that \'briefly defined his position on the slavery question, and so far as it goes, it was then the same that it is now." This was in reference to the anti-expansion sentiments he had then expressed. Doris Kearns Goodwin, Team of Rivals: The Political Genius of Abraham Lincoln (2005) p. 91.  Holzer pg. 232.  Writing of the Cooper Union  speech, Holzer notes, "Cooper Union proved a unique confluence of political culture, rhetorical opportunity, technological innovation, and human genius, and it brought Abraham Lincoln to the center stage of American polit

In [19]:
predictions_5 = []
references_5 = []

In [20]:
for i in range(10):
    qa = wikipedia_question_answer['test'][i]
    question = qa['question']
    answer = qa['answer']

    question_embedding = embeddings_model.encode(question)
    context_results = util.semantic_search(question_embedding, document_embeddings, top_k=5)
    doc_ids = [c['corpus_id'] for c in context_results[0]]
    docs = [passages[d] for d in doc_ids]
    context = f'Context:\nDocument 1: {docs[0]}\nDocument 2: {docs[1]}\nDocument 3: {docs[2]}\nDocument 4: {docs[3]}\nDocument 5: {docs[4]}'

    prompt = f'{context}\n\nAnswer the following question: {question}\n\nAnswer: '
    tokens = tokenizer(prompt, return_tensors='pt').to('cuda:0')
    output_ids = model.generate(tokens.input_ids, max_new_tokens=50)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    if 'Answer:' in output:
        pred = output.split('Answer:')[-1].strip()
    else:
        pred = output.split('\n')[-1].strip()

    predictions_5.append(pred)
    references_5.append(answer)

In [21]:
bleu.compute(predictions=predictions_5, references=[[ref] for ref in references_5])

{'bleu': 0.016003152801946127,
 'precisions': [0.033707865168539325,
  0.01937984496124031,
  0.012048192771084338,
  0.008333333333333333],
 'brevity_penalty': 1.0,
 'length_ratio': 9.88888888888889,
 'translation_length': 267,
 'reference_length': 27}

In [22]:
bertscore.compute(predictions=predictions_5, references=references_5, model_type='microsoft/deberta-xlarge-mnli')

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.04G [00:00<?, ?B/s]



{'precision': [0.29502129554748535,
  0.2819629907608032,
  0.29926958680152893,
  0.3322839140892029,
  0.316226065158844,
  0.0,
  0.4748987555503845,
  0.4729628562927246,
  0.19764584302902222,
  0.2928624451160431],
 'recall': [0.48487550020217896,
  0.4733036458492279,
  0.5308688282966614,
  0.7164347171783447,
  0.6733449101448059,
  0.0,
  0.7308385372161865,
  0.5957132577896118,
  0.49220478534698486,
  0.5988884568214417],
 'f1': [0.36683982610702515,
  0.3533959984779358,
  0.3827624022960663,
  0.45400112867355347,
  0.4303464889526367,
  0.0,
  0.5757046341896057,
  0.5272883772850037,
  0.28203853964805603,
  0.3933653235435486],
 'hashcode': 'microsoft/deberta-xlarge-mnli_L40_no-idf_version=0.3.12(hug_trans=4.57.3)'}

In [23]:
# Експеримент 2: RAG со 3 документи
sample_qa = wikipedia_question_answer['test'][5]
question = sample_qa['question']
question

'What did The Legal Tender Act of 1862 establish?'

In [24]:
question_embedding = embeddings_model.encode(question, batch_size=64, show_progress_bar=True)
context_results = util.semantic_search(question_embedding, document_embeddings, top_k=3)
doc_ids = [c['corpus_id'] for c in context_results[0]]
docs = [passages[d] for d in doc_ids]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [25]:
context = f'Context:\nDocument 1: {docs[0]}\nDocument 2: {docs[1]}\nDocument 3: {docs[2]}'
context

"Context:\nDocument 1: The Legal Tender Act of 1862 established the United States Note, the first paper currency in United States history.  This was done to increase the money supply to pay for fighting the war.\nDocument 2: The Panic of 1873 hit the country hard during his presidency, and he never attempted decisive action, one way or the other, to alleviate distress. The first law that he signed, in March 1869, established the value of the greenback currency issued during the Civil War, pledging to redeem the bills in gold. In 1874, he vetoed a bill to increase the amount of a legal tender currency, which defused the currency crisis on Wall Street but did little to help the economy as a whole. The depression led to Democratic victories in the 1874 off-year elections, as that party took control of the House for the first time since 1856.\nDocument 3: Lincoln believed in the Whig theory of the presidency, which left Congress to write the laws while he signed them, vetoing only those bi

In [26]:
predictions_3 = []
references_3 = []

In [27]:
for i in range(10):
    qa = wikipedia_question_answer['test'][i]
    question = qa['question']
    answer = qa['answer']

    question_embedding = embeddings_model.encode(question)
    context_results = util.semantic_search(question_embedding, document_embeddings, top_k=3)
    doc_ids = [c['corpus_id'] for c in context_results[0]]
    docs = [passages[d] for d in doc_ids]
    context = f'Context:\nDocument 1: {docs[0]}\nDocument 2: {docs[1]}\nDocument 3: {docs[2]}'

    prompt = f'{context}\n\nAnswer the following question: {question}\n\nAnswer: '
    tokens = tokenizer(prompt, return_tensors='pt').to('cuda:0')
    output_ids = model.generate(tokens.input_ids, max_new_tokens=50)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    if 'Answer:' in output:
        pred = output.split('Answer:')[-1].strip()
    else:
        pred = output.split('\n')[-1].strip()

    predictions_3.append(pred)
    references_3.append(answer)

In [28]:
bleu.compute(predictions=predictions_3, references=[[ref] for ref in references_3])

{'bleu': 0.04761350742353424,
 'precisions': [0.05970149253731343,
  0.04923076923076923,
  0.044444444444444446,
  0.03934426229508197],
 'brevity_penalty': 1.0,
 'length_ratio': 12.407407407407407,
 'translation_length': 335,
 'reference_length': 27}

In [29]:
bertscore.compute(predictions=predictions_3, references=references_3, model_type='microsoft/deberta-xlarge-mnli')

{'precision': [0.30287081003189087,
  0.28886860609054565,
  0.2067977637052536,
  0.2350398153066635,
  0.29240137338638306,
  0.5698068737983704,
  0.4748987853527069,
  0.3632277846336365,
  0.3104514479637146,
  0.2769851088523865],
 'recall': [0.4824071526527405,
  0.48484158515930176,
  0.2889077365398407,
  0.29268333315849304,
  0.7306355237960815,
  0.8801860213279724,
  0.7308386564254761,
  0.6891894340515137,
  0.47645998001098633,
  0.6212334036827087],
 'f1': [0.37211549282073975,
  0.3620360791683197,
  0.2410523146390915,
  0.2607133686542511,
  0.4176561236381531,
  0.6917772889137268,
  0.5757047533988953,
  0.4757290780544281,
  0.37594494223594666,
  0.38314151763916016],
 'hashcode': 'microsoft/deberta-xlarge-mnli_L40_no-idf_version=0.3.12(hug_trans=4.57.3)'}

In [30]:
# Експеримент 3: RAG со 1 документ
sample_qa = wikipedia_question_answer['test'][10]
question = sample_qa['question']
question

'Did Lincoln start his political career in 1832?'

In [31]:
question_embedding = embeddings_model.encode(question, batch_size=64, show_progress_bar=True)
context_results = util.semantic_search(question_embedding, document_embeddings, top_k=1)
doc_ids = [c['corpus_id'] for c in context_results[0]]
docs = [passages[d] for d in doc_ids]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [32]:
context = f'Context:\nDocument 1: {docs[0]}'
context

'Context:\nDocument 1: Lincoln began his political career in 1832, at age 23, with an unsuccessful campaign for the Illinois General Assembly, as a member of the Whig Party.  He ran eighth in a field of 13 candidates.  The centerpiece of his platform was the undertaking of navigational improvements on the Sangamon River. He believed that this would attract steamboat traffic, which would allow the sparsely populated, poorer areas along the river to flourish.'

In [33]:
predictions_1 = []
references_1 = []

In [34]:
for i in range(10):
    qa = wikipedia_question_answer['test'][i]
    question = qa['question']
    answer = qa['answer']

    question_embedding = embeddings_model.encode(question)
    context_results = util.semantic_search(question_embedding, document_embeddings, top_k=1)
    doc_ids = [c['corpus_id'] for c in context_results[0]]
    docs = [passages[d] for d in doc_ids]
    context = f'Context:\nDocument 1: {docs[0]}'

    prompt = f'{context}\n\nAnswer the following question: {question}\n\nAnswer: '
    tokens = tokenizer(prompt, return_tensors='pt').to('cuda:0')
    output_ids = model.generate(tokens.input_ids, max_new_tokens=50)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    if 'Answer:' in output:
        pred = output.split('Answer:')[-1].strip()
    else:
        pred = output.split('\n')[-1].strip()

    predictions_1.append(pred)
    references_1.append(answer)

In [35]:
bleu.compute(predictions=predictions_1, references=[[ref] for ref in references_1])

{'bleu': 0.04901445173556202,
 'precisions': [0.06296296296296296, 0.05, 0.044, 0.041666666666666664],
 'brevity_penalty': 1.0,
 'length_ratio': 10.0,
 'translation_length': 270,
 'reference_length': 27}

In [36]:
bertscore.compute(predictions=predictions_1, references=references_1, model_type='microsoft/deberta-xlarge-mnli')

{'precision': [0.3446699380874634,
  0.2870997488498688,
  0.31690746545791626,
  0.29597997665405273,
  0.3881893455982208,
  0.5301364660263062,
  0.38583457469940186,
  0.3071010708808899,
  0.2960708737373352,
  0.33442503213882446],
 'recall': [0.48071399331092834,
  0.49126267433166504,
  0.6894477605819702,
  0.7938142418861389,
  0.7737314701080322,
  0.8446286916732788,
  0.4865265190601349,
  0.7003804445266724,
  0.47819072008132935,
  0.6183140873908997],
 'f1': [0.4014802575111389,
  0.3624054491519928,
  0.43422266840934753,
  0.43118807673454285,
  0.5169962048530579,
  0.6514108777046204,
  0.43036937713623047,
  0.4269806742668152,
  0.36571192741394043,
  0.43407413363456726],
 'hashcode': 'microsoft/deberta-xlarge-mnli_L40_no-idf_version=0.3.12(hug_trans=4.57.3)'}

In [37]:
# Експеримент 4: Друг embedding модел (all-distilroberta-v1)
embeddings_model_distil = SentenceTransformer('all-distilroberta-v1')
document_embeddings_distil = embeddings_model_distil.encode(passages, batch_size=64, show_progress_bar=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/50 [00:00<?, ?it/s]

In [38]:
sample_qa = wikipedia_question_answer['test'][8]
question = sample_qa['question']
question

'Did Lincoln beat John C. Breckinridge in the 1860 election?'

In [39]:
question_embedding = embeddings_model_distil.encode(question, batch_size=64, show_progress_bar=True)
context_results = util.semantic_search(question_embedding, document_embeddings_distil, top_k=5)
doc_ids = [c['corpus_id'] for c in context_results[0]]
docs = [passages[d] for d in doc_ids]
docs

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

['On November 6, 1860, Lincoln was elected as the 16th President of the United States, beating Democrat Stephen A. Douglas, John C. Breckinridge of the Southern Democrats, and John Bell of the new Constitutional Union Party. He was the first Republican president, winning entirely on the strength of his support in the North: he was not even on the ballot in nine states in the South, and won only 2 of 996 counties in the other Southern states. Lincoln gained 1,865,908 votes (39.9% of the total), for 180 electoral votes; Douglas, 1,380,202 (29.5%) for 12 electoral votes; Breckenridge, 848,019 (18.1%) for 72 electoral votes; and Bell, 590,901 (12.5%) for 39 electoral votes. There were fusion tickets in some states, but even if his opponents had combined in every state, Lincoln had a majority vote in all but two of the states in which he won the electoral votes and would still have won the electoral college and the election.',
 'Abraham Lincoln (February 12, 1809 â\x80\x93 April 15, 1865) w

In [40]:
predictions_distil = []
references_distil = []

In [41]:
for i in range(10):
    qa = wikipedia_question_answer['test'][i]
    question = qa['question']
    answer = qa['answer']

    question_embedding = embeddings_model_distil.encode(question)
    context_results = util.semantic_search(question_embedding, document_embeddings_distil, top_k=5)
    doc_ids = [c['corpus_id'] for c in context_results[0]]
    docs = [passages[d] for d in doc_ids]
    context = f'Context:\nDocument 1: {docs[0]}\nDocument 2: {docs[1]}\nDocument 3: {docs[2]}\nDocument 4: {docs[3]}\nDocument 5: {docs[4]}'

    prompt = f'{context}\n\nAnswer the following question: {question}\n\nAnswer: '
    tokens = tokenizer(prompt, return_tensors='pt').to('cuda:0')
    output_ids = model.generate(tokens.input_ids, max_new_tokens=50)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    if 'Answer:' in output:
        pred = output.split('Answer:')[-1].strip()
    else:
        pred = output.split('\n')[-1].strip()

    predictions_distil.append(pred)
    references_distil.append(answer)

In [42]:
bleu.compute(predictions=predictions_distil, references=[[ref] for ref in references_distil])

{'bleu': 0.06142341002964031,
 'precisions': [0.08058608058608059,
  0.06463878326996197,
  0.05533596837944664,
  0.04938271604938271],
 'brevity_penalty': 1.0,
 'length_ratio': 10.11111111111111,
 'translation_length': 273,
 'reference_length': 27}

In [43]:
bertscore.compute(predictions=predictions_distil, references=references_distil, model_type='microsoft/deberta-xlarge-mnli')

{'precision': [0.2843165397644043,
  0.26719722151756287,
  0.21631431579589844,
  0.46412596106529236,
  0.3906598687171936,
  0.5875048041343689,
  0.46162205934524536,
  0.3667731285095215,
  0.3161858916282654,
  0.2939101755619049],
 'recall': [0.47856056690216064,
  0.5074849724769592,
  0.37042075395584106,
  0.6686205863952637,
  0.3487814664840698,
  0.8585176467895508,
  0.8656978607177734,
  0.7151029109954834,
  0.49759846925735474,
  0.5469262003898621],
 'f1': [0.3567093014717102,
  0.350075364112854,
  0.2731294631958008,
  0.5479145646095276,
  0.36853477358818054,
  0.6976146697998047,
  0.6021536588668823,
  0.48486244678497314,
  0.3866715133190155,
  0.3823506534099579],
 'hashcode': 'microsoft/deberta-xlarge-mnli_L40_no-idf_version=0.3.12(hug_trans=4.57.3)'}

In [44]:
# Експеримент 5: Zero-shot (без контекст)
sample_qa = wikipedia_question_answer['test'][12]
question = sample_qa['question']
question

'Which county was Lincoln born in?'

In [45]:
answer = sample_qa['answer']
answer

'Hardin County'

In [46]:
prompt = f'Answer the following question: {question}\n\nAnswer: '
prompt

'Answer the following question: Which county was Lincoln born in?\n\nAnswer: '

In [47]:
tokens = tokenizer(prompt, return_tensors='pt').to('cuda:0')
output_ids = model.generate(tokens.input_ids, max_new_tokens=50)
tokenizer.decode(output_ids[0], skip_special_tokens=True)

'Answer the following question: Which county was Lincoln born in?\n\nAnswer: \n\nAnswer:\n\nAbraham Lincoln was born in Hardin County, Kentucky.\n\n\n\n## Hints\n\n 1. Look for a hint in the question.\n 2. Look for another hint in the question'

In [48]:
predictions_zero = []
references_zero = []

In [49]:
for i in range(10):
    qa = wikipedia_question_answer['test'][i]
    question = qa['question']
    answer = qa['answer']

    prompt = f'Answer the following question: {question}\n\nAnswer: '
    tokens = tokenizer(prompt, return_tensors='pt').to('cuda:0')
    output_ids = model.generate(tokens.input_ids, max_new_tokens=50)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    if 'Answer:' in output:
        pred = output.split('Answer:')[-1].strip()
    else:
        pred = output.split('\n')[-1].strip()

    predictions_zero.append(pred)
    references_zero.append(answer)

In [50]:
bleu.compute(predictions=predictions_zero, references=[[ref] for ref in references_zero])

{'bleu': 0.0,
 'precisions': [0.04296875, 0.016260162601626018, 0.00423728813559322, 0.0],
 'brevity_penalty': 1.0,
 'length_ratio': 9.481481481481481,
 'translation_length': 256,
 'reference_length': 27}

In [51]:
bertscore.compute(predictions=predictions_zero, references=references_zero, model_type='microsoft/deberta-xlarge-mnli')

{'precision': [0.2840512990951538,
  0.3143410086631775,
  0.27587831020355225,
  0.4341427683830261,
  0.38089194893836975,
  0.652838945388794,
  0.3798534870147705,
  0.33406955003738403,
  0.2492079734802246,
  0.30116039514541626],
 'recall': [0.4406528174877167,
  0.4957261383533478,
  0.5760110020637512,
  0.6122863292694092,
  0.36886873841285706,
  0.7623876333236694,
  0.5260615944862366,
  0.34053826332092285,
  0.4971342086791992,
  0.627040445804596],
 'f1': [0.34543201327323914,
  0.38472625613212585,
  0.37307417392730713,
  0.5080509781837463,
  0.3747839331626892,
  0.703373372554779,
  0.44115906953811646,
  0.33727291226387024,
  0.331991970539093,
  0.40689414739608765],
 'hashcode': 'microsoft/deberta-xlarge-mnli_L40_no-idf_version=0.3.12(hug_trans=4.57.3)'}

In [52]:
# Во оваа задача резултатите за bleu се подобри од првата ззадача, бидејќи тука има одговори а не само една буква
# RAG со 5 документи даде просечен bertscore од 0.015
# RAG со 3 документи даде просечен bertscore од 0.35
# RAG со 1 документ даде просечен bertscore од 0.40
# RAG со DistilRoberta даде просечен bertscore од 0.50
# Zero-shot даде просечен bertscore од 0.50
#
# Според добиените резултати можеме да заклучиме дека RAG со 1 догумент даде најдобри резултати од сите останати тестови.
# RAG со DistilRoberta даде најдобар резултат од сите други тестови
# Со помош на zero-shot prompting се добија резултати со иста пецизност како и со DistilRoberta, 
# односно добивме најдобри резултати