In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint identifier shared by the tokenizer and the sequence-classification
# model; both are fetched/loaded through `from_pretrained`.
model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)  # move the weights to the selected device



Downloading (…)okenizer_config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [2]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Define a function for TF-IDF retrieval
def tfidf_retrieval(query, paragraphs, top_n):
    """Return the ``top_n`` paragraphs that best match ``query`` under TF-IDF.

    A fresh ``TfidfVectorizer`` is fitted on ``paragraphs`` on every call, the
    query is projected into the same vocabulary, and paragraphs are ranked by
    the dot product between the query vector and each paragraph vector.
    (With ``TfidfVectorizer``'s default ``norm='l2'`` this dot product is the
    cosine similarity.)

    Args:
        query: The search string.
        paragraphs: Sequence of paragraph strings forming the corpus.
        top_n: Maximum number of paragraphs to return. Values larger than
            ``len(paragraphs)`` return every paragraph; values ``<= 0``
            return an empty list.

    Returns:
        A list of at most ``top_n`` paragraphs (whitespace-stripped), ordered
        from most to least similar. Paragraphs with zero similarity can still
        appear when fewer than ``top_n`` paragraphs match.
    """
    # Nothing to rank, or nothing requested. Without this guard `top_n == 0`
    # would slice `[-0:]` below, which selects *every* paragraph, and an empty
    # corpus would be handed to the vectorizer.
    if len(paragraphs) == 0 or top_n <= 0:
        return []

    # Fit the vocabulary / IDF weights on the corpus and vectorize it.
    vectorizer = TfidfVectorizer()
    tfidf_scores = vectorizer.fit_transform(paragraphs)

    # Vectorize the query with the vocabulary learned from the corpus.
    query_vector = vectorizer.transform([query])

    # Sparse matrix product (1 x vocab) @ (vocab x n_paragraphs). `@` is the
    # explicit matrix-multiplication operator for scipy sparse containers;
    # `ravel()` always yields a 1-D array, even for a one-paragraph corpus.
    similarities = (query_vector @ tfidf_scores.T).toarray().ravel()

    # argsort is ascending: take the last `top_n` indices and reverse them so
    # the best match comes first.
    top_indices = similarities.argsort()[-top_n:][::-1]

    return [paragraphs[i].strip() for i in top_indices]

In [3]:
import os

def read_file(file_path):
    """Read the file at ``file_path`` as UTF-8 text and return its full contents."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def split_into_paragraphs(text, delimiter='\n\n'):
    """Split ``text`` into paragraphs on every occurrence of ``delimiter``.

    Args:
        text: The text to split.
        delimiter: Separator marking a paragraph boundary. Defaults to a blank
            line (``'\\n\\n'``); pass e.g. ``'\\r\\n\\r\\n'`` for files with
            Windows line endings.

    Returns:
        The list of pieces produced by ``str.split``. Pieces are returned
        as-is (not stripped), and empty pieces are kept, so an empty ``text``
        yields ``['']``.

    Raises:
        ValueError: If ``delimiter`` is an empty string (raised by
            ``str.split``).
    """
    return text.split(delimiter)

def process_files_in_folder(folder_path):
    """Collect the paragraphs of every regular file directly inside a folder.

    Each file is read with ``read_file`` and split with
    ``split_into_paragraphs``; the resulting paragraphs are concatenated into
    one flat list. Sub-directories are skipped (no recursion).

    Args:
        folder_path: Path of the directory to scan.

    Returns:
        A flat list of paragraphs, grouped by file, with files visited in
        sorted file-name order.
    """
    paragraph_list = []

    # os.listdir returns entries in arbitrary order; sorting makes the order
    # of the returned paragraphs reproducible across runs and filesystems.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            continue  # skip sub-directories and other non-file entries
        text = read_file(file_path)
        paragraph_list.extend(split_into_paragraphs(text))

    return paragraph_list

In [4]:
# Folder holding the text files that make up the retrieval corpus.
WIKI_DATA_DIR = "/kaggle/input/wiki-data"

# Flat list of paragraphs gathered from every file in that folder.
paragraphs_list = process_files_in_folder(WIKI_DATA_DIR)

In [6]:
# Sanity check: number of paragraphs collected into the corpus.
len(paragraphs_list)

416

In [10]:
# How many of the best-matching paragraphs to keep.
top_n = 3
# Free-text question used as the retrieval query.
query = "when was the bitcoin invented ?"

# Rank the corpus against the query and keep the `top_n` best paragraphs.
top_paragraphs = tfidf_retrieval(query, paragraphs_list, top_n=top_n)

# Bare expression so the notebook displays the retrieved paragraphs.
top_paragraphs

['Bitcoin transactions are verified by network nodes through cryptography and recorded in a public distributed ledger called a blockchain. The cryptocurrency was invented in 2008 by an unknown person or group of people using the name Satoshi Nakamoto. The currency began use in 2009, when its implementation was released as open-source software.:\u200ach. 1\u200a\nThe word "bitcoin" was defined in a white paper published on October 31, 2008. It is a compound of the words bit and coin.',
 'The first timestamping scheme invented was the proof-of-work scheme. The most widely used proof-of-work schemes are based on SHA-256 and scrypt.',
 'The first wallet program, simply named Bitcoin, and sometimes referred to as the Satoshi client, was released in 2009 by Satoshi Nakamoto as open-source software. In version 0.5 the client moved from the wxWidgets user interface toolkit to Qt, and the whole bundle was referred to as Bitcoin-Qt. After the release of version 0.9, the software bundle was renam

In [11]:
# Score every retrieved paragraph against the query with the NLI model.
# The paragraph is passed as the first segment and the query as the second.

# Names assigned positionally to the three output logits.
# NOTE(review): assumed to match the checkpoint's logit order - confirm
# against `model.config.id2label`.
label_names = ["entailment", "neutral", "contradiction"]

predictions = []
for paragraph in top_paragraphs:
    # `encoded` (not `input`) so the Python builtin `input` is not shadowed.
    encoded = tokenizer(paragraph, query, truncation=True, return_tensors="pt").to(device)

    # Inference only: skip autograd bookkeeping, and forward every tensor the
    # tokenizer produced (attention mask included), not just the input ids.
    with torch.no_grad():
        output = model(**encoded)

    # Softmax over the logits of the single sequence in the batch, converted
    # to percentages rounded to one decimal place.
    prediction = torch.softmax(output.logits[0], -1).tolist()
    prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
    predictions.append(prediction)


print("Predictions for each retrieved paragraph:")
for i, (paragraph, prediction) in enumerate(zip(top_paragraphs, predictions)):
    # "No" only when "neutral" is strictly the highest-scoring label; any
    # other outcome (including ties on the rounded scores) is reported as "Yes".
    if prediction["neutral"] > prediction["contradiction"] and prediction["neutral"] > prediction["entailment"]:
        result = "No"
    else:
        result = "Yes"

    print("\n")
    print("********************")
    print("\n")
    print("Paragraph : ",f"{i+1} : " , paragraph )
    print("Prediction : " , prediction)
    print("Result : " , result)

Predictions for each retrieved paragraph:


********************


Paragraph :  1 :  Bitcoin transactions are verified by network nodes through cryptography and recorded in a public distributed ledger called a blockchain. The cryptocurrency was invented in 2008 by an unknown person or group of people using the name Satoshi Nakamoto. The currency began use in 2009, when its implementation was released as open-source software.: ch. 1 
The word "bitcoin" was defined in a white paper published on October 31, 2008. It is a compound of the words bit and coin.
Prediction :  {'entailment': 99.1, 'neutral': 0.5, 'contradiction': 0.4}
Result :  Yes


********************


Paragraph :  2 :  The first timestamping scheme invented was the proof-of-work scheme. The most widely used proof-of-work schemes are based on SHA-256 and scrypt.
Prediction :  {'entailment': 0.3, 'neutral': 99.2, 'contradiction': 0.6}
Result :  No


********************


Paragraph :  3 :  The first wallet program, simply nam