In [3]:
import os
import pandas as pd
from datetime import datetime
import numpy as np
import nltk
import torch
from io import BytesIO

from tqdm.notebook import tqdm

from transformers import BertTokenizer, BertModel
from transformers import AutoModel, AutoTokenizer

# Ensure that the Punkt Tokenizer Models are downloaded
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\johnny\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Initialize MinIO client and BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def generate_vectors(text):
    # Check if GPU is available and use it; otherwise, use CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Send model to device (GPU or CPU)
    model.to(device)
    model.eval()  # Ensure the model is in evaluation mode

    # Ensure no gradient calculations
    with torch.no_grad():
        # Prepare inputs and send them to the device
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        
        # Forward pass, send model outputs back to CPU
        outputs = model(**inputs).last_hidden_state.mean(dim=1).squeeze().to('cpu').numpy()

    # Convert the output to float32 for compatibility and ensure it's flat
    return outputs.astype(np.float32)

# Example usage with a DataFrame
# df_sentence['Summary_vector'] = df_sentence['Summary'].progress_apply(lambda x: generate_vectors(x) if isinstance(x, str) else np.nan)


In [5]:
import faiss


In [6]:
pd.set_option('display.max_colwidth', None)  # For pandas versions that support None as an option
# or
pd.set_option('display.max_colwidth', 1000)  # Set a large value explicitly


In [7]:
#download_dir = 'D:\\downloads'
download_dir = 'D:\\downloads\\amazon_customer_reviews'
print(download_dir)

D:\downloads\amazon_customer_reviews


In [8]:
#df_pickle_filename = os.path.join(download_dir,"amazon_reviews_pickle_paragraphs_20240219.pkl")
df_pickle_filename = os.path.join(download_dir,"amazon_reviews_pickle_paragraphs_similarity_20240219.pkl")

print(df_pickle_filename)

D:\downloads\amazon_customer_reviews\amazon_reviews_pickle_paragraphs_similarity_20240219.pkl


In [9]:
index_paragraph_filename = os.path.join(download_dir,"amazon_reviews_pickle_sentences_similarity_20240219_index_paragraph.faiss")
index_paragraph = faiss.read_index(index_paragraph_filename)


In [10]:
index_summary_filename = os.path.join(download_dir,"amazon_reviews_pickle_sentences_similarity_20240219_index_summary.faiss")
index_summary = faiss.read_index(index_summary_filename)


In [11]:
#df_pickle_filename = os.path.join(download_dir,"amazon_reviews_pickle_paragraphs_similarity_20240219.pkl")

df_pickle_reduced_filename = os.path.join(download_dir,"amazon_reviews_pickle_paragraphs_similarity_reduced_20240219.pkl")

df_paragraph = pd.read_pickle(df_pickle_reduced_filename )

In [12]:
df_paragraph.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Paragraph',
       'Paragraph_Cluster', 'Paragraph_Distance', 'Summary_Cluster',
       'Summary_Distance', 'Summary_similarity_distance',
       'Summary_similarity_index', 'Paragraph_similarity_distance',
       'Paragraph_similarity_index'],
      dtype='object')

In [23]:
paragraph_similarity_distance = df_paragraph.iloc[0].Paragraph_similarity_distance

In [24]:
paragraph_similarity_distance

array([ 0.       ,  9.213089 ,  9.347366 ,  9.460304 ,  9.700714 ,
        9.70076  ,  9.70076  ,  9.70076  ,  9.70076  ,  9.70076  ,
        9.70076  ,  9.70076  ,  9.70076  ,  9.70076  ,  9.701889 ,
       10.113693 , 10.255371 , 10.263527 , 10.2729645, 10.301964 ,
       10.332123 , 10.355682 , 10.420395 , 10.544197 , 10.554474 ,
       10.554474 , 10.554474 , 10.575752 , 10.576309 , 10.576309 ,
       10.576309 , 10.576309 , 10.576324 , 10.579697 , 10.589935 ,
       10.589935 , 10.589943 , 10.589943 , 10.589943 , 10.589943 ,
       10.589943 , 10.589943 , 10.589943 , 10.589943 , 10.589943 ,
       10.630844 , 10.635422 , 10.635422 , 10.635422 , 10.635422 ,
       10.635422 , 10.635422 , 10.635422 , 10.635422 , 10.635437 ,
       10.6716   , 10.676559 , 10.694992 , 10.694992 , 10.699684 ,
       10.753281 , 10.765335 , 10.767212 , 10.7866745, 10.787247 ,
       10.789093 , 10.81221  , 10.820984 , 10.820984 , 10.858795 ,
       10.868179 , 10.89344  , 10.904137 , 10.927902 , 10.9339

In [115]:
paragraph_similarity_index = df_paragraph.iloc[1].Paragraph_similarity_index

In [116]:
paragraph_similarity_index

array([     1, 408473, 459824, 230876, 502381, 245616, 527311, 438658,
       349804, 194089, 159471, 142505, 499358, 298688, 491070,  41768,
       267807, 334483,  68894, 417287, 309025, 187245, 315377, 275071,
       498950, 484232, 438995,  29075, 505004,  31922, 435531,  31813,
       435422,  12366, 294553, 147117,  81647,  59750,  58152, 135081,
       381083, 381082, 564975, 342293, 387457, 135092, 236790, 231867,
       389426, 290343, 540431, 469678, 388590, 238012,  85890, 241155,
       126047, 160163, 438134, 142328, 310480, 196808, 368942, 369417,
        79443, 420516, 398799, 496167, 265497, 238369, 297794, 198701,
        17860, 438705, 432557, 567515, 522180, 118525,  29674,  29122,
       239297, 214398, 382645, 477098, 257118, 308293, 508396, 389794,
       309039, 444614, 358197, 241758, 435155, 410436, 239152, 502018,
       381149, 369686, 348656,   7394], dtype=int64)

In [117]:
df_selected_paragraphs = df_paragraph.loc[paragraph_similarity_index, ['Paragraph']]

In [118]:
df_selected_paragraphs['Product Type'] = ""

In [119]:
df_selected_paragraphs['Original Index'] = df_selected_paragraphs.index

In [120]:
df_selected_paragraphs

Unnamed: 0,Paragraph,Product Type,Original Index
1,"Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as ""Jumbo"".",,1
408473,I highly recommend this to all vegtable lovers. It's just fantastic.. Try it and you will see what I mean...,,408473
459824,"This is the best marshmallow creme<br />I have ever eaten. It is great in fudge<br />recipes. It is also great as a quick snack,<br />just grab a spoonful. It takes the craving<br />right away.<br /><br />I have also used it to make rice krispie<br />marshmallow treats since I did not have<br />marshmallows. It was pretty good, but not<br />as good as with marshmallows.<br /><br />However, I must add that since I am having<br />blood sugar problems, I restrict this to once<br />a year for Christmas fudge recipes. The fudge<br />just can't survive without it!",,459824
230876,"My Father's Day Tower of Sweets arrived immediately, but the product was a little short and was a concern. I left feedback letting the seller know of my concerns. They immediately contacted me with an apology and a promise to investigate. Today I received another email and they will be sending a replacement package. I know it will be a big surprise to Dad and I know he will be happy when he gets to try the treats! Thank you Broadway Basketeers for outstanding customer service and concern about the satisfaction of your customers. You are greatly appreciated.",,230876
502381,"Thrown into a bag, some were broken, selection was terrible. Not even 50% were coffee! Would never purchase again or recommend. Shame on you!",,502381
...,...,...,...
502018,"My cats go crazy over this food! I was worried they would not like it, because it is shredded, but it did not bother them one bit. The ingredient list is great, as well. Also, it is one of the few canned cat foods that does not contain carrageenan, which I recently found out causes GI problems in cats (and possibly cancer).",,502018
381149,"My dad drank chocolate malted milk for years and got me hooked on it. I have since developed a preference for the original (i.e., non-chocolate) product. I can't find it in stores anymore, so the alternative for me is through Amazon or other online store. The quality is consistent, batch after batch. If malted milk is your thing, Carnation is the best.",,381149
369686,Best cookies ever! Honestly my fav!! And they are WAYYY healthier than almost any other cookie. I love these and the price is great on here especially with subscribe and save!,,369686
348656,Product arrived quickly and in good condition.<br />I used this for confectionery work and it worked nicely.<br />This sugar is finer than the 10X confecctioner's sugar you get at the grocery store.,,348656


In [122]:
selected_paragraphs_excel_filename = os.path.join(download_dir,"selected_paragraphs_for_annotation.xlsx")
df_selected_paragraphs.to_excel(selected_paragraphs_excel_filename, index=False)

In [123]:
selected_paragraphs_excel_filename

'D:\\downloads\\amazon_customer_reviews\\selected_paragraphs_for_annotation.xlsx'

In [None]:
#df_selected_paragraphs_reset = df_selected_paragraphs.reset_index().rename(columns={'index': 'Original Index'})

In [None]:
#stop

In [124]:
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Alignment

# Create an Excel writer using openpyxl
#excel_path = "/mnt/data/df_selected_paragraphs_formatted.xlsx"
excel_path =  selected_paragraphs_excel_filename
writer = pd.ExcelWriter(excel_path, engine='openpyxl')

# Convert the DataFrame to an openpyxl Workbook object
df_selected_paragraphs.to_excel(writer, index=False, sheet_name='Sheet1')

# Get the openpyxl workbook and worksheet objects
workbook = writer.book
worksheet = writer.sheets['Sheet1']

# Set the column width for the Paragraph column (B)
worksheet.column_dimensions['A'].width = 50
worksheet.column_dimensions['B'].width = 50

for cell in worksheet['A'][1:]:  # Skip the header row
    cell.alignment = Alignment(wrapText=True)


workbook.save(selected_paragraphs_excel_filename)
# Save the workbook
writer.close()

excel_path


'D:\\downloads\\amazon_customer_reviews\\selected_paragraphs_for_annotation.xlsx'

In [125]:
df_with_product_types = pd.read_excel(selected_paragraphs_excel_filename)


In [126]:
df_with_product_types['Original Index'] = df_selected_paragraphs.index

In [127]:
df_with_product_types['Product Type'] = df_with_product_types['Product Type'].str.lower()

In [128]:
df_with_product_types

Unnamed: 0,Paragraph,Product Type,Original Index
0,"Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as ""Jumbo"".",snack food,1
1,I highly recommend this to all vegtable lovers. It's just fantastic.. Try it and you will see what I mean...,snack food,408473
2,"This is the best marshmallow creme<br />I have ever eaten. It is great in fudge<br />recipes. It is also great as a quick snack,<br />just grab a spoonful. It takes the craving<br />right away.<br /><br />I have also used it to make rice krispie<br />marshmallow treats since I did not have<br />marshmallows. It was pretty good, but not<br />as good as with marshmallows.<br /><br />However, I must add that since I am having<br />blood sugar problems, I restrict this to once<br />a year for Christmas fudge recipes. The fudge<br />just can't survive without it!",snack food,459824
3,"My Father's Day Tower of Sweets arrived immediately, but the product was a little short and was a concern. I left feedback letting the seller know of my concerns. They immediately contacted me with an apology and a promise to investigate. Today I received another email and they will be sending a replacement package. I know it will be a big surprise to Dad and I know he will be happy when he gets to try the treats! Thank you Broadway Basketeers for outstanding customer service and concern about the satisfaction of your customers. You are greatly appreciated.",snack food,230876
4,"Thrown into a bag, some were broken, selection was terrible. Not even 50% were coffee! Would never purchase again or recommend. Shame on you!",coffee,502381
...,...,...,...
95,"My cats go crazy over this food! I was worried they would not like it, because it is shredded, but it did not bother them one bit. The ingredient list is great, as well. Also, it is one of the few canned cat foods that does not contain carrageenan, which I recently found out causes GI problems in cats (and possibly cancer).",pet food,502018
96,"My dad drank chocolate malted milk for years and got me hooked on it. I have since developed a preference for the original (i.e., non-chocolate) product. I can't find it in stores anymore, so the alternative for me is through Amazon or other online store. The quality is consistent, batch after batch. If malted milk is your thing, Carnation is the best.",drink,381149
97,Best cookies ever! Honestly my fav!! And they are WAYYY healthier than almost any other cookie. I love these and the price is great on here especially with subscribe and save!,snack food,369686
98,Product arrived quickly and in good condition.<br />I used this for confectionery work and it worked nicely.<br />This sugar is finer than the 10X confecctioner's sugar you get at the grocery store.,baking,348656


In [129]:
df_paragraph.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Paragraph',
       'Paragraph_Cluster', 'Paragraph_Distance', 'Summary_Cluster',
       'Summary_Distance', 'Summary_similarity_distance',
       'Summary_similarity_index', 'Paragraph_similarity_distance',
       'Paragraph_similarity_index', 'Product Types'],
      dtype='object')

In [94]:
#df_paragraph = df_paragraph.drop('Product Types',axis=1)

In [96]:
#if 'Product Types' not in df_paragraph.columns:
#    df_paragraph['Product Types'] = [[] for _ in range(len(df_paragraph))]

In [130]:
# Function to update product types in df_paragraph
def update_product_types(original_df, updates_df):
    for _, row in updates_df.iterrows():
        original_index = row['Original Index']
        product_type = row['Product Type']
        
        # Append the new product type to the existing list, ensuring no duplicates
        if product_type not in original_df.at[original_index, 'Product Types']:
            original_df.at[original_index, 'Product Types'].append(product_type)


In [131]:
df_with_product_types.columns

Index(['Paragraph', 'Product Type', 'Original Index'], dtype='object')

In [132]:
# Call the function to update df_paragraph with the new product types
update_product_types(df_paragraph, df_with_product_types)

In [135]:
df_paragraph.iloc[1]

Id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [81]:
df_paragraph.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Paragraph',
       'Paragraph_Cluster', 'Paragraph_Distance', 'Summary_Cluster',
       'Summary_Distance', 'Summary_similarity_distance',
       'Summary_similarity_index', 'Paragraph_similarity_distance',
       'Paragraph_similarity_index', 'Product Types'],
      dtype='object')

In [134]:
from itertools import chain

# Flatten the list of lists of product types
all_product_types = list(chain.from_iterable(df_paragraph['Product Types']))

# Convert to a Series to use value_counts
product_type_series = pd.Series(all_product_types)

# View the value counts
product_type_counts = product_type_series.value_counts()

product_type_counts


pet food            61
snack food          53
unclear             13
coffee              12
tea                  8
condiment            8
can food             5
juice                5
baby formula         4
baking               3
dog treat            3
pet treat            3
drink                2
pasta                2
cereal               2
sweetener            2
spice                2
baby food            1
bake decoration      1
flowers              1
noodles              1
wine kit             1
canning supplies     1
dog food             1
creamer              1
pancake mix          1
sauce                1
Name: count, dtype: int64

In [None]:
stop

In [71]:
# Generate the vector for "dog food"
dog_food_vector = generate_vectors("dog food")


In [81]:
zero_vector = generate_vectors(df_paragraph.iloc[0].Paragraph)

In [83]:
import numpy as np

def cosine_similarity(vec_a, vec_b):
    # Compute the cosine similarity between two vectors
    dot_product = np.dot(vec_a, vec_b)
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    return dot_product / (norm_a * norm_b)




In [84]:
# Assuming dog_food_vector and paragraph_vector are your vectors
similarity = cosine_similarity(dog_food_vector, zero_vector)
print(f"Cosine Similarity: {similarity}")

Cosine Similarity: [0.5313073]


In [89]:
k = 10000  # Adjust based on how many similar paragraphs you want to find
D, I = index_paragraph.search(dog_food_vector.reshape(1, -1), k)  # Reshape dog_food_vector for FAISS compatibility


In [93]:
4726 in I

True

In [92]:
I

array([[553307, 519333, 393596, ...,  50369, 520330, 520938]], dtype=int64)

In [94]:
D

array([[56.16536 , 57.149216, 59.001614, ..., 71.07399 , 71.07418 ,
        71.07418 ]], dtype=float32)

In [72]:
import faiss

# Ensure your FAISS index is properly loaded
# For example, if you saved your index to disk: faiss_index = faiss.read_index('path_to_your_index_file')

# Prepare the dog food vector for search
# The vector should be reshaped to match the dimensions FAISS expects ([1, vector_length])
dog_food_vector = dog_food_vector.reshape(1, -1)

# Perform the search
k = 10  # Number of nearest neighbors you want to find
D, I = index_paragraph.search(dog_food_vector, k)  # D: Distances, I: Indices of the nearest neighbors

# D and I are arrays containing distances to the query and the indices of the nearest vectors
print("Indices of nearest neighbors:", I)
print("Distances:", D)

Indices of nearest neighbors: [[553307 519333 393596 508464 204030 335883 366224  19617 165598 392395]]
Distances: [[56.16536  57.149216 59.001614 59.131626 59.13763  59.13763  59.13763
  59.13763  59.13763  59.13763 ]]


In [73]:
# Assuming df_paragraph is your DataFrame and it's indexed in a way that aligns with the FAISS index
for idx in I[0]:  # Loop through each index found by FAISS
    # Retrieve and print the corresponding paragraph
    print(df_paragraph.loc[idx, 'Paragraph'])
    print("---")


I first tried these after my sister gave me a bag of the <a href="http://www.amazon.com/gp/product/B000YT5NFO">Snack Factory Original Pretzel Crisps</a>.  My family really liked them as a quick snack they could grab a handful of.  I also found them dipping them in sour cream, peanut butter, cream cheese and also adding sliced cheese.  When I saw they had flavored ones, I had to buy them and so far the <a href="http://www.amazon.com/gp/product/B000YTC9MO">Snack Factory Buffalo Wing Pretzel Crisps</a> are our favorites. They don't work well dipped in peanut butter, but are good with all the others including ranch dressing or just by themselves.  Although the size of the bags seem small, because they are so light and crispy, you do receive quite a bit in them. On average I've paid anywhere from $2.50 to $3.50 a bag and I'm okay with that. Definitely give them a try if you like Buffalo Wings!
---
We switched from the sensitive to the soy similac because my daughter was FREAKING out each ni

In [None]:
for 

In [38]:
paragraph = df_paragraph.iloc[0:30].Paragraph
type(paragraph)

pandas.core.series.Series

In [29]:
summary = df_paragraph.iloc[0].Summary
print(summary)

Good Quality Dog Food


In [30]:
df_paragraph.iloc[0:30].Summary

0                                 Good Quality Dog Food
1                                     Not as Advertised
2                                 "Delight" says it all
3                                        Cough Medicine
4                                           Great taffy
5                                            Nice Taffy
6         Great!  Just as good as the expensive brands!
7                                Wonderful, tasty taffy
8                                            Yay Barley
9                                      Healthy Dog Food
10                      The Best Hot Sauce in the World
11    My cats LOVE this "diet" food better than thei...
12                 My Cats Are Not Fans of the New Food
13                                    fresh and greasy!
14                         Strawberry Twizzlers - Yummy
15             Lots of twizzlers, just what you expect.
16                                           poor taste
17                                             L

In [15]:
context = df_paragraph.iloc[0].Paragraph
print(context)

I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.


In [27]:
from transformers import pipeline

# Load a pre-trained model and tokenizer for question answering
question_answering_pipeline = pipeline("question-answering", model="bert-base-uncased")

# Your sample paragraph
#context = 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.'

# Formulate a question
question = "What product is the paragraph about?"

# Use the model to find the answer
answer = question_answering_pipeline({'question': question, 'context': context})

print(f"Question: {question}")
print(f"Answer: {answer['answer']}")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Question: What product is the paragraph about?
Answer: of good quality. The


In [20]:
from transformers import BertForQuestionAnswering, Trainer, TrainingArguments

model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

training_args = TrainingArguments(
    output_dir='./models/',
    num_train_epochs=3,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs/',
    evaluation_strategy='steps', # Adjust this based on how often you want to evaluate
    logging_steps=50, # Log metrics every 50 steps
)

# Assuming `train_dataset` and `eval_dataset` are already defined
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

trainer.train()


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'train_dataset' is not defined

In [21]:
from transformers import BertTokenizerFast
import torch
from torch.utils.data import Dataset, DataLoader

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

class QuestionAnsweringDataset(Dataset):
    def __init__(self, contexts, questions, answers):
        self.contexts = contexts
        self.questions = questions
        self.answers = answers
    
    def __len__(self):
        return len(self.contexts)
    
    def __getitem__(self, idx):
        context = self.contexts[idx]
        question = self.questions[idx]
        answer_text = self.answers[idx]['text']
        start_position = self.answers[idx]['answer_start']
        end_position = start_position + len(answer_text)

        # Tokenize context and question
        encodings = tokenizer.encode_plus(question, context, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        input_ids = encodings['input_ids'].squeeze()
        attention_mask = encodings['attention_mask'].squeeze()

        # Find positions of answer tokens in context
        answer_ids = tokenizer.encode(answer_text, add_special_tokens=False)
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        answer_start_positions = [i for i, token in enumerate(tokens) if token == tokenizer.convert_ids_to_tokens(answer_ids)[0]]
        answer_end_positions = [i for i, token in enumerate(tokens) if token == tokenizer.convert_ids_to_tokens(answer_ids)[-1]]
        
        # Handling cases where the answer is truncated due to max_length
        start_position = answer_start_positions[0] if len(answer_start_positions) > 0 else 0
        end_position = answer_end_positions[0] if len(answer_end_positions) > 0 else 0

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'start_positions': torch.tensor(start_position),
            'end_positions': torch.tensor(end_position)
        }

# Example data (replace with your actual data)
contexts = ["This is a context paragraph about the Transformers library. Transformers provides access to thousands of pre-trained models."]
questions = ["What does the Transformers library provide?"]
answers = [{"text": "access to thousands of pre-trained models", "answer_start": 37}]

# Create dataset
dataset = QuestionAnsweringDataset(contexts, questions, answers)

# Splitting the dataset into training and evaluation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, eval_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])




In [22]:
print(len(contexts), len(questions), len(answers))


1 1 1


In [23]:
contexts

['This is a context paragraph about the Transformers library. Transformers provides access to thousands of pre-trained models.']

In [24]:
questions

['What does the Transformers library provide?']

In [25]:
answers

[{'text': 'access to thousands of pre-trained models', 'answer_start': 37}]