In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/computer-science-theory-qa-dataset/intents.json
/kaggle/input/data-science-glossary-for-qa/glossary1.json


A.

In [None]:
import json
import re

# Step 1: Load the dataset
def load_qa_data(file_path):
    try:
        with open(file_path, 'r') as f:
            qa_data = json.load(f)
        return qa_data
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return {}

# Step 2: Clean and preprocess text (optional stop word removal)
def clean_text(text):
    stop_words = {'the', 'is', 'in', 'at', 'which', 'on', 'and', 'a', 'an'}
    # Remove non-alphabet characters and split into words
    words = re.sub(r'[^a-zA-Z\s]', '', text.lower()).split()
    # Remove stop words
    return [word for word in words if word not in stop_words]

# Step 3: Keyword matching function
def find_answer(question, qa_data):
    cleaned_question = clean_text(question)
    
    for q, a in qa_data.items():
        cleaned_q = clean_text(q)
        if all(keyword in cleaned_q for keyword in cleaned_question):
            return a

    return "Sorry, I don't know the answer to that question."

# Step 4: Main function
def main():
    file_path = '/kaggle/input/data-science-glossary-for-qa/glossary1.json'
    qa_data = load_qa_data(file_path)  # Load the Q&A dataset

    if not qa_data:
        return  # Exit if dataset could not be loaded

    while True:
        user_question = input("Ask a question (or type 'exit' to quit): ")
        if user_question.lower() in ['exit', 'quit']:
            print("Exiting the Q&A system. Goodbye!")
            break

        answer = find_answer(user_question, qa_data)
        print("Answer:", answer)

if __name__ == "__main__":
    main()


B.

In [None]:
# Import required libraries
from datasets import load_dataset
from transformers import BertTokenizer, BertForQuestionAnswering, Trainer, TrainingArguments
import torch

# Load the dataset
dataset = load_dataset('squad')

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Preprocessing function
def preprocess_data(examples):
    inputs = tokenizer(
        examples['context'], 
        examples['question'], 
        truncation=True, 
        padding='max_length'
    )

    start_positions = []
    end_positions = []

    # Assuming 'examples['answers']' is a list of dictionaries
    for answer in examples['answers']:
        start_positions.append(answer['answer_start'][0])  # Assuming single answer per example
        end_positions.append(answer['answer_start'][0] + len(answer['text'][0]))

    inputs.update({
        'start_positions': start_positions,
        'end_positions': end_positions
    })
    
    return inputs


# Map the preprocessing function to the dataset
train_dataset = dataset['train'].map(preprocess_data, batched=True)

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

# Step 5: Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Load the BERT model for question answering
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dataset['validation'],
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Example: Predict on a single question-context pair
question = "What is the capital of France?"
context = "The capital of France is Paris."

# Tokenize the input example
inputs = tokenizer(question, context, return_tensors='pt', padding=True, truncation=True)

# Get model prediction
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

# Get the predicted start and end positions
predicted_start = torch.argmax(start_logits)
predicted_end = torch.argmax(end_logits)

# Get the answer from the context
answer = context[predicted_start.item():predicted_end.item() + 1]
print(f"Predicted Answer: {answer}")


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: