In [2]:
import json
# Read training data to fine-tune the custom BERT model.
# The encoding is pinned to UTF-8 (the standard JSON text encoding) so the load
# does not depend on the platform's locale default.
with open(r"data/amazon_data_train.json", "r", encoding="utf-8") as read_file:
    train = json.load(read_file)
# Read test data to evaluate the fine-tuned model (same explicit encoding).
with open(r"data/amazon_data_test.json", "r", encoding="utf-8") as read_file:
    test = json.load(read_file)

In [None]:
import logging
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs
from transformers import BertTokenizer
# Build the tokenizer from the custom vocabulary file.
# The constructor's vocab_file argument is used because passing a single file
# path to from_pretrained() is deprecated in transformers.
vocab_path = '../bert-mask/data-set/phone_review-vocab.txt'
tokenizer = BertTokenizer(vocab_file=vocab_path)
# Save the tokenizer files to the model directory
tokenizer.save_pretrained('../bert-mask/custom_bert_output/')

In [None]:
import logging
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs
from transformers import BertTokenizer, BertForMaskedLM
 
# Model type and the path of the custom BERT model directory (the same
# directory the tokenizer is saved into earlier in this notebook).
model_type = "bert"
model_name = '../bert-mask/custom_bert_output/'

# Create the output folder for the fine-tuned custom BERT.
# makedirs(exist_ok=True) keeps this cell re-runnable: os.mkdir raised
# FileExistsError on every execution after the first.
import os
output_dir = 'finetune_bert_outputs'
os.makedirs(output_dir, exist_ok=True)

# Set up training arguments
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "use_cached_eval_features": True,
    # NOTE(review): unlike best_model_dir, this path is not under the
    # output_dir folder created above — confirm that is intended.
    "output_dir": f"outputs/{model_type}",
    "best_model_dir": f"{output_dir}/{model_type}/best_model",
    # Evaluate on the eval data every 1000 training steps.
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 1000,
    "max_seq_length": 128,
    "num_train_epochs": 30,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "n_best_size": 8,
    "train_batch_size": 16,
    "eval_batch_size": 16,
}

# Load the custom model with its tokenizer; use_cuda=False pins it to CPU.
model = QuestionAnsweringModel(model_type, model_name, args=train_args, use_cuda=False)

In [None]:
# Fine-tune on the training data, passing the test data as the evaluation set.
model.train_model(
    train,
    eval_data=test,
)

In [None]:
# Run evaluation on the test data and show the returned result as cell output.
eval_outputs = model.eval_model(test)
result, texts = eval_outputs
result

In [None]:
# Ask the model for the product's model name from a single listing title.
product_context = "Samsung Galaxy M14 5G (Smoky Teal, 6GB, 128GB Storage) | 50MP Triple Cam | 6000 mAh Battery | 5nm Octa-Core Processor | 12GB RAM with RAM Plus | Android 13 | Without Charger"
question_entry = {
    "question": "What is the model name of the Samsung smartphone?",
    "id": "0",
}
# One context paired with one question.
to_predict = [{"context": product_context, "qas": [question_entry]}]

answers, probabilities = model.predict(to_predict, n_best_size=None)
print(answers)

In [None]:
# Ask the model whether the listed phone ships with a charger.
product_context = "Samsung Galaxy M14 5G (Smoky Teal, 6GB, 128GB Storage) | 50MP Triple Cam | 6000 mAh Battery | 5nm Octa-Core Processor | 12GB RAM with RAM Plus | Android 13 | Without Charger"
question_entry = {
    "question": "Does the Samsung Galaxy M14 5G come with a charger?",
    "id": "0",
}
# One context paired with one question.
to_predict = [{"context": product_context, "qas": [question_entry]}]

answers, probabilities = model.predict(to_predict, n_best_size=None)
print(answers)