In [4]:
import pandas as pd
from transformers import BertTokenizer, LineByLineTextDataset, BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from tokenizers import BertWordPieceTokenizer

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [5]:
data_set_dir = "./data-set"
model_dir = "./saved_model"
model_output_dir = "./saved_model_output"

In [6]:
df = pd.read_csv(f'{data_set_dir}/reviews.csv')
df.dropna(inplace=True)
mlm_df = df[['title', 'body']].copy()

In [7]:
with open(f'{data_set_dir}/review_data.txt', 'w', encoding='utf-8') as f:
    for title, body in zip(mlm_df.title.values, mlm_df.body.values):
        f.write(title + '\n')
        f.write(body + '\n')

In [8]:
tokenizer = BertWordPieceTokenizer()
# vocab_size adalah jumlah vocab/kata yang diinginkan
tokenizer.train(files=f"{data_set_dir}/review_data.txt", vocab_size=30522)
tokenizer.save_model(f'{model_dir}/', 'phone_review')
# load model
vocab_file_dir = f'{model_dir}/phone_review-vocab.txt'
custom_tokenizer = BertTokenizer.from_pretrained(vocab_file_dir)








In [9]:
sentence = 'Motorola V860 is a good phone'
encoded_input = custom_tokenizer.tokenize(sentence)
print(encoded_input)

['motorola', 'v860', 'is', 'a', 'good', 'phone']


In [10]:
# Load BERT default tokenizer -> checkin result is not all tokenizer
bert_default_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sentence = 'Motorola V860 is a good phone'
encoded_input = bert_default_tokenizer.tokenize(sentence)
print(encoded_input)

['motorola', 'v8', '##60', 'is', 'a', 'good', 'phone']


In [11]:
dataset= LineByLineTextDataset(
    tokenizer = custom_tokenizer,
    file_path = f'{data_set_dir}/review_data.txt',
    block_size = 128
)
print('No. of lines: ', len(dataset))



No. of lines:  54409


In [12]:
# Define model parameters to train BERT model from scratch
config = BertConfig(
    vocab_size=30522,
    hidden_size=768, 
    num_hidden_layers=6, 
    num_attention_heads=12,
    max_position_embeddings=512
)
  
model = BertForMaskedLM(config)
print('No of parameters: ', model.num_parameters())
data_collator = DataCollatorForLanguageModeling(
    tokenizer=custom_tokenizer, mlm=True, mlm_probability=0.15
)

No of parameters:  66987066


In [13]:
training_args = TrainingArguments(
    output_dir=f'{model_output_dir}/',
    overwrite_output_dir=True,
    num_train_epochs=30,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    report_to="none"
)
 
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

In [14]:
import torch
# check eaither cuda or mps is available
device = torch.device("mps")
torch.backends.mps.is_available()

True

In [None]:
trainer.train().to(device)
trainer.save_model(f'{model_dir}/').to(device)

Step,Training Loss
500,6.5921
1000,5.9121
1500,5.7757
2000,5.6725
2500,5.619
3000,5.553
3500,5.446


In [16]:
# Load custom trained BERT model
from transformers import pipeline
model = BertForMaskedLM.from_pretrained(model_output_dir)
fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=custom_tokenizer
)
# Actual Text: the battery life is bad
fill_mask('the battery [MASK] is bad')

OSError: ./saved_model_output does not appear to have a file named config.json. Checkout 'https://huggingface.co/./saved_model_output/main' for available files.