In [None]:
# Install to be able to use BERT

!pip install transformers

In [None]:
# Notes for future improvement:
# Worth trying more granular segment representation models than IO, such as IOE, IOB, IOBES.

In [2]:
# Import dependencies

import pandas as pd
import pickle
from urllib.request import urlopen
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

import torch 
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim import SGD

In [19]:
# Load variables used during development

# with open('X_test.pkl', 'rb') as f:
#   X_test = pickle.load(f)

In [12]:
# Read data

df = pd.read_csv('store_pages.csv')

In [13]:
# Inspect data

stores = df['max(page)'].to_list()

stores[:5]

['https://www.factorybuys.com.au/products/euro-top-mattress-king',
 'https://dunlin.com.au/products/beadlight-cirrus',
 'https://themodern.net.au/products/hamar-plant-stand-ash',
 'https://furniturefetish.com.au/products/oslo-office-chair-white',
 'https://hemisphereliving.com.au/products/']

In [228]:
# Count data

len(stores)

704

In [5]:
# Split data into training and testing (only 30% for training)

X = stores
y = [0 for elem in range(len(stores))]

X_train, X_test, _, _ = train_test_split(X, y, test_size=0.7, random_state=42)

In [407]:
# Count data

len(X_train)

211

In [203]:
# Export variables used during development

# with open('test_full_extracted.pkl', 'wb') as f:
#   pickle.dump(test_full_extracted, f)

In [198]:
# Define function to extract text from a given URL

def extract_text(url):
  output = []
  try:
    html = urlopen(url).read()
    soup = BeautifulSoup(html, features="html.parser")
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.body.get_text()
    # Break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # Break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # Drop blank lines
    # output = '\n'.join(chunk+'.' for chunk in chunks if chunk)
    for chunk in chunks:
      if chunk:
        words = chunk.split()
        output.append(words)
        output.append('.')
  except:
    output = None
  return output

In [425]:
# Make a list with the outputted texts

full_extracted = []
X_train_new = []

for elem in X_train:
  extracted = extract_text(elem)
  if extracted:
    extracted = [item for sublist in extracted for item in sublist]
    full_extracted.append(extracted)
    X_train_new.append(elem)

In [21]:
# Count data

len(full_extracted)

79

In [20]:
# Define a function to manually tag a word as a "Product" if it matches a given string

def tag_elem(words, tag):
  output = []
  for word in words:
    new_word = ''.join(filter(str.isalpha, word))
    if new_word.lower() in tag.split():
      output.append(new_word + ' I-PROD')
    else:
      output.append(word + ' O')
  return output

In [108]:
# Inspect data

X_train_new[70]

'https://shackletonsretail.co.uk/products/'

In [429]:
# Create an empty list

tagged = []

In [107]:
# Manually tag 79 product pages

tagged.append(tag_elem(full_extracted[78], 'table barstools'))

In [110]:
# Count data

len(tagged)

79

In [112]:
# Create sentences by putting together words and their labels

sents = []
labels = []

for elem_2 in tagged:
  sent = ''
  label = ''
  for elem in elem_2:
    word = elem.split()[0]
    if word not in '.?!':
      sent += ' ' + word
      label += ' ' + elem.split()[1]
    else:
      sents.append(sent)
      labels.append(label)
      sent = ''
      label = ''

In [113]:
# Create a dataframe using the sentences and labels

fixed_df = pd.DataFrame(sents)
fixed_df['labels'] = labels
fixed_df.columns = ['text', 'labels']
fixed_df.head(5)

Unnamed: 0,text,labels
0,-,O
1,|,O
2,/,O
3,Save up to % Save % Save up to,O O O O O O O O O
4,Save,O


In [114]:
# Count data

len(fixed_df)

15898

In [None]:
# Instantiate pre-trained BERT tokenizer

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [117]:
# Reference: https://towardsdatascience.com/named-entity-recognition-with-bert-in-pytorch-a454405e0b6a

# Each splitted token shares the same word_ids. 
# BERT tokens such as [CLS], [SEP], and [PAD] all do not have specific word_ids.

# These word_ids will be very useful to adjust the length of the label by applying either of these two methods:
# 1. We only provide a label to the first sub-word of each splitted token.
# The continuation of the sub-word then will simply have ‘-100’ as a label. 
# All tokens that don’t have word_ids will also be labeled with ‘-100’.
# 2. We provide the same label among all of the sub-words that belong to the same token. 
# All tokens that don’t have word_ids will be labeled with ‘-100’.
# The function below will do exactly the step defined above.

label_all_tokens = False

def align_label(texts, labels):
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=128, truncation=True)
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx
    return label_ids

In [118]:
# Define a class to load data using torch utilities

class DataSequence(torch.utils.data.Dataset):
    def __init__(self, df):
        lb = [i.split() for i in df['labels'].values.tolist()]
        txt = df['text'].values.tolist()
        self.texts = [tokenizer(str(i),
                               padding='max_length', max_length = 128, truncation=True, return_tensors="pt") for i in txt]
        self.labels = [align_label(i,j) for i,j in zip(txt, lb)]

    def __len__(self):
        return len(self.labels)

    def get_batch_data(self, idx):
        return self.texts[idx]

    def get_batch_labels(self, idx):
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)
        return batch_data, batch_labels

In [134]:
# Further split the data into training/validation/testing

df = fixed_df

labels = [i.split() for i in df['labels'].values.tolist()]
unique_labels = set()

for lb in labels:
        [unique_labels.add(i) for i in lb if i not in unique_labels]
labels_to_ids = {k: v for v, k in enumerate(unique_labels)}
ids_to_labels = {v: k for v, k in enumerate(unique_labels)}

df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                            [int(.8 * len(df)), int(.9 * len(df))])

In [135]:
# Define a class for the BERT model

class BertModel(torch.nn.Module):
    def __init__(self):
        super(BertModel, self).__init__()
        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))

    def forward(self, input_id, mask, label):
        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
        return output

In [136]:
# Define a function for a custom training loop

def train_loop(model, df_train, df_val):
    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)
    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=BATCH_SIZE)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    optimizer = SGD(model.parameters(), lr=LEARNING_RATE)
    if use_cuda:
        model = model.cuda()
    for epoch_num in range(EPOCHS):
        total_acc_train = 0
        total_loss_train = 0
        model.train()
        for train_data, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)
            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)
            for i in range(logits.shape[0]):
              logits_clean = logits[i][train_label[i] != -100]
              label_clean = train_label[i][train_label[i] != -100]
              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              if isinstance(acc.item(), float) and acc.item() >= 0 and acc.item() <= 1:
                total_acc_train += acc.item()
              total_loss_train += loss.item()
            loss.backward()
            optimizer.step()
        model.eval()
        total_acc_val = 0
        total_loss_val = 0
        for val_data, val_label in val_dataloader:
            val_label = val_label.to(device)
            mask = val_data['attention_mask'].squeeze(1).to(device)
            input_id = val_data['input_ids'].squeeze(1).to(device)
            loss, logits = model(input_id, mask, val_label)
            for i in range(logits.shape[0]):
              logits_clean = logits[i][val_label[i] != -100]
              label_clean = val_label[i][val_label[i] != -100]
              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              if isinstance(acc.item(), float) and acc.item() >= 0 and acc.item() <= 1:
                total_acc_val += acc.item()
              total_loss_val += loss.item()
        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)
        print(f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')

In [137]:
# Specify paramters
LEARNING_RATE = 5e-3
EPOCHS = 2
BATCH_SIZE = 2

# Instantiate model
model = BertModel()

# Uncomment line to load the model from the .pth file
# model = torch.load('ML-Assignment/model.pth')

# Start training (CUDA GPU has been used during development)
train_loop(model, df_train, df_val)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

Epochs: 1 | Loss:  0.071 | Accuracy:  0.982 | Val_Loss:  0.075 | Accuracy:  0.979


100%|██████████| 6359/6359 [06:01<00:00, 17.57it/s]


Epochs: 2 | Loss:  0.051 | Accuracy:  0.984 | Val_Loss:  0.062 | Accuracy:  0.982


In [138]:
# Define a function for evaluating the model

def evaluate(model, df_test):
    test_dataset = DataSequence(df_test)
    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
    total_acc_test = 0.0
    for test_data, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_data['attention_mask'].squeeze(1).to(device)
            input_id = test_data['input_ids'].squeeze(1).to(device)
            loss, logits = model(input_id, mask, test_label)
            for i in range(logits.shape[0]):
              logits_clean = logits[i][test_label[i] != -100]
              label_clean = test_label[i][test_label[i] != -100]
              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              if isinstance(acc.item(), float) and acc.item() >= 0 and acc.item() <= 1:
                total_acc_test += acc.item()
    val_accuracy = total_acc_test / len(df_test)
    print(f'Test Accuracy: {total_acc_test / len(df_test): .3f}')

In [139]:
# Evaluate the model (accuracy)

evaluate(model, df_test)

Test Accuracy:  0.988


In [140]:
# Align word IDs when dealing with a single text for inference

def align_word_ids(texts):
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=128, truncation=True)
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(1 if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx
    return label_ids

In [221]:
# Define a function for inference on a single text

def evaluate_one_text(model, sentence):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
    text = tokenizer(sentence, padding='max_length', max_length = 128, truncation=True, return_tensors="pt")
    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)
    logits = model(input_id, mask, None)
    logits_clean = logits[0][label_ids != -100]
    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [ids_to_labels[i] for i in predictions]
    return prediction_label
    # print(sentence)
    # print(prediction_label)

In [142]:
# Inference on a given text

evaluate_one_text(model, 'Increase quantity for Black Chair')

Increase quantity for Black Chair
['O', 'O', 'O', 'O', 'I-PROD']


In [None]:
count = 0
for index, label in enumerate(labels):
  if 'I-PROD' in label:
    count += 1
    print(sents[index])

In [145]:
count

737

In [143]:
# Save model

torch.save(model.state_dict(), 'model.pth')

In [None]:
# Below are git-related OS commands to export the model

!git clone https://github.com/teodortita/ML-Assignment.git

In [186]:
!cp model.pth ML-Assignment/model.pth

In [None]:
%cd ML-Assignment

In [None]:
!git lfs install

In [None]:
!git lfs track "*.pth"

In [190]:
!git add .gitattributes

In [191]:
!git add model.pth

In [None]:
!git commit -m "Add model file"

In [None]:
# Extract texts from test data URLs

test_full_extracted = []
X_test_new = []

for elem in X_test:
  extracted = extract_text(elem)
  if extracted:
    extracted = [item for sublist in extracted for item in sublist]
    test_full_extracted.append(extracted)
    X_test_new.append(elem)

In [208]:
# Count data

len(test_full_extracted)

227

In [209]:
# Create sentences from outputted texts

test_sents = []

for elem_2 in test_full_extracted:
  sent = ''
  for elem in elem_2:
    word = elem.split()[0]
    if word not in '.?!':
      sent += ' ' + word
    else:
      test_sents.append(sent)
      sent = ''

In [223]:
# Perform inference using the model and output results as a text file

results = []

for elem in test_sents:
  output_labels = evaluate_one_text(model, elem)
  if output_labels and 'I-PROD' in output_labels:
    results.append((elem, output_labels))

In [224]:
# Count data

len(results)

810

In [225]:
# Inspect data

results[0]

(' Cow Horn Chair', ['O', 'O', 'I-PROD'])

In [227]:
# Export results (file is now in the repo)

with open("results.txt", 'w') as output:
    for row in results:
        output.write(str(row) + '\n')