In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the labelled product/series table; the CSV path is relative to the
# notebook's working directory.
myData = pd.read_csv('2021_10_20_products_series_labelled.csv')

In [22]:
import json
from pathlib import Path

def read_squad(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    The file is walked as ``data -> paragraphs -> qas -> answers`` and one entry is
    emitted per answer, so a question with several answers contributes several
    rows and a question with an empty ``answers`` list contributes none.

    Args:
        path: location of the JSON file (anything ``pathlib.Path`` accepts).

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists; each
        answer is the answer dict exactly as stored in the file.
    """
    with open(Path(path), 'rb') as handle:
        payload = json.load(handle)

    def _rows():
        # Yield one (context, question, answer) triple per stored answer.
        for article in payload['data']:
            for paragraph in article['paragraphs']:
                paragraph_text = paragraph['context']
                for qa_item in paragraph['qas']:
                    question_text = qa_item['question']
                    for answer_item in qa_item['answers']:
                        yield paragraph_text, question_text, answer_item

    rows = list(_rows())
    contexts = [row[0] for row in rows]
    questions = [row[1] for row in rows]
    answers = [row[2] for row in rows]
    return contexts, questions, answers

# Flatten the SQuAD v2.0 train and dev files (paths relative to the working
# directory) into parallel context / question / answer lists.
train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')

def add_end_idx(answers, contexts):
    """Add an exclusive ``answer_end`` character offset to every answer, in place.

    The recorded ``answer_start`` is sometimes one or two characters too far to
    the right. For each answer the recorded start and the starts shifted left by
    one and by two characters are tried in that order; the first one at which
    ``context`` actually contains ``answer['text']`` wins and ``answer_start`` is
    corrected to it.

    If none of the three candidates matches, ``answer_start`` is left untouched
    and ``answer_end`` is still set (to ``answer_start + len(text)``) so that every
    answer carries the key and later ``answer['answer_end']`` lookups cannot raise
    ``KeyError``.

    Args:
        answers: list of dicts with at least ``'text'`` and ``'answer_start'``.
        contexts: list of context strings, parallel to ``answers``.

    Returns:
        None. The dicts in ``answers`` are mutated.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        span_len = len(gold_text)

        # Candidate starts: as labelled, then off by one, then off by two.
        # Negative candidates are skipped: a negative slice bound would wrap
        # around to the end of the context instead of meaning "before the start".
        matched_start = next(
            (
                candidate
                for candidate in (start_idx, start_idx - 1, start_idx - 2)
                if candidate >= 0 and context[candidate:candidate + span_len] == gold_text
            ),
            None,
        )

        if matched_start is None:
            # No alignment found: keep the labelled start, but always define the end.
            answer['answer_end'] = start_idx + span_len
        else:
            answer['answer_start'] = matched_start
            answer['answer_end'] = matched_start + span_len

# Annotate both SQuAD splits in place with 'answer_end' (and a corrected
# 'answer_start' where the label was off by one or two characters).
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [35]:
train_contexts[1][207:226],len('singing and dancing')

('singing and dancing', 19)

In [23]:
train_answers[:10]

[{'text': 'in the late 1990s', 'answer_start': 269, 'answer_end': 286},
 {'text': 'singing and dancing', 'answer_start': 207, 'answer_end': 226},
 {'text': '2003', 'answer_start': 526, 'answer_end': 530},
 {'text': 'Houston, Texas', 'answer_start': 166, 'answer_end': 180},
 {'text': 'late 1990s', 'answer_start': 276, 'answer_end': 286},
 {'text': "Destiny's Child", 'answer_start': 320, 'answer_end': 335},
 {'text': 'Dangerously in Love', 'answer_start': 505, 'answer_end': 524},
 {'text': 'Mathew Knowles', 'answer_start': 360, 'answer_end': 374},
 {'text': 'late 1990s', 'answer_start': 276, 'answer_end': 286},
 {'text': 'lead singer', 'answer_start': 290, 'answer_end': 301}]

In [3]:
# Keep only the product id, the product title and the labelled series name.
# Note: this rebinds `myData`, so the original frame is no longer reachable.
myData = myData.loc[:,['omsid','product_name','series_name']]

In [4]:
# Add the two span columns, pre-filled with the string sentinel "N".
# A later cell drops every row whose 'answer_start' is still "N".
myData.loc[:,'answer_start'] = "N"
myData.loc[:,'answer_end'] = "N"

In [5]:
# For every product whose series name occurs verbatim in its title, record the
# character span of the first occurrence: 'answer_start' is inclusive,
# 'answer_end' is exclusive (start + length of the series name).
counter = 0
for i, row in myData.iterrows():
    title, series = row.product_name, row.series_name
    # A missing value is not a str; `in` / `find` would raise TypeError on it,
    # so such rows are left unlabelled.
    if not (isinstance(title, str) and isinstance(series, str)):
        continue
    # One scan instead of `in` followed by `.index`; -1 means "not in the title".
    start = title.find(series)
    if start != -1:
        myData.loc[i,'answer_start'] = start
        myData.loc[i,'answer_end'] = start + len(series)
        counter += 1
print(f"out of {myData.shape[0]}, {counter} are in the title")

out of 29675, 23391 are in the title


In [6]:
# Keep only the rows that received a span, i.e. whose 'answer_start' is no
# longer the "N" sentinel.
myData_cleaned = myData.loc[myData.loc[:,'answer_start'] != "N", :]

In [7]:
myData_cleaned

Unnamed: 0,omsid,product_name,series_name,answer_start,answer_end
4,315696655,AN-4500 and AN-4512 Aria Single Handle Single ...,Aria,20,24
5,309122846,Hilo 8 in. Widespread 2-Handle Bathroom Faucet...,Hilo,0,4
6,309122896,Hilo 4 in. Centerset 2-Handle Bathroom Faucet ...,Hilo,0,4
7,309122853,Hilo 8 in. Widespread 2-Handle Bathroom Faucet...,Hilo,0,4
8,309122906,Hilo 4 in. Centerset 2-Handle Bathroom Faucet ...,Hilo,0,4
...,...,...,...,...,...
29670,313341432,Colinet 2-Handle Wall Mount Bathroom Faucet Tr...,Colinet,0,7
29671,313341429,Colinet 2-Handle Wall Mount Bathroom Faucet Tr...,Colinet,0,7
29672,313341430,Colinet 2-Handle Wall Mount Bathroom Faucet Tr...,Colinet,0,7
29673,313341434,Colinet 2-Handle Wall Mount Bathroom Faucet Tr...,Colinet,0,7


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# 80/20 split of the labelled rows. The seed is fixed so that the split — and
# therefore every metric computed further down — is identical on each
# Restart & Run All.
train, test = train_test_split(myData_cleaned, test_size=0.2, random_state=42)

In [67]:
test

[]

In [10]:
def build_qa_examples(frame, question='What is the series name of this product?'):
    """Turn labelled product rows into parallel question-answering lists.

    Args:
        frame: DataFrame with 'product_name', 'series_name', 'answer_start' and
            'answer_end' columns.
        question: the question string paired with every product title.

    Returns:
        ``(contexts, questions, answers)``: the product titles, one copy of
        ``question`` per row, and one ``{'text', 'answer_start', 'answer_end'}``
        dict per row.
    """
    contexts, questions, answers = [], [], []
    columns = (frame['product_name'], frame['series_name'],
               frame['answer_start'], frame['answer_end'])
    for title, series, start, end in zip(*columns):
        contexts.append(title)
        questions.append(question)
        # The span columns also held the "N" string sentinel, so normalise the
        # offsets to plain ints before they are used as character indices.
        answers.append({'text': series, 'answer_start': int(start), 'answer_end': int(end)})
    return contexts, questions, answers


# These rebind the train_* names that the SQuAD cell used earlier.
train_contexts, train_questions, train_answers = build_qa_examples(train)
test_contexts, test_questions, test_answers = build_qa_examples(test)
    

In [109]:
len(train_answers), len(test_answers), test_answers[:10]

(18712,
 4679,
 [{'text': 'Meridian', 'answer_start': 0, 'answer_end': 8},
  {'text': 'Decora Smart', 'answer_start': 0, 'answer_end': 12},
  {'text': 'Surge', 'answer_start': 0, 'answer_end': 5},
  {'text': 'Diva', 'answer_start': 0, 'answer_end': 4},
  {'text': 'Claro', 'answer_start': 0, 'answer_end': 5},
  {'text': 'Riverby', 'answer_start': 0, 'answer_end': 7},
  {'text': 'Ashlyn', 'answer_start': 0, 'answer_end': 6},
  {'text': 'Builders', 'answer_start': 0, 'answer_end': 8},
  {'text': 'Woodhurst', 'answer_start': 0, 'answer_end': 9},
  {'text': 'Kaiser', 'answer_start': 0, 'answer_end': 6}])

In [112]:
test_contexts[1][0:12]

'Decora Smart'

In [121]:
# from transformers import DistilBertTokenizerFast
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-uncased')
len(train_answers)


18712

In [76]:
ls roberta-base-squad2/

README.md           merges.txt         special_tokens_map.json
config.json         pytorch_model.bin  tokenizer_config.json
flax_model.msgpack  rust_model.ot      vocab.json


In [113]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode (context, question) pairs: the product title is passed as the FIRST
# sequence and the question as the second. add_token_positions calls
# char_to_token without a sequence index — presumably that resolves character
# offsets against the first sequence; confirm against the tokenizers docs
# before swapping the argument order.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding=True)

In [114]:
def add_token_positions(encodings, answers, fallback_position=None):
    """Attach token-level answer labels to ``encodings``, in place.

    For example ``i`` the character offsets ``answers[i]['answer_start']`` and
    ``answers[i]['answer_end'] - 1`` (the last answer character) are mapped to
    token indices with ``encodings.char_to_token``. The resulting lists are
    stored under ``'start_positions'`` and ``'end_positions'`` via
    ``encodings.update``.

    Args:
        encodings: tokenizer output offering ``char_to_token(batch_index,
            char_index)`` and ``update(dict)``.
        answers: list of dicts with ``'answer_start'`` and ``'answer_end'``
            character offsets, parallel to the encoded examples.
        fallback_position: token index stored when ``char_to_token`` returns
            ``None``. Defaults to ``None``, which keeps the previous behaviour of
            using the module-level ``tokenizer.model_max_length``; that global is
            only looked up if a fallback is actually needed.

    Returns:
        None. ``encodings`` is mutated.
    """
    def _resolve(token_index):
        # char_to_token found a token: use it as is.
        if token_index is not None:
            return token_index
        # None: per the original note, the answer passage has been truncated.
        if fallback_position is not None:
            return fallback_position
        return tokenizer.model_max_length

    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        start_positions.append(_resolve(encodings.char_to_token(i, answer['answer_start'])))
        # 'answer_end' is exclusive, hence the - 1 to address the last character.
        end_positions.append(_resolve(encodings.char_to_token(i, answer['answer_end'] - 1)))
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Add 'start_positions' / 'end_positions' token labels to both encodings (in place).
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, test_answers)

In [129]:
val_encodings.char_to_token(1, 7)

3

In [17]:

import torch

class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves one tokenised example at a time.

    ``encodings`` must provide ``items()`` yielding ``(field, per-example values)``
    pairs and expose the token ids through an ``input_ids`` attribute.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One example per entry of input_ids.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Build a {field: tensor} dict from the idx-th entry of every field.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

# Wrap the training and validation encodings (already carrying
# start_positions / end_positions) in SquadDataset objects.
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [18]:
from transformers import DistilBertForQuestionAnswering

# Load the pretrained distilbert-base-uncased checkpoint into a question-answering
# model. Per the load message below, the 'qa_outputs' weights are newly initialised,
# so the model has to be fine-tuned before its predictions are meaningful.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [19]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

# Fine-tune `model` on `train_dataset` for 3 epochs.
# pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# move model over to detected device
model.to(device)
# activate training mode of model
model.train()
# AdamW optimizer; only the learning rate is set here, every other
# hyperparameter (including the weight-decay strength) is left at the library default.
# NOTE(review): `transformers.AdamW` is reportedly deprecated in newer releases in
# favour of `torch.optim.AdamW` — confirm against the installed version.
optim = AdamW(model.parameters(), lr=5e-5)

# data loader for the training data: batches of 16, reshuffled every epoch
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(3):
    # set model to train mode (already set above; repeated at the start of each epoch)
    model.train()
    # wrap the loader in tqdm to get a per-epoch progress bar
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # reset the gradients left over from the previous step
        optim.zero_grad()
        # move the tensors needed for this step onto the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # forward pass; passing the start/end labels makes the model return a loss
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        # the loss is the first element of the outputs
        loss = outputs[0]
        # backpropagate: compute the gradient of the loss for every trainable parameter
        loss.backward()
        # update parameters
        optim.step()
        # show the epoch number and the current batch loss on the progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 1170/1170 [01:54<00:00, 10.18it/s, loss=0.0529] 
Epoch 1: 100%|██████████| 1170/1170 [01:55<00:00, 10.15it/s, loss=0.00708] 
Epoch 2: 100%|██████████| 1170/1170 [01:55<00:00, 10.14it/s, loss=0.00322] 


In [54]:
from transformers import pipeline

In [68]:
# Wrap the fine-tuned model and its tokenizer in a question-answering pipeline
# for ad-hoc queries on raw strings.
qa = pipeline("question-answering", model=model, tokenizer=tokenizer)

In [76]:
# Smoke test: build the validation loader and iterate over it once.
# The loop body is a no-op — nothing is computed from the batches.
val_loader = DataLoader(val_dataset, batch_size=16)

for i, a in enumerate(val_loader):
    #images = a.to(device)

    #print(i, a.keys())
    pass



In [77]:
# The single question asked about every product title (same string as used
# when building the training examples).
question="What is the series name of this product?"
# Example pipeline call on the first test row's product title, left disabled:
#qa(question=question, context=test.iloc[0,1])

In [88]:
val_dataset

<__main__.SquadDataset at 0x7f05aac78910>

In [152]:
# switch model out of training mode
model.eval()
# initialize validation set data loader
val_loader = DataLoader(val_dataset, batch_size=16)


def _decode_span(token_ids, start, end):
    """Decode the inclusive token span ``[start, end]`` of ``token_ids`` to text.

    An inverted or out-of-range span slices to nothing; the empty id list is
    then passed through the same two tokenizer calls.
    """
    span_ids = token_ids[int(start):int(end) + 1].tolist()
    return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(span_ids))


# Running counts instead of a list of per-batch accuracies: averaging batch
# accuracies gives a short final batch the same weight as a full one.
start_hits, end_hits, n_examples = 0, 0, 0
# decoded predicted / true answers over the whole validation set
all_preds, all_truths = [], []
# decoded answers of the most recent batch (stay empty if the loader is empty)
pred_answer_batch, true_answer_batch = [], []

# we don't need to calculate gradients as we're not training
with torch.no_grad():
    for batch in val_loader:
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # we will use true positions for accuracy calc
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        # make predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        # argmax over the sequence dimension gives the predicted start/end tokens
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        # count exact token-index hits for both ends
        batch_size = input_ids.shape[0]
        start_hits += (start_pred == start_true).sum().item()
        end_hits += (end_pred == end_true).sum().item()
        n_examples += batch_size
        # decode predicted and true spans back to strings for exact-match comparison
        pred_answer_batch = [_decode_span(input_ids[x], start_pred[x], end_pred[x]) for x in range(batch_size)]
        true_answer_batch = [_decode_span(input_ids[x], start_true[x], end_true[x]) for x in range(batch_size)]
        all_preds.extend(pred_answer_batch)
        all_truths.extend(true_answer_batch)

# mean of start-token and end-token accuracy, weighted per example;
# NaN (rather than ZeroDivisionError) when the validation set is empty
acc = (start_hits + end_hits) / (2 * n_examples) if n_examples else float('nan')

In [151]:
(pred_answer_batch == true_answer_batch)

True

In [157]:
import numpy as np

# Exact-match rate: fraction of validation examples whose decoded predicted
# answer string equals the decoded true answer string.
np.sum(np.array(all_truths) == np.array(all_preds))/len(all_preds)

0.9805513998717674

In [160]:
# Persist the fine-tuned model and its tokenizer side by side in one directory
# (relative to the working directory) so they can be reloaded together.
model_path = "distilbert-custom/"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('distilbert-custom/tokenizer_config.json',
 'distilbert-custom/special_tokens_map.json',
 'distilbert-custom/vocab.txt',
 'distilbert-custom/added_tokens.json',
 'distilbert-custom/tokenizer.json')

In [162]:
##

# Reload the saved model and tokenizer from disk; this rebinds the `model`
# and `tokenizer` names used by the cells above.
# NOTE(review): the reloaded model is not moved to `device` in this cell —
# confirm that before feeding it tensors that live on the GPU.
model_path = 'distilbert-custom/'
model = DistilBertForQuestionAnswering.from_pretrained(model_path)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
