In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from collections import defaultdict
import re
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
from torch.utils.data import Dataset, DataLoader
import qa_sys
import rules
#!pip install transformers

### Load data
*Load the train, dev and test sets, which are stored as separate files.*

In [None]:
# Directory that holds the WOZ_{train,dev,test}_{utt,ans}.txt files.
PATH = "./data/"

# Mini-batch size used by every DataLoader in this notebook. It is defined here
# because the "Prepare data" cells build their DataLoaders before the training
# configuration cell (which assigns the same value) is reached; without this
# early definition a fresh "Restart & Run All" stops with a NameError.
batch_size = 8

In [None]:
def load_data(path, encoding=None):
    """Read a text file into a list with one entry per line.

    Args:
        path: path of the file to read.
        encoding: text encoding forwarded to ``open``. The default ``None``
            keeps the platform's default encoding, exactly as before; pass
            e.g. ``"utf-8"`` to read the file the same way on every machine.

    Returns:
        A list of strings, one per line, with newline characters stripped
        from both ends. Blank lines are kept as empty strings.
    """
    with open(path, encoding=encoding) as f:
        # Iterate the file object directly instead of materialising
        # f.readlines() first; the resulting lines are the same.
        return [line.strip('\n') for line in f]

In [None]:
# Read the "utt" and "ans" file of each split (train / dev / test).
# load_data returns one list entry per line of the file.
# NOTE(review): the rest of the notebook pairs utt[i] with ans[i], so the two
# files of a split are presumably line-aligned -- confirm against the data.
train_utt = load_data(PATH + 'WOZ_train_utt.txt')
train_ans = load_data(PATH + 'WOZ_train_ans.txt')
dev_utt = load_data(PATH + 'WOZ_dev_utt.txt')
dev_ans = load_data(PATH + 'WOZ_dev_ans.txt')
test_utt = load_data(PATH + 'WOZ_test_utt.txt')
test_ans = load_data(PATH + 'WOZ_test_ans.txt')

### Intent prediction
*Predict intents using a LogisticRegression classifier and evaluate it on the dev set.*

In [None]:
# Build the intent labels for the train, dev and test sets.
# The intent is the part of each answer line that precedes the first '|'
# (or the whole line when it contains no '|').
train_intent = [answer.split('|', 1)[0] for answer in train_ans]
dev_intent = [answer.split('|', 1)[0] for answer in dev_ans]
test_intent = [answer.split('|', 1)[0] for answer in test_ans]

In [None]:
# vectorization
# Bag-of-words token counts, dropping scikit-learn's built-in English stop words.
# The vocabulary is learned from the training utterances only; the dev
# utterances are merely transformed with that fitted vocabulary.
vectorizer = CountVectorizer(stop_words = 'english')
x_train = vectorizer.fit_transform(train_utt)
x_dev = vectorizer.transform(dev_utt)

In [None]:
# build classifier
# LogisticRegression with scikit-learn's default settings, fitted on the
# training count vectors and their intent labels, then applied to the dev set.
clf = LogisticRegression()
clf.fit(x_train, train_intent)
dev_pred = clf.predict(x_dev)

In [None]:
# check accuracy score on dev set
# Fraction of dev utterances whose predicted intent equals the gold intent.
accuracy_score(dev_intent, dev_pred)

0.9951573849878934

### Prepare data
*Generate and prepare the QA datasets for the train, dev and test sets.*

In [None]:
# train
# qa_sys.generate_QA is a project helper; judging by the unpacking below it
# returns the "food" answers, the "name" answers, the "food" questions and the
# "name" questions, in that order (presumably one per utterance -- TODO confirm).
# NOTE(review): `batch_size` is a notebook-level global that must already be
# defined when this cell runs.
# shuffle=False makes the DataLoader yield the examples in dataset order.
train_food_ans, train_name_ans, train_food_Q, train_name_Q = qa_sys.generate_QA(train_utt, train_ans)
train_food_dataset = qa_sys.prepare_QA_dataset(train_food_Q, train_utt, train_food_ans, split = 'train')
train_food_dataloader = DataLoader(train_food_dataset, batch_size=batch_size, shuffle=False)

train_name_dataset = qa_sys.prepare_QA_dataset(train_name_Q, train_utt, train_name_ans, split = 'train')
train_name_dataloader = DataLoader(train_name_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# dev
# Same preparation as for the training split, applied to the dev utterances and
# answers: one dataset/DataLoader for the "food" question and one for "name".
# NOTE(review): `batch_size` is a notebook-level global that must already be
# defined when this cell runs.
# shuffle=False keeps dataset order, so predictions made from these loaders
# come out in the same order as the dataset.
dev_food_ans, dev_name_ans, dev_food_Q, dev_name_Q = qa_sys.generate_QA(dev_utt, dev_ans)
dev_food_dataset = qa_sys.prepare_QA_dataset(dev_food_Q, dev_utt, dev_food_ans, split = 'dev')
dev_food_dataloader = DataLoader(dev_food_dataset, batch_size=batch_size, shuffle=False)
dev_name_dataset = qa_sys.prepare_QA_dataset(dev_name_Q, dev_utt, dev_name_ans, split = 'dev')
dev_name_dataloader = DataLoader(dev_name_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# test
# Same preparation as for the training split, applied to the test utterances
# and answers: one dataset/DataLoader for the "food" question and one for "name".
# NOTE(review): `batch_size` is a notebook-level global that must already be
# defined when this cell runs.
# shuffle=False keeps dataset order, so predictions made from these loaders
# come out in the same order as the dataset.
test_food_ans, test_name_ans, test_food_Q, test_name_Q = qa_sys.generate_QA(test_utt, test_ans)
test_food_dataset = qa_sys.prepare_QA_dataset(test_food_Q, test_utt, test_food_ans, split = 'test')
test_food_dataloader = DataLoader(test_food_dataset, batch_size=batch_size, shuffle=False)

test_name_dataset = qa_sys.prepare_QA_dataset(test_name_Q, test_utt, test_name_ans, split = 'test')
test_name_dataloader = DataLoader(test_name_dataset, batch_size=batch_size, shuffle=False)

### Train model
*Fine-tune two pretrained DistilBertForQuestionAnswering models, one for predicting the "food" slot and one for predicting the "name" slot.*

In [None]:
# initialize tokenizer and models for food and name slots
# A single tokenizer is shared by both slots; predict() also uses it to decode
# predicted token spans back into text.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Two independent copies of the same pretrained checkpoint, fine-tuned
# separately: one for the "food" question, one for the "name" question.
MODEL_food = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
MODEL_name = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

In [None]:
# Training hyper-parameters: number of passes over the data and mini-batch size.
EPOCHS = 1
batch_size = 8

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
def train_model(model, epochs, dataloader, dataset):
    """Fine-tune a span-prediction QA model in place.

    Args:
        model: model called as ``model(token_ids, attention_mask=masks)`` whose
            output exposes ``start_logits`` and ``end_logits``.
        epochs: number of passes over ``dataloader``.
        dataloader: yields ``(token_ids, spans, attention_masks)`` batches.
            Column 0 of ``spans`` is used as the gold start index and column 1
            as the gold end index.
        dataset: the dataset behind ``dataloader``; only ``len(dataset)`` is
            used, for the progress message.

    Returns:
        None. The model is moved to the global ``device`` and left in
        training mode.
    """
    loss_function = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.00003)

    model.to(device)
    # from_pretrained() hands back the model in evaluation mode (dropout off);
    # switch to training mode explicitly so fine-tuning uses dropout.
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        pairs_seen = 0
        for batch_counter, (train_text_batch, train_span_batch, masks) in enumerate(dataloader, start=1):
            optimizer.zero_grad()
            train_text_batch = train_text_batch.to(device)
            train_span_batch = train_span_batch.to(device)
            masks = masks.to(device)

            output = model(train_text_batch, attention_mask=masks)
            # Total loss = cross-entropy over start positions + over end positions.
            start_loss = loss_function(output.start_logits, train_span_batch[:, 0])
            end_loss = loss_function(output.end_logits, train_span_batch[:, 1])
            loss = start_loss + end_loss

            loss.backward()
            optimizer.step()

            # Count the examples actually seen instead of multiplying by a
            # global batch size, so the message stays right for any DataLoader.
            pairs_seen += len(train_span_batch)
            if batch_counter % 10 == 0:
                print("Processed ", pairs_seen, "QA pairs of ", len(dataset))
                print("Last loss:", loss.item())
            epoch_loss += loss.item()
        print('After epoch:', epoch, 'Loss is:', epoch_loss)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [None]:
# start training for "food" slot
# Fine-tunes MODEL_food in place for EPOCHS passes over the "food" training QA pairs.
train_model(MODEL_food, EPOCHS, train_food_dataloader, train_food_dataset)

Processed  80 QA pairs of  3760
Last loss: 4.312615394592285
Processed  160 QA pairs of  3760
Last loss: 1.0133447647094727
Processed  240 QA pairs of  3760
Last loss: 0.6027958393096924
Processed  320 QA pairs of  3760
Last loss: 0.44490280747413635
Processed  400 QA pairs of  3760
Last loss: 0.5862692594528198
Processed  480 QA pairs of  3760
Last loss: 0.2224215865135193
Processed  560 QA pairs of  3760
Last loss: 0.6620270013809204
Processed  640 QA pairs of  3760
Last loss: 0.020175576210021973
Processed  720 QA pairs of  3760
Last loss: 0.14330537617206573
Processed  800 QA pairs of  3760
Last loss: 0.012889356352388859
Processed  880 QA pairs of  3760
Last loss: 0.0809469074010849
Processed  960 QA pairs of  3760
Last loss: 0.061416760087013245
Processed  1040 QA pairs of  3760
Last loss: 0.20455601811408997
Processed  1120 QA pairs of  3760
Last loss: 0.009376998990774155
Processed  1200 QA pairs of  3760
Last loss: 0.0372556671500206
Processed  1280 QA pairs of  3760
Last loss

In [None]:
# start training for "name" slot
# Fine-tunes MODEL_name in place for EPOCHS passes over the "name" training QA pairs.
train_model(MODEL_name, EPOCHS, train_name_dataloader, train_name_dataset)

Processed  80 QA pairs of  3760
Last loss: 4.08853816986084
Processed  160 QA pairs of  3760
Last loss: 0.32913580536842346
Processed  240 QA pairs of  3760
Last loss: 0.1239340603351593
Processed  320 QA pairs of  3760
Last loss: 1.2988331317901611
Processed  400 QA pairs of  3760
Last loss: 0.9637863039970398
Processed  480 QA pairs of  3760
Last loss: 0.056811340153217316
Processed  560 QA pairs of  3760
Last loss: 0.4704493582248688
Processed  640 QA pairs of  3760
Last loss: 0.28271085023880005
Processed  720 QA pairs of  3760
Last loss: 0.021310556679964066
Processed  800 QA pairs of  3760
Last loss: 0.02728232741355896
Processed  880 QA pairs of  3760
Last loss: 0.09154394268989563
Processed  960 QA pairs of  3760
Last loss: 0.049079105257987976
Processed  1040 QA pairs of  3760
Last loss: 0.00849387887865305
Processed  1120 QA pairs of  3760
Last loss: 0.04806038364768028
Processed  1200 QA pairs of  3760
Last loss: 0.026445701718330383
Processed  1280 QA pairs of  3760
Last lo

### QA system evaluation on dev set
*Evaluate the question-answering system for the "food" and "name" slots on the dev set.*

In [None]:
def evaluate(model, dataloader):
    """Print the start-index and end-index accuracy of a QA model.

    For every example the predicted start (end) is the argmax of the model's
    start (end) logits; it is compared with column 0 (column 1) of the gold
    span tensor delivered by the dataloader.

    Args:
        model: model called as ``model(token_ids, attention_mask=masks)`` whose
            output exposes ``start_logits`` and ``end_logits``.
        dataloader: yields ``(token_ids, spans, attention_masks)`` batches.

    Returns:
        None. The two accuracies are printed. The model is moved to the global
        ``device`` and left in evaluation mode.
    """
    predicted_starts, predicted_ends = [], []
    gold_starts, gold_ends = [], []

    # The batches are moved to `device` below, so the model has to live there
    # too; predict() does the same. Without this, evaluating a model that was
    # not previously moved by train_model() fails with a device mismatch.
    model.to(device)
    model.eval()
    with torch.no_grad():
        for dev_text_batch, dev_span_batch, masks in dataloader:
            dev_text_batch, masks = dev_text_batch.to(device), masks.to(device)
            output = model(dev_text_batch, attention_mask=masks)

            start_scores = output.start_logits.detach().cpu().numpy()
            end_scores = output.end_logits.detach().cpu().numpy()
            targets = dev_span_batch.detach().cpu().numpy()

            predicted_starts.extend(list(np.argmax(start_scores, axis=1)))
            gold_starts.extend(list(targets[:, 0]))
            predicted_ends.extend(list(np.argmax(end_scores, axis=1)))
            gold_ends.extend(list(targets[:, 1]))

    print("Starts accuracy")
    print(accuracy_score(gold_starts, predicted_starts))
    print("Ends accuracy")
    print(accuracy_score(gold_ends, predicted_ends))

In [None]:
# evaluate for "food" slot
# Prints start/end index accuracy of MODEL_food on the dev "food" QA pairs.
evaluate(MODEL_food, dev_food_dataloader)

Starts accuracy
0.9878934624697336
Ends accuracy
0.9830508474576272


In [None]:
# evaluate for "name" slot
# Prints start/end index accuracy of MODEL_name on the dev "name" QA pairs.
evaluate(MODEL_name, dev_name_dataloader)

Starts accuracy
0.9757869249394673
Ends accuracy
0.9709443099273608


### Prediction
*Predict intents and slots. Then concatenate the answers (e.g. "find_intent|intent-slot1=ans1|intent-slot2=ans2") and set up the pipeline.*

In [None]:
def select_best_answer_span(start_probs, end_probs, distance):
    '''
    Pick, for every row, the highest-scoring (start, end) index pair that forms a legal span.

    start_probs and end_probs are 2-D arrays of per-position scores (one row per text). A pair
    is scored by start score + end score and is legal when
    start <= end <= start + distance.

    Candidates are generated lazily: at each step the next-best start and next-best end are
    paired with everything ranked above them, and a candidate is only examined once its score
    is at least the best score any not-yet-generated pair could still reach.

    Returns a list with one (start index, end index) 2-tuple per row.
    '''
    start_order = np.argsort(start_probs * -1, axis=1)
    end_order = np.argsort(end_probs * -1, axis=1)

    output_spans = []
    for row in range(len(start_probs)):
        s_scores, e_scores = start_probs[row], end_probs[row]
        s_ranked, e_ranked = start_order[row], end_order[row]

        candidates = []
        chosen = False
        step = 0
        while not chosen:
            new_end = e_ranked[step]
            new_start = s_ranked[step]

            # Pair the newly admitted end with the starts ranked 0..step ...
            for j in range(step + 1):
                candidates.append((s_scores[s_ranked[j]] + e_scores[new_end], s_ranked[j], new_end))
            # ... and the newly admitted start with the ends ranked 0..step-1.
            for j in range(step):
                candidates.append((s_scores[new_start] + e_scores[e_ranked[j]], new_start, e_ranked[j]))
            candidates.sort()

            # No pair generated in a later step can score above this value.
            bound = max(s_scores[s_ranked[0]] + e_scores[new_end],
                        s_scores[new_start] + e_scores[e_ranked[0]])
            step += 1

            # Examine candidates from the best downwards while they are safe to decide on.
            while candidates and not chosen and candidates[-1][0] >= bound:
                _, span_start, span_end = candidates.pop()
                if span_start <= span_end <= span_start + distance:
                    chosen = (span_start, span_end)

        output_spans.append(chosen)
    return output_spans

In [None]:
def predict(model, dataloader, string):
    """Predict one answer string per example for a single slot.

    For every batch the start/end logits are turned into log-probabilities,
    the best legal span is chosen with select_best_answer_span(), and the
    tokens of that span are decoded back to text.

    Reads three notebook globals: ``device``, ``tokenizer`` and ``distance``
    (the maximum allowed offset of the span end from the span start).

    Args:
        model: model called as ``model(token_ids, attention_mask=masks)`` whose
            output exposes ``start_logits`` and ``end_logits``.
        dataloader: yields ``(token_ids, spans, attention_masks)`` batches; the
            gold spans are ignored here.
        string: slot name used as prefix of every answer.

    Returns:
        A list of strings of the form ``"<string>=<decoded span>"``, in the
        order the examples are delivered by ``dataloader``.
    """
    answers = []
    model.to(device)
    # Make sure dropout is disabled for inference; previously this relied on an
    # earlier evaluate() call having switched the model to evaluation mode.
    model.eval()
    with torch.no_grad():
        for test_text_batch, test_span_batch, masks in dataloader:
            test_text_batch, masks = test_text_batch.to(device), masks.to(device)
            output = model(test_text_batch, attention_mask=masks)

            start_scores = output.start_logits.to('cpu').detach()
            end_scores = output.end_logits.to('cpu').detach()
            start_probs = F.log_softmax(start_scores, dim=1).numpy()
            end_probs = F.log_softmax(end_scores, dim=1).numpy()

            spans = select_best_answer_span(start_probs, end_probs, distance)
            for i, (span_start, span_end) in enumerate(spans):
                # The span end is inclusive, hence the + 1 in the slice.
                answers.append(string + "=" + tokenizer.decode(test_text_batch[i, span_start: span_end + 1]))
    return answers

In [None]:
# predict for intents
# `vectorizer` and `clf` were already fitted on the training utterances in the
# "Intent prediction" section above. Fitting identical copies again here was
# redundant, so the fitted objects are reused.
x_dev = vectorizer.transform(dev_utt)
dev_pred = clf.predict(x_dev)

In [76]:
# Maximum number of token positions the predicted span end may lie after the
# predicted span start. predict() reads this as a global and passes it on to
# select_best_answer_span().
distance = 20

# predict for "food" slot
dev_food = predict(MODEL_food, dev_food_dataloader, string='food')
# predict for "name" slot
dev_name = predict(MODEL_name, dev_name_dataloader, string='name')
# pipeline_res() zips these three lists, so they must have the same length.
assert len(dev_food) == len(dev_name) == len(dev_pred)

In [85]:
def pipeline_res(intents, food_lst, name_lst, utts):
    '''
    Assemble the final answer string for every utterance.

    intents, food_lst, name_lst and utts are parallel sequences: the predicted intent,
    the predicted "food=..." and "name=..." answers, and the raw utterance.

    The pricerange, area, internet and parking slots come from the rule functions in `rules`;
    they and the name slot are prefixed with the domain taken from the intent (the part after
    the first '_'). A name or food answer that decodes to '[CLS]' is treated as "no answer"
    and dropped. Hotel type and stars are only looked up for the 'find_hotel' intent; food is
    only added for every other intent.

    Returns a list of strings "intent|slot|slot|..." with the slots sorted, or just "intent"
    when no slot was found.
    '''
    results = []
    for intent, food, name, utt in zip(intents, food_lst, name_lst, utts):
        # Rule-based slots that apply to every domain.
        shared_values = [
            rules.slot_pricerange(utt),
            rules.slot_area(utt),
            rules.slot_internet(utt),
            rules.slot_parking(utt),
        ]

        # Domain prefix derived from the intent name.
        prefix = intent.split('_')[1] + '-'

        slots = [prefix + value for value in shared_values if value]
        if name != 'name=[CLS]':
            slots.append(prefix + name)

        if intent == 'find_hotel':
            # Hotel-specific slots.
            for value in (rules.slot_hoteltype(utt), rules.slot_hotelstars(utt)):
                if value:
                    slots.append('hotel-' + value)
        elif food != 'food=[CLS]':
            # Restaurant-specific slot.
            slots.append('restaurant-' + food)

        results.append((intent + '|' + '|'.join(sorted(slots))) if slots else intent)

    return results

In [91]:
# check accuracy score for dev set
# Exact-match accuracy: a prediction counts only when the whole assembled
# answer string equals the gold answer line.
dev_res = pipeline_res(dev_pred, dev_food, dev_name, dev_utt)
accuracy_score(dev_ans, dev_res)

0.8062953995157385

### Evaluation on test set
*Predict intents and answers for their corresponding slots. Evaluate on test set by computing accuracy scores.*

In [93]:
# predict for intents
# Reuses the vectorizer and classifier fitted on the training utterances.
x_test = vectorizer.transform(test_utt)
test_pred = clf.predict(x_test)

In [94]:
# Maximum number of token positions the predicted span end may lie after the
# predicted span start; read as a global by predict().
distance = 20
# predict for "food" slot
test_food = predict(MODEL_food, test_food_dataloader, string='food')
# predict for "name" slot
test_name = predict(MODEL_name, test_name_dataloader, string='name')

In [101]:
# check accuracy score for test set
# Exact-match accuracy: a prediction counts only when the whole assembled
# answer string equals the gold answer line.
test_res = pipeline_res(test_pred, test_food, test_name, test_utt)
accuracy_score(test_ans, test_res)

0.8375