In [2]:
from platform import python_version
import torch
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
squad_dataset = load_dataset('squad')
print(squad_dataset['train'][0])


{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}


In [4]:
# tokenizing context with only space.

def preprocess(example):
    
    out = {}

    context_token = example['context'].strip().split(' ')
    question_token = example['question'].strip().split(' ')
    
    out['context'] = context_token
    out['question'] = question_token
    
    if 'answers' not in example:
        return out
    
    answer_start = example['answers']['answer_start']
    out['answers'] = []
    for i, ans_st in enumerate(answer_start):
        c_token_len = len(example['context'][:ans_st].strip().split(' '))
        a_token_len = len(example['answers']['text'][i].strip().split(' '))
        out['answers'].append({'start' : c_token_len, 'end' : c_token_len + a_token_len})
    
    return out

# tokenizing conext with space and symbolic letter.

def tokenizing1(text):
    text = text.lower()
    tokens = []
    tokens_start = []
    tokens_len = []
    where_space = []
    i, token_start, token_len = 0, 0, 0
    
    while i < len(text):
        # is character alphbet?
        if text[i].isalpha(): # alphabet
            while True:
                token_len += 1
                if i + token_len >= len(text) or not text[i + token_len].isalpha():
                    break
            tokens.append(text[i:i+token_len])
            tokens_len.append(token_len)
            tokens_start.append(i)
            i += token_len
            token_len = 0
        elif text[i] == ' ': # space
            where_space.append(i)
            i += 1
        else: # symbolic char
            tokens.append(text[i])
            tokens_len.append(1)
            tokens_start.append(i)
            i += 1
    return tokens, tokens_start, tokens_len, where_space

def advanced_preprocess1(data):
    
    out = {}
    tokens, tokens_start, tokens_len, where_space = tokenizing1(data['context'])
    tokens_q, _, _, _ = tokenizing1(data['question'])
    
    answer_start = data['answers']['answer_start']
    
    out = {'context' : tokens, 'question' : tokens_q, 'tokens_start' : tokens_start, 'tokens_len' : tokens_len, 'where_space' : where_space}
    
    out['answers'] = []
    
    for i, ans in enumerate(answer_start):
        start_index = tokens_start.index(ans)
        tokens, _, _, _ = tokenizing1(data['answers']['text'][i])
        end_index = start_index + len(tokens)
        out['answers'].append({'start' : start_index, 'end' : end_index})
    return out

def token2string(context, tokens_start, tokens_len, tokens_answers, where_space, test = []):
    
    cont = ''

    for i, token in enumerate(context):
        if tokens_start[i] + tokens_len[i] in where_space:
            cont += (token + ' ')
        else:
            cont += token
    
    if len(test) > 0 :
            start_index = tokens_start[test[0]] if test[0] < len(context) else tokens_start[-1]
            end_index = tokens_start[test[1]] if test[1] < len(context) else tokens_start[-1]
    else:
        try:
            tokens_answers_start = tokens_answers['start']
            tokens_answers_end = tokens_answers['end']
            start_index = tokens_start[tokens_answers_start]
            end_index = tokens_start[tokens_answers_end-1] + tokens_len[tokens_answers_end -1]
        except:
             pass
    
    ans = cont[start_index : end_index]
    return cont, ans


In [5]:
# make dataloader 


def preprocessing(squad_dataset):
	train_data = []
	del_train_data_index = []
	count = 0
	for i, data in enumerate(squad_dataset['train']):
	    try:
	        train_data.append(advanced_preprocess1(data))
	    except:
	        train_data.append([])
	        del_train_data_index.append(i)
	        count+=1
	        

	valid_data = []
	del_valid_data_index = []
	count = 0
	for i, data in enumerate(squad_dataset['validation']):
	    try:
	        valid_data.append(advanced_preprocess1(data))
	    except:
	        valid_data.append([])
	        del_valid_data_index.append(i)
	        count += 1

	return train_data, del_train_data_index, valid_data, del_valid_data_index
	

In [6]:
train_data, del_train_data_index, valid_data, del_valid_data_index = preprocessing(squad_dataset)

In [7]:
# make batch . 

import numpy as np
import random
import torch
import torch.nn.functional as F

def make_batch(data, batch_size = 64, index = [], random = True, question = False):
    print(data)
    data = np.array(data)
    if random:
        indice = np.random.choice(len(data), batch_size, replace = False)
        for i, idx in enumerate(indice):
            if idx in del_train_data_index:
                indice[i] = 0
        data_batch = data[indice]
        
    else:
        for i,idx in enumerate(index):
            if idx in del_valid_data_index:
                index[i] = 0
        data_batch = data[index]

    context_max_len = 0
    question_max_len = 0
    
    for i in range(batch_size):
        if question:
            context_max_len = max(context_max_len, len(data_batch[i]['context']) + len(data_batch[i]['question']))
        else:
            context_max_len = max(context_max_len, len(data_batch[i]['context']))
    
    context_batch = []
    answer_start_batch = []
    answer_end_batch = []
    context_mask = []
    mask_loc = []
    
    for i, d in enumerate(data_batch):
        if d == []:
            continue
            
        context, questions, answer = wordToid(d)
        
        if question:
            context = np.concatenate([context, questions])
        
        context_len = len(context)
        context_padding = np.zeros(context_max_len - len(context))
        context = np.concatenate([context, context_padding])
        context_batch.append(context)
        
        for answers in data_batch[i]['answers']:
            answer_start_batch.append(answers['start'])
            answer_end_batch.append(answers['end'])
        
        context_mask.append(np.concatenate([np.ones(context_len), np.zeros(len(context_padding))], axis = 0))
        mask_loc.append(context_len)
        
    return torch.LongTensor(context_batch), torch.LongTensor(answer_start_batch), torch.LongTensor(answer_end_batch), torch.LongTensor(context_mask), torch.LongTensor(mask_loc)

In [8]:
import torch
import torch.nn as nn
import numpy as np

device = 'cuda:4' if torch.cuda.is_available() else 'cpu'

# word2id, id2word

def make_dict(train_data, del_train_data_index):

	word2id = {}
	tokens = []
	del_idx = []
	for i in range(len(train_data)):
	    if i in del_train_data_index:
	        continue
	    context_tokens = train_data[i]['context']
	    question_tokens = train_data[i]['question']
	    tokens.extend(context_tokens)
	    tokens.extend(question_tokens)
	    if not train_data[i]['answers']:
	        del_idx.extend([i])

	vocab = ['UNK'] + list(set(tokens))
	    
	word2id = {word : id_ for id_ , word in enumerate(vocab)}
	id2word = {id_ : word for word, id_ in word2id.items()}

	return word2id, id2word

def wordToid(data):
    context = data['context']
    question = data['question']
    
    context = [word2id[word] if word in word2id else 0 for word in context]
    question = [word2id[word] if word in word2id else 0 for word in question]
    
    answer = []
    
    for dic in data['answers']:
        start = dic['start']
        end = dic['end']
        answer.append(context[start:end])
    return context, question, answer



def valid_answers(data_index, index = []):
    data = valid_data[data_index]
    context = data['context']
    tokens_start = data['tokens_start']
    tokens_len = data['tokens_len']
    where_space = data['where_space']
    
    if index[-1] > len(context):
        index[-1] = len(context)
    
    ans_tokens = context[index[0]:index[1]]

    ans = ''
    for i, token in enumerate(ans_tokens):
        if tokens_start[index[0] + i] + tokens_len[index[0] + i] in where_space:
            ans += (token + ' ')
        else:
            ans += token
    
    return ans.strip().lower()
    
def id2word_answer(data_index, start, end):
    ### valid 비어있는지 확인하기 !!!
    
    data = valid_data[data_index] 
    origin_data = squad_dataset['validation'][data_index]
    
    ans = valid_answers(data_index, [start,end])
    
    for i, a in enumerate(origin_data['answers']['text']):
        origin_data['answers']['text'][i] = a.lower()
    
    prediction_ = {'prediction_text': ans, 'id': origin_data['id']}
    reference = {'answers' : origin_data['answers'], 'id': origin_data['id']}
    
    return prediction_, reference


In [9]:
# 1 - layer lstm with drop out using pytorch lstm
class LSTM_dropout(nn.Module):
    def __init__(self, voca_size, embed_dim, hidden_dim, dropout):
        super(LSTM_dropout, self).__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(voca_size, embed_dim)
        self.lstm = nn.LSTM(input_size = embed_dim, hidden_size = hidden_dim, num_layers = 1,
                            batch_first = True, dropout = dropout)
        self.relu = nn.ReLU()
        self.out_lin = nn.Linear(hidden_dim, 2)
        
        
    def forward(self, x, mask):
        embeded = self.embedding(x)
        batch_size = x.size(0)
        
        h_t, c_t = self.init_state(batch_size)
        
        outputs, (hidden, cell_state)= self.lstm(embeded, (h_t, c_t))
        
        mask = mask.unsqueeze(-1)
         # batch * seq_len * hidden_dim
        
        outputs = self.out_lin(outputs) # batch * seq_len * 2
        
        outputs_masked = outputs * mask
        
        out_start = outputs_masked[:,:,0]
        out_end = outputs_masked[:,:,1]
        
        return out_start, out_end
    
    def init_state(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_dim).to(device), torch.zeros(1, batch_size, self.hidden_dim).to(device)

In [10]:
from tqdm import tqdm

def train(data, model, criterion, optimizer, batch_size = 128, num_iter = 30000, question = False, attention = False):
    device = 'cuda:4' if torch.cuda.is_available() else 'cpu'

    model.to(device)
    model.train()
    loss_ = 0.
    acc_, start_acc, end_acc, val_score = 0, 0, 0, 0
    
    for i in tqdm(range(num_iter)):
        context, answer_start, answer_end, mask, loc = make_batch(data, batch_size, random = True, question = question)
        context, answer_start, answer_end, mask, loc = context.to(device), answer_start.to(device), answer_end.to(device), mask.to(device), loc.to(device)
#         context, answer_start, answer_end = context.to(device), answer_start.to(device), answer_end.to(device), #mask.to(device), loc.to(device) ###
        
        if attention:
            start, end = model(context, mask, loc)
        else:
            start, end = model(context, mask)

        
        loss_start = criterion(start, answer_start)
        loss_end = criterion(end, answer_end-1)
        
        loss = loss_start/2  + loss_end/2
        
        model.zero_grad()
        loss.backward()
        optimizer.step()

        # loss
        loss_ += loss.detach().cpu()
        
        # acc
        start_acc_ = (start.argmax(dim = -1) == answer_start).detach()
        end_acc_ = (end.argmax(dim = -1) == (answer_end -1)).detach()
        start_acc += start_acc_.sum().item()*1./batch_size
        end_acc += end_acc_.sum().item()*1./batch_size
        acc_ += (start_acc_ & end_acc_).sum().item()*1./batch_size
                
        if i % 1000 == 999 :
            print(f'{i + 1 : d}th iters >> loss = {loss_/1000:.4f}, start_acc = {start_acc/1000 * 100 : .4f}, end_acc = {end_acc/1000 * 100 : .4f}, acc = {acc_/1000 * 100 : .4f}')
            loss_, start_acc, end_acc, acc_ = 0., 0, 0, 0
            with torch.no_grad():
                model.eval()
                cur_val_score = validation(valid_data, model = model, batch_size = 128, question = question, attention = attention)
#                 if attention:
#                     if cur_val_score < val_score:
#                         optimizer.factor *= 0.8
#                 val_score = cur_val_score
                model.train()

    
    return model

In [11]:
def validation(data, model, batch_size = 128, question = False, attention = False):
    device = 'cuda:4' if torch.cuda.is_available() else 'cpu'
    predictions = []
    references = []


    for i in range(len(data)//batch_size):
        c, a_s, a_e, m, l= make_batch(data, batch_size = batch_size, index = np.arange(i*batch_size, (i+1)*batch_size), random = False, question = question)
        c, a_s, a_e, m, l = c.to(device), a_s.to(device), a_e.to(device), m.to(device), l.to(device)
        
        if attention:
            start, end = model(c,m,l)
        else:
            start, end = model(c, m) # batch_size * seq_len
        
        start_index = start.argmax(dim = 1).detach()
        end_index = end.argmax(dim = 1).detach()
        end_index += 1
        
        for j in range(batch_size):
            if i * batch_size + j in del_valid_data_index:
                continue
            if start_index[j] >= end_index[j]:
                continue
            pred, refer = id2word_answer(i * batch_size + j, start_index[j], end_index[j])
            predictions.append(pred)
            references.append(refer)

    results = squad_metric.compute(predictions = predictions, references = references)
    print('validation score : ', results)
    return results['f1']

In [12]:
# Define hyperparameters
vocab_size = (len(make_dict(train_data, del_train_data_index)[0]))
embed_dim = 100
hidden_dim = 128
dropout = 0.5
learning_rate = 0.001
batch_size = 64
num_epochs = 5

# Instantiate the model
model = LSTM_dropout(vocab_size, embed_dim, hidden_dim, dropout)

import torch.optim as optim
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Split data into training and validation sets
# train_data, val_data = train_test_split(train_data, test_size=0.1)

val_data = valid_data



In [None]:

# # Train the model
# for epoch in range(num_epochs):
#     model.train()
#     running_loss = 0.0
#     for i in tqdm(range(0, len(train_data), batch_size)):
#         batch_data = train_data[i:i+batch_size]
#         inputs = [torch.tensor(example['context']) for example in batch_data]
#         labels = [torch.tensor([example['start'], example['end']]) for example in batch_data]
#         inputs = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
#         labels = torch.stack(labels)
#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#     print(f"Epoch {epoch+1}, Loss: {running_loss}")

# # Evaluate the model on validation set
# model.eval()
# val_loss = 0.0
# with torch.no_grad():
#     for i in tqdm(range(0, len(val_data), batch_size)):
#         batch_data = val_data[i:i+batch_size]
#         inputs = [torch.tensor(example['context']) for example in batch_data]
#         labels = [torch.tensor([example['start'], example['end']]) for example in batch_data]
#         inputs = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
#         labels = torch.stack(labels)
#         outputs = model(inputs)
#         loss = criterion(outputs, labels)
#         val_loss += loss.item()
# print(f"Validation Loss: {val_loss}")


In [14]:
# Train the model
model = train(train_data, model, criterion, optimizer, batch_size=batch_size, num_iter=num_epochs, question=False)

# Evaluate the model on validation set
validation(valid_data, model=model, batch_size=batch_size, question=False)


  0%|          | 0/5 [00:04<?, ?it/s]