In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics import f1_score
import torch

In [None]:
import random
from tqdm import tqdm

# 設定超參數

In [None]:
def random_seed(seed = 1337):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's legacy global generator, and PyTorch
    (the default generator plus all CUDA devices) with the same value.

    Args:
        seed: integer seed shared by all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [None]:
# Hyper-parameters shared by the cells below.
max_length = 400    # length (in tokens) every encoded prompt/utterance pair is padded or truncated to
batch_size = 20     # batch size of the train / validation / test dataloaders
epochs = 4          # number of training epochs; also sets the scheduler's total step count
warmup_epochs = 0   # epochs' worth of scheduler warm-up steps (0 = no warm-up)
save_model = 1      # write a checkpoint every `save_model` epochs; 0 disables checkpointing
lr = 3e-5           # learning rate passed to AdamW
seed = 1337         # seed handed to random_seed()

In [None]:
# Seed Python, NumPy and PyTorch before any shuffling or model initialization takes place.
random_seed(seed)

# 匯入資料

In [None]:
# os.getcwd()
# os.chdir("C:\\Users\\GL75\\OneDrive\\桌面\\自然語言處理\\期末project\\data")

In [None]:
# Load the train / validation / test splits (paths are relative to the notebook's working directory).
# Columns used later in this notebook: conv_id, utterance_idx, prompt, utterance and
# (train / validation only) label.
df_train = pd.read_csv('../input/nlp-class-fixed-data/fixed_train.csv')
df_valid = pd.read_csv('../input/nlp-class-fixed-data/fixed_valid.csv')
df_test = pd.read_csv("../input/nlp-class-fixed-data/fixed_test.csv")

# 資料清理與合併

In [None]:
import nltk
# Fetch the NLTK resources needed by the helpers used below
# (stopword list, tokenizer, WordNet lemmatizer, POS tagger).
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")

In [None]:
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

In [None]:
from collections import defaultdict

In [None]:
# English stopwords as a set (fast membership tests); nltk_process() drops these words from the text.
stop = set(stopwords.words("english"))

In [None]:
# Shared WordNet lemmatizer instance, used by nltk_process() to lemmatize every token.
lemmatizer = WordNetLemmatizer()

In [None]:
# Map the first letter of a POS tag (looked up as tag_map[tag[0]] in nltk_process) to the WordNet
# part-of-speech constant expected by the lemmatizer. Any letter other than J / V / R falls back to NOUN.
tag_map = defaultdict(lambda: wordnet.NOUN)
tag_map['J'] = wordnet.ADJ
tag_map['V'] = wordnet.VERB
tag_map['R'] = wordnet.ADV

In [None]:
def nltk_process(text):
    """Lemmatize a sentence and strip its stopwords.

    The text is tokenized, every token is lemmatized according to its POS tag
    (via `tag_map`), lemmas found in the `stop` set are dropped, and the
    remaining lemmas are joined back together with single spaces.

    Args:
        text: the sentence to process (a string).

    Returns:
        The cleaned sentence as a single space-separated string.
    """
    # lemmatize each token based on its POS tag
    lemmas = [
        lemmatizer.lemmatize(token, pos = tag_map[tag[0]])
        for token, tag in pos_tag(word_tokenize(text))
    ]
    # Build a new list instead of calling list.remove() while iterating: removing in place
    # shifts the remaining items, so the word right after each removed stopword was skipped
    # (consecutive stopwords survived) and remove() deleted the first equal word, not the current one.
    kept = [lemma for lemma in lemmas if lemma.lower() not in stop]
    # concatenate all word tokens left into a sentence again
    return ' '.join(kept)

In [None]:
def clean(text):
    """Fully clean a Series of utterances.

    For every string: remove the "_comma_" placeholder, dots, the ":(" emoticon,
    question marks, exclamation marks and double quotes; trim surrounding
    whitespace; lowercase; then run `nltk_process` (lemmatization + stopword removal).

    Args:
        text: pandas Series of strings.

    Returns:
        A new Series with the cleaned strings.
    """
    # fragments deleted from each string, in this order
    fragments = ("_comma_", ".", ":(", "?", "!", '"')

    def _scrub(raw):
        for fragment in fragments:
            raw = raw.replace(fragment, "")
        return raw.strip().lower()

    scrubbed = text.apply(_scrub)
    return scrubbed.apply(nltk_process)

In [None]:
def clean_alittle(text):
    """Lightly clean a Series of prompts (punctuation only, no NLTK processing).

    For every string: remove the "_comma_" placeholder, double dots, the ":("
    emoticon, question marks and exclamation marks; trim surrounding whitespace;
    lowercase. Single dots and quotes are left untouched.

    Args:
        text: pandas Series of strings.

    Returns:
        A new Series with the cleaned strings.
    """
    # fragments deleted from each string, in this order
    fragments = ("_comma_", "..", ":(", "?", "!")

    def _scrub(raw):
        for fragment in fragments:
            raw = raw.replace(fragment, "")
        return raw.strip().lower()

    return text.apply(_scrub)

In [None]:
# Utterances get the full pipeline (punctuation removal, lemmatization, stopword removal).
# Prompts only have their punctuation cleaned, because they carry important information.
for frame in (df_train, df_valid, df_test):
    frame['utterance_cleaned'] = clean(frame['utterance'])
    frame['prompt_cleaned'] = clean_alittle(frame['prompt'])

In [None]:
# Drop the raw text columns and utterance_idx (in place) so only the cleaned
# utterances and prompts remain alongside the other columns.
for frame in (df_train, df_valid, df_test):
    frame.drop(columns = ['utterance_idx', 'prompt', 'utterance'], inplace = True)

In [None]:
# Collapse each conversation (one row per utterance) into a single row indexed by "conv_id":
#   - label and prompt_cleaned: keep the value of the conversation's first row
#   - utterance_cleaned: join all cleaned utterances of the conversation with spaces, in row order
# The test frame is aggregated the same way but without a label.
# NOTE(review): the join assumes rows are already ordered by utterance_idx within each conversation — confirm against the input CSVs.
df_train_gby = df_train.groupby(by = ['conv_id'], as_index = True).agg({'label': 'first', 'utterance_cleaned': ' '.join, 'prompt_cleaned': 'first'})
df_valid_gby = df_valid.groupby(by = ['conv_id'], as_index = True).agg({'label': 'first', 'utterance_cleaned': ' '.join, 'prompt_cleaned': 'first'})
df_test_gby = df_test.groupby(by = ['conv_id'], as_index = True).agg({'utterance_cleaned': ' '.join, 'prompt_cleaned': 'first'})

In [None]:
# Concatenate each conversation's prompt and joined utterances into one string: "<prompt><SEP><utterances>".
# The literal "<SEP>" is only a delimiter; data_preprocessing() splits on it to recover the two parts.
df_train_gby['merged'] = df_train_gby.prompt_cleaned.str.cat(df_train_gby.utterance_cleaned, sep = '<SEP>')
df_valid_gby['merged'] = df_valid_gby.prompt_cleaned.str.cat(df_valid_gby.utterance_cleaned, sep = '<SEP>')
df_test_gby['merged'] = df_test_gby.prompt_cleaned.str.cat(df_test_gby.utterance_cleaned, sep = '<SEP>')

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# torch.cuda.get_device_name() raises on a CPU-only machine, so only query it
# when a CUDA device was actually selected.
if device.type == "cuda":
    print("Device:", torch.cuda.get_device_name(0))
else:
    print("Device:", device)

In [None]:
from transformers import XLNetTokenizer

# Load the pretrained tokenizer of the 'xlnet-base-cased' checkpoint (the same checkpoint name the model uses).
# NOTE(review): do_lower_case = True is combined with a cased checkpoint; the text is already lowercased
# by clean() / clean_alittle(), so confirm this flag is intentional.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case = True)

In [None]:
def data_preprocessing(corpus, max_length = max_length):
    """Tokenize "<prompt><SEP><utterances>" strings into model-ready tensors.

    Each string is split on "<SEP>"; the first part (prompt) and second part
    (utterances) are encoded as a sentence pair, padded to `max_length`, and
    only the second part is truncated when the pair is too long.

    Args:
        corpus: iterable of merged strings containing the "<SEP>" delimiter.
        max_length: padded / truncated length of every encoded pair.

    Returns:
        (input_ids, token_type_ids, attention_masks): three torch tensors built
        from the tokenizer's 'input_ids', 'token_type_ids' and 'attention_mask'
        outputs, one row per string in `corpus`.
    """
    encodings = []
    for merged in corpus:
        # parts[0] is the prompt, parts[1] the concatenated utterances
        parts = merged.split("<SEP>")
        encodings.append(
            tokenizer(
                parts[0],
                parts[1],
                add_special_tokens = True,
                padding = 'max_length',
                max_length = max_length,
                truncation = 'only_second',
            )
        )

    # gather each field across all encodings and convert the nested lists to tensors
    input_ids = torch.tensor([enc.get('input_ids') for enc in encodings])
    token_type_ids = torch.tensor([enc.get('token_type_ids') for enc in encodings])
    attention_masks = torch.tensor([enc.get('attention_mask') for enc in encodings])

    return input_ids, token_type_ids, attention_masks

In [None]:
# Encode the 'merged' column of every split (each entry holds the prompt and the utterances, separated by <SEP>).
# Each call returns three tensors: token ids, token type ids and attention masks.
train_tokens, train_types, train_masks = data_preprocessing(df_train_gby['merged'])
valid_tokens, valid_types, valid_masks = data_preprocessing(df_valid_gby['merged'])
test_tokens, test_types, test_masks = data_preprocessing(df_test_gby['merged'])

In [None]:
# Convert the labels to tensors as well. Pass plain NumPy arrays rather than the Series themselves:
# torch.tensor() walks a Series as a generic sequence using integer keys, which on these
# conv_id-indexed Series depends on pandas' deprecated positional fallback for series[int].
train_labels = torch.tensor(df_train_gby['label'].to_numpy())
valid_labels = torch.tensor(df_valid_gby['label'].to_numpy())

# 建立Dataloader

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
# create dataset and dataloader

# train: bundle the token / type / mask / label tensors row-wise and draw batches in random order
train_dataset = TensorDataset(train_tokens, train_types, train_masks, train_labels)
train_sampler = RandomSampler(train_dataset) # shuffle the training samples
train_dataloader = DataLoader(train_dataset, sampler = train_sampler, batch_size = batch_size)

In [None]:
# validation: same tensor layout as the training set, iterated in its original order
valid_dataset = TensorDataset(valid_tokens, valid_types, valid_masks, valid_labels)
valid_sampler = SequentialSampler(valid_dataset) # no need to shuffle for validation
valid_dataloader = DataLoader(valid_dataset, sampler = valid_sampler, batch_size = batch_size)

In [None]:
# test: no labels, so each batch holds only (tokens, token types, attention masks)
test_dataset = TensorDataset(test_tokens, test_types, test_masks)
test_sampler = SequentialSampler(test_dataset) # keep the original order so predictions line up with the rows of df_test_gby
test_dataloader = DataLoader(test_dataset, sampler = test_sampler, batch_size = batch_size)

# 模型、優化器和學習率調整

In [None]:
# Use AdamW, an improved version of Adam, which generally yields better training loss and generalization ability.
# https://towardsdatascience.com/why-adamw-matters-736223f31b5d
# Also, we use cosine lr scheduler

from transformers import XLNetForSequenceClassification # we use XLNet for sequence classification
from transformers import get_cosine_schedule_with_warmup
from torch.optim import AdamW

In [None]:
def initialization():
    """Create a fresh model, optimizer and learning-rate scheduler.

    Reads the notebook-level `train_dataloader`, `epochs`, `warmup_epochs`,
    `lr` and `device` defined in earlier cells.

    Returns:
        (model, optimizer, scheduler): an XLNetForSequenceClassification with
        32 output labels moved to `device`, an AdamW optimizer over its
        parameters, and a cosine schedule (with optional warm-up) spanning
        all training steps.
    """
    # schedule lengths, measured in optimizer steps (one step per training batch)
    batches_per_epoch = len(train_dataloader)
    total_steps = batches_per_epoch * epochs
    # no warm-up by default (warmup_epochs = 0), since only a few epochs are trained
    warmup_steps = batches_per_epoch * warmup_epochs

    # pretrained XLNet with a 32-way classification head, moved to the selected device
    net = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels = 32)
    net.to(device)

    adamw = AdamW(net.parameters(), lr = lr, eps = 1e-8)
    cosine = get_cosine_schedule_with_warmup(
        adamw,
        num_warmup_steps = warmup_steps,
        num_training_steps = total_steps,
    )

    return net, adamw, cosine

In [None]:
# Build the model, optimizer and scheduler; train() reads `optimizer` and `scheduler` as notebook-level names.
model, optimizer, scheduler = initialization()

# 損失函數(分類用XLNet已經包含，所以沒用到了)

In [None]:
# we use crossentropy loss, since multiclass classification

# loss_fn = nn.CrossEntropyLoss()
# We don't need this when using XLNetForSequenceClassification, since the model already computes the loss itself when labels are passed.

# 驗證步驟的函數 (訓練函數會用到，所以先定義)

In [None]:
def validate(model, valid_dataloader):
    """Evaluate `model` on a labelled dataloader.

    Uses the notebook-level `device`. The model is switched to evaluation mode
    and is left in that mode on return.

    Args:
        model: network called as model(input_ids=..., token_type_ids=...,
            attention_mask=..., labels=...); element 0 of its output is used as
            the loss and element 1 as the per-class scores.
        valid_dataloader: iterable of (tokens, token types, attention masks,
            labels) tensor batches.

    Returns:
        (loss, f1): the sample-weighted mean loss and the macro F1 score in
        percent (0-100), both computed over the whole dataloader.
        (nan, nan) when the dataloader yields no samples.
    """
    model.eval()

    all_labels = []
    all_predictions = []
    loss_sum = 0.0
    n_samples = 0

    # no gradients are needed for evaluation
    with torch.no_grad():
        for batch in valid_dataloader:
            # move the batch to the selected device
            tokens_batched, types_batched, masks_batched, labels_batched = tuple(t.to(device) for t in batch)

            outputs = model(input_ids = tokens_batched, token_type_ids = types_batched, attention_mask = masks_batched, labels = labels_batched)
            loss, scores = outputs[0], outputs[1]

            # the label with the maximum score is the prediction
            _, predicted_labels = torch.max(scores, dim = 1)

            # Weight the batch loss by the batch size so a short final batch does not count as
            # much as a full one (assumes the model returns the batch-mean loss — TODO confirm).
            n_batch = labels_batched.size(0)
            loss_sum += loss.item() * n_batch
            n_samples += n_batch

            all_labels.append(labels_batched.cpu().numpy())
            all_predictions.append(predicted_labels.cpu().numpy())

    if n_samples == 0:
        return float('nan'), float('nan')

    # Macro F1 must be computed once over the full set: averaging per-batch macro F1 scores
    # is a different (and much noisier) quantity, because most classes are absent from any single batch.
    macro_f1 = f1_score(np.concatenate(all_labels), np.concatenate(all_predictions), average = 'macro') * 100

    return loss_sum / n_samples, macro_f1

# 測試步驟的函數

In [None]:
import torch.nn.functional as F
import torch.nn as nn

In [None]:
def predict_test(model, test_dataloader):
    """Predict a class index for every sample of an unlabelled dataloader.

    Uses the notebook-level `device`. The model is switched to evaluation mode
    and is left in that mode on return.

    Args:
        model: network called as model(input_ids=..., token_type_ids=...,
            attention_mask=...); element 0 of its output is used as the
            per-class scores.
        test_dataloader: iterable of (tokens, token types, attention masks)
            tensor batches.

    Returns:
        1-D int64 NumPy array of predicted labels, in dataloader order
        (empty when the dataloader yields no batches).
    """
    model.eval()

    # Collect the per-batch predictions and concatenate once at the end; concatenating
    # inside the loop re-copies the growing array on every batch. The empty int64 seed
    # keeps the result well-defined (and int64) for an empty dataloader.
    batch_predictions = [np.array([], dtype = np.int64)]

    # no gradients are needed for inference
    with torch.no_grad():
        for batch in test_dataloader:
            # move the batch to the selected device
            tokens_batched, types_batched, masks_batched = tuple(t.to(device) for t in batch)

            outputs = model(input_ids = tokens_batched, token_type_ids = types_batched, attention_mask = masks_batched)

            # Softmax is monotonic, so the arg-max of the raw scores is already the
            # predicted label; no normalization is needed (same approach as validate()).
            _, predicted_labels = torch.max(outputs[0], dim = 1)

            batch_predictions.append(predicted_labels.cpu().numpy())

    return np.concatenate(batch_predictions)

# 訓練步驟的函數

In [None]:
def train(model, train_dataloader, valid_dataloader = None, validation = True):
    """Fine-tune `model` for `epochs` epochs, optionally validating after each one.

    Relies on the notebook-level `optimizer`, `scheduler`, `device`, `epochs`
    and `save_model` defined in earlier cells.

    Args:
        model: network called as model(input_ids=..., token_type_ids=...,
            attention_mask=..., labels=...); element 0 of its output is used as
            the loss and element 1 as the per-class scores.
        train_dataloader: iterable of (tokens, token types, attention masks,
            labels) tensor batches.
        valid_dataloader: dataloader handed to validate() after every epoch;
            required when `validation` is True.
        validation: whether to run validate() and print the metrics table
            after every epoch.

    Raises:
        ValueError: if validation is requested without a valid_dataloader.

    Side effects:
        Updates the model in place, prints progress, and (when `save_model` is
        non-zero) writes "./NLP_model_params_epoch{N}.pth" every `save_model` epochs.
    """
    # fail before any training happens rather than after the first full epoch
    if validation and valid_dataloader is None:
        raise ValueError("validation = True requires a valid_dataloader")

    print("Start training...\n")

    for e in range(epochs):
        # ---- training ----
        print("Epoch {}:\n".format(e+1))
        print("-" * 70)

        # per-epoch tracking: sample-weighted loss sum plus every label / prediction seen
        loss_sum = 0.0
        n_samples = 0
        epoch_labels = []
        epoch_predictions = []

        # set the model to training mode (validate() leaves it in evaluation mode)
        model.train()

        progress = tqdm(total = len(train_dataloader))
        try:
            for batch in train_dataloader:
                # load the batch to the selected device
                tokens_batched, types_batched, masks_batched, labels_batched = tuple(t.to(device) for t in batch)

                # clear the gradients left over from the previous step
                optimizer.zero_grad()

                # forward pass: the labels are passed too, so the model returns the loss as well
                outputs = model(input_ids = tokens_batched, token_type_ids = types_batched, attention_mask = masks_batched, labels = labels_batched)
                loss, scores = outputs[0], outputs[1]

                # the label with the maximum score is the prediction
                _, predicted_labels = torch.max(scores, dim = 1)

                # weight the batch loss by its size so a short final batch does not count as
                # much as a full one (assumes the model returns the batch-mean loss — TODO confirm)
                n_batch = labels_batched.size(0)
                loss_sum += loss.item() * n_batch
                n_samples += n_batch
                epoch_labels.append(labels_batched.cpu().detach().numpy())
                epoch_predictions.append(predicted_labels.cpu().detach().numpy())

                # backward propagation
                loss.backward()

                # clip the gradient norm to at most 1.0 to guard against exploding gradients
                nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                # update parameters by optimizer and lr by scheduler
                optimizer.step()
                scheduler.step()

                progress.update(1)
        finally:
            # always release the progress bar, even when a batch raises
            progress.close()

        # Epoch-level metrics. Macro F1 is computed once over every prediction of the epoch:
        # the mean of per-batch macro F1 scores is a different, much noisier quantity because
        # most classes are absent from any single batch.
        if n_samples:
            average_loss = loss_sum / n_samples
            average_f1 = f1_score(np.concatenate(epoch_labels), np.concatenate(epoch_predictions), average = 'macro') * 100
        else:
            average_loss = float('nan')
            average_f1 = float('nan')

        # save model every few epochs if specified.
        if save_model and (e+1) % save_model == 0:
            torch.save(model.state_dict(), "./NLP_model_params_epoch{}.pth".format(e+1))

        # ---- validation ----
        if validation:
            # after each epoch, measure model's val loss and F1 score.
            val_loss, val_f1 = validate(model, valid_dataloader)

            print("-" * 70)
            print(f"{'Train Loss':^12} | {'Train F1':^10} | {'Val Loss':^10} | {'Val F1':^8}")
            print(f"{average_loss:^12.6f} | {average_f1:^10.2f} | {val_loss:^10.6f} | {val_f1:^8.2f}")
            print("-" * 70)
        print("\n")

    print("Training Complete!")
In [None]:
# Train the model; validation runs after every epoch (validation defaults to True).
train(model, train_dataloader, valid_dataloader)

# 儲存模型

In [None]:
# Optional manual save of the final weights (train() already writes per-epoch checkpoints when save_model is non-zero):
# torch.save(model.state_dict(), "NLP_model_params.pth")

'''if you want to load the model parameters you stored, use this! But make sure that your model structure and hyper-parameters settings look exactly the same.'''
# Rebuild an identical model first, then load the stored weights into it:
# model, _, _ = initialization()
# model.load_state_dict(torch.load('../input/parameter/1_63best_400_20_4.pth'))

# 預測測試資料、輸出結果

In [None]:
# Predict a label for every conversation of the test set (one prediction per row of df_test_gby, in the same order).
predicted_results = predict_test(model, test_dataloader)

In [None]:
# Display the raw array of predicted labels.
predicted_results

In [None]:
# One prediction per conversation: wrap the predictions in a 'pred' column indexed by the
# conv_id index of the grouped test frame (the sequential test dataloader preserves that order).
result_df = pd.DataFrame({'pred': predicted_results}, index = df_test_gby.index)

In [None]:
# Preview the first conversation-level predictions.
result_df.head()

In [None]:
# Broadcast the conversation-level predictions back to the utterance-level test frame:
# the "conv_id" column of df_test is matched against the "conv_id" index of result_df,
# so every utterance row receives the prediction of its conversation.
output_df = pd.merge(df_test, result_df, on = 'conv_id')

In [None]:
# Drop the key and text columns (in place) so that only the predictions remain.
output_df.drop(['conv_id', 'utterance_cleaned', 'prompt_cleaned'], axis = 1, inplace = True)

In [None]:
# Preview the frame that is about to be written to disk.
output_df.head()

In [None]:
# Write the predictions to a CSV in the working directory.
# NOTE(review): the row index is written as the first (unnamed) column, since index = False is not passed — confirm this matches the expected submission format.
output_df.to_csv("./predicted_results_xlnet.csv")