코드 베이스 : [Fine Tuning Roberta for Sentiment Analysis](https://colab.research.google.com/drive/1AZ3WtoFbM845TxZqePU7_L3ysNPF8Kaa#scrollTo=979OUro5Eac3&uniqifier=1)

In [None]:
# Importing the libraries needed
import torch
import random
import numpy as np
import transformers

from torch import nn

SEED = 42

# Seed every random number generator used in this notebook.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN for deterministic kernels and switch off its autotuner; with the
# autotuner on, the algorithm picked for an operation may differ between runs.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
from torch import cuda

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
import pandas as pd

# Read the three CSV files from the working directory, in this order.
train, test, submit = (
    pd.read_csv(csv_path)
    for csv_path in ('./train.csv', './test.csv', './sample_submission.csv')
)

# Plain Python list holding the values of the test frame's Utterance column.
test_list = test["Utterance"].values.tolist()

In [None]:
# Show the first rows of the training frame.
train.head()

In [None]:
# List the distinct values found in the Target column.
train['Target'].unique()

In [None]:
# Look at the first test utterance.
test_list[0]

In [None]:
# Encode each emotion name as an integer class id (its position in this list).
emotion_names = ['neutral', 'surprise', 'fear', 'sadness', 'joy', 'disgust', 'anger']
label_to_id = {name: class_id for class_id, name in enumerate(emotion_names)}
train['Target'] = train['Target'].replace(label_to_id)

In [None]:
# Keep only the text column and the label column.
new_df = train.loc[:, ['Utterance', 'Target']]

In [None]:
from transformers import RobertaModel, RobertaTokenizer
# Key settings used by the data pipeline and the training loop below.
MAX_LEN = 512  # length passed to the tokenizer as max_length
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
LEARNING_RATE = 1e-05

# Truncation is requested per encode call, not when loading the tokenizer,
# so no `truncation` argument is passed here.
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

In [None]:
from torch.utils.data import Dataset, DataLoader

class SentimentData(Dataset):
    """Dataset yielding tokenized utterances together with their class ids.

    Args:
        dataframe: Frame with an ``Utterance`` column (text) and a ``Target``
            column (numeric class id).
        tokenizer: Object exposing ``encode_plus``; presumably a Hugging Face
            tokenizer -- confirm against the caller.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Utterance
        self.targets = self.data.Target
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the ``ids``/``mask``/``token_type_ids``/``targets`` tensors for one row."""
        # Positional lookup: works even when the frame's index is not 0..n-1.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit strategies replace the deprecated pad_to_max_length
            # flag and make sure over-long utterances are cut to max_len.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            # Kept as float for compatibility; consumers cast to long before the loss.
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float),
        }

In [None]:
# Sample 80% of the rows for training; the remaining rows form the validation
# split. random_state pins the sample so the split is the same on every run.
train_size = 0.8
train_data = new_df.sample(frac=train_size, random_state=200)
# Rows are removed by index label, so this assumes new_df's index is unique.
valid_data = new_df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print(f"FULL Dataset: {new_df.shape}")
print(f"TRAIN Dataset: {train_data.shape}")
# This frame is the validation split, so it is labelled VALID rather than TEST.
print(f"VALID Dataset: {valid_data.shape}")

training_set = SentimentData(train_data, tokenizer, MAX_LEN)
valid_set = SentimentData(valid_data, tokenizer, MAX_LEN)

In [None]:
# Training batches are reshuffled on every pass over the loader.
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0,
}

# Validation only measures the model, so keep a fixed order: the aggregated
# metrics do not depend on it and a stable order makes runs comparable.
valid_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0,
}

training_loader = DataLoader(training_set, **train_params)
valid_loader = DataLoader(valid_set, **valid_params)

In [None]:
class RobertaClass(torch.nn.Module):
    """Pretrained ``roberta-large`` encoder with a small classification head.

    The head applies Linear(1024, 1024), ReLU and Dropout(0.25), then a final
    Linear(1024, 7) that produces one score per class.
    """

    def __init__(self):
        super().__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-large")
        self.pre_classifier = torch.nn.Linear(1024, 1024)
        self.dropout = torch.nn.Dropout(0.25)
        self.classifier = torch.nn.Linear(1024, 7)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the classifier output for a batch of encoded inputs."""
        encoder_out = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # First element of the encoder output, taken at index 0 along the
        # second dimension (presumably the first token -- confirm).
        first_position = encoder_out[0][:, 0]
        features = torch.relu(self.pre_classifier(first_position))
        return self.classifier(self.dropout(features))

In [None]:
# Build the classifier and move it to the selected device.
model = RobertaClass()
model.to(device)

In [None]:
# Adam over every model parameter, paired with a cross-entropy objective.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_function = torch.nn.CrossEntropyLoss()

In [None]:
def calcuate_accuracy(preds, targets):
    """Return how many entries of ``preds`` equal the matching entry of ``targets``."""
    matches = preds == targets
    return matches.sum().item()

In [None]:
# Training function: fine-tunes the RoBERTa model on the 80% training split

def train(epoch):
    """Run one pass over ``training_loader`` and print the epoch's metrics.

    Uses the module-level ``model``, ``training_loader``, ``loss_function``,
    ``optimizer`` and ``device``.

    Args:
        epoch: Epoch number; only used in the printed summary.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()

    # Wrap the loader itself (not enumerate) so tqdm can read its length and
    # show a total alongside the running count.
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # Predicted class = index of the largest score; detach() replaces the
        # legacy .data attribute for reading values outside autograd.
        big_idx = outputs.detach().argmax(dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [None]:
from tqdm import tqdm

# Number of full passes over the training loader.
EPOCHS = 10

# Each call trains for one epoch and prints that epoch's loss and accuracy.
for epoch in range(EPOCHS):
    train(epoch)

In [None]:
def valid(model, valid_loader):
    """Evaluate ``model`` on ``valid_loader`` and return its accuracy in percent.

    Uses the module-level ``loss_function`` and ``device``. Prints the mean
    per-batch loss and the accuracy before returning.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)``.
        valid_loader: Iterable of batches with ``ids``, ``mask``,
            ``token_type_ids`` and ``targets`` entries.

    Returns:
        Percentage of examples whose highest-scoring class equals the target.
    """
    model.eval()
    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0

    with torch.no_grad():
        # Wrap the loader itself so tqdm can show a total.
        for data in tqdm(valid_loader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)
            # No squeeze(): a final batch holding a single example must stay
            # 2-D, otherwise the loss call and the argmax over dim=1 fail.
            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_idx = outputs.argmax(dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples

    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu

In [None]:
# Score the trained model on the validation loader and report the result.
acc = valid(model, valid_loader)
print(f"Accuracy on valid data = {acc:0.2f}%")

In [None]:
class Test_SentimentData(Dataset):
    """Dataset yielding tokenized utterances without labels, for inference.

    Args:
        dataframe: Frame with an ``Utterance`` column (text).
        tokenizer: Object exposing ``encode_plus``; presumably a Hugging Face
            tokenizer -- confirm against the caller.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Utterance
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the ``ids``/``mask``/``token_type_ids`` tensors for one row."""
        # Positional lookup: works even when the frame's index is not 0..n-1.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit strategies replace the deprecated pad_to_max_length
            # flag and make sure over-long utterances are cut to max_len.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        }

In [None]:
# Give the dataset its own name instead of rebinding `test`: once `test` has
# been replaced by a Dataset, re-running this cell would pass that Dataset in
# where a dataframe is expected and fail.
test_set = Test_SentimentData(test, tokenizer, MAX_LEN)
# Order is preserved so predictions line up with the rows of the test frame.
test_dataloader = DataLoader(test_set, shuffle=False)

In [None]:
def inference(model, test_loader, device):
    """Predict a class id for every example produced by ``test_loader``.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)``.
        test_loader: Iterable of batches with ``ids``, ``mask`` and
            ``token_type_ids`` entries.
        device: Device the model and each batch are moved to.

    Returns:
        List of predicted class indices, in the order the loader yields them.
    """
    model.to(device)
    model.eval()

    test_predict = []

    # Gradients are not needed for prediction; disabling them avoids building
    # an autograd graph for every forward pass.
    with torch.no_grad():
        # Wrap the loader itself so tqdm can show a total.
        for data in tqdm(test_loader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            y_pred = model(ids, mask, token_type_ids)
            test_predict += y_pred.argmax(1).cpu().numpy().tolist()

    print('Done.')

    return test_predict

In [None]:
# Predict a class id for every row served by the test loader.
finetuned_roberta_pred = inference(model, test_dataloader, device)

In [None]:
# Replace each predicted class id with its emotion name, in place. Entries
# whose id is outside 0-6 are left untouched.
id_to_emotion = {
    0: 'neutral',
    1: 'surprise',
    2: 'fear',
    3: 'sadness',
    4: 'joy',
    5: 'disgust',
    6: 'anger',
}
for position, predicted in enumerate(finetuned_roberta_pred):
    class_id = int(predicted)
    if class_id in id_to_emotion:
        finetuned_roberta_pred[position] = id_to_emotion[class_id]

In [None]:
# Look at the first decoded prediction.
finetuned_roberta_pred[0]

In [None]:
# Store the decoded predictions in the submission frame's Target column.
submit['Target'] = finetuned_roberta_pred
# Show the first rows as a quick check.
submit.head()

In [None]:
# Write the submission file, leaving out the frame's index column.
submit.to_csv('./roberta_large_finetune.csv', index=False)