# BERT Regression 

In [66]:
import pandas as pd
import numpy as np
import random
import torch
import os

In [None]:
# TODO: persist the training config to a JSON file later.
# Hyper-parameters and file locations shared by every cell below.
CONFIG = {
    'seed': 12345,
    'pretrained_model': 'bert-base-uncased',
    'output_dir': '../models/bert_regression_original',
    'train_file': '../data/4th/v0/train.csv',
    'dev_file': '../data/4th/v0/dev.csv',
    'train_batch_size': 64,
    'dev_batch_size': 64,
    'lr': 5e-5,
    # A single output unit: the model predicts one continuous score.
    'num_classes': 1,
    # First CUDA device when one is available, otherwise the CPU.
    'device': torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
}

In [11]:
## Dataset
from torch.utils.data import Dataset
from transformers import AutoTokenizer



class RegressionDataset(Dataset):
    '''Toxic-comment dataset for BERT regression with an on-disk tensor cache.

    The CSV at ``file_path`` must provide a ``comment`` column (text) and a
    ``score`` column (regression target). Comments are tokenized once and the
    result is cached under ``dir_path`` as ``{mode}_X.pt`` / ``{mode}_Y.pt``.

    Args:
        tokenizer: callable invoked with the list of comments; must return a
            mapping from feature name to a list of equally long id lists.
        file_path: path of the CSV file to read.
        dir_path: directory holding the cache files (created if missing).
        mode: cache file prefix, e.g. ``'train'`` or ``'dev'``.
        force: when true, ignore any existing cache and rebuild it.
    '''
    def __init__(self, tokenizer: "AutoTokenizer", file_path, dir_path, mode, force=False) -> None:
        self.file_path = file_path
        self.dir_path = dir_path # output dir
        self.tokenizer = tokenizer
        self.mode = mode
        self.force = force
        self.inputs, self.labels = self.load_data()

    def _cache_paths(self):
        '''Return the (features, labels) cache file paths for this mode.'''
        return (os.path.join(self.dir_path, f"{self.mode}_X.pt"),
                os.path.join(self.dir_path, f"{self.mode}_Y.pt"))

    def load_data(self):
        '''Load features and labels from the cache, or build and cache them.

        Returns:
            A ``(encodings, labels)`` pair: ``encodings`` maps each feature
            name to its per-example values, ``labels`` is a 1-D float32 tensor.
        '''
        # makedirs also creates missing parent directories.
        os.makedirs(self.dir_path, exist_ok=True)

        x_path, y_path = self._cache_paths()
        # Both files are required; a half-written cache is rebuilt.
        cache_ready = os.path.isfile(x_path) and os.path.isfile(y_path)

        if cache_ready and not self.force:
            # Load the cached tensors.
            encodings = torch.load(x_path)
            labels = torch.load(y_path)
        else:
            # read csv file
            data = pd.read_csv(self.file_path)

            batch = self.tokenizer(text=data.comment.tolist(),
                                   add_special_tokens=True,
                                   padding='max_length',
                                   truncation=True,
                                   return_attention_mask=True)

            # Plain tensors (not tokenizer objects / numpy arrays) so that the
            # cache can be read back by torch.load with its safe defaults.
            encodings = {key: torch.tensor(val, dtype=torch.long) for key, val in batch.items()}
            labels = torch.tensor(data.score.to_numpy(), dtype=torch.float)

            # save the tensors (overwrites any previous cache)
            torch.save(encodings, x_path)
            torch.save(labels, y_path)

        # Caches written by older versions stored float64 numpy labels;
        # float32 is what MSELoss expects against float32 model outputs.
        labels = torch.as_tensor(labels, dtype=torch.float)
        return encodings, labels

    def __getitem__(self, idx):
        '''Return one example as a dict of long tensors plus a float ``labels`` entry.'''
        item = {key: torch.as_tensor(val[idx], dtype=torch.long) for key, val in self.inputs.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        '''Return the number of examples.'''
        return len(self.labels)
        


In [None]:
def set_seed(seed=12345):
    '''Seed every random number generator used in this notebook.

    Covers NumPy, the ``random`` module and PyTorch (CPU and the current CUDA
    device), switches cuDNN to deterministic mode, and exports
    ``PYTHONHASHSEED`` so that repeated runs are REPRODUCIBLE.
    '''
    # Fixed value for the hash seed.
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for seeder in seeders:
        seeder(seed)

    # The cuDNN backend needs two extra switches to behave deterministically.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import get_linear_schedule_with_warmup
# transformers' own AdamW is deprecated in favour of the PyTorch optimizer.
from torch.optim import AdamW
# The class is spelled `DataLoader`; `Dataloader` raises ImportError.
from torch.utils.data import DataLoader
from torch import nn

set_seed(CONFIG['seed'])

# Tokenize (or load from cache) the train and dev splits.
tokenizer = AutoTokenizer.from_pretrained(CONFIG['pretrained_model'])
train_dataset = RegressionDataset(tokenizer=tokenizer, file_path=CONFIG['train_file'], dir_path=CONFIG['output_dir'], mode='train')
dev_dataset = RegressionDataset(tokenizer=tokenizer, file_path=CONFIG['dev_file'], dir_path=CONFIG['output_dir'], mode='dev')
train_dataloader = DataLoader(train_dataset, batch_size=CONFIG['train_batch_size'], shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=CONFIG['dev_batch_size'], shuffle=False)

# num_labels=1 gives the classification head a single regression output.
model = AutoModelForSequenceClassification.from_pretrained(CONFIG['pretrained_model'], num_labels=CONFIG['num_classes'])

model.to(CONFIG['device'])

# weight_decay=0.0 keeps the default of the transformers optimizer this
# replaces (torch.optim.AdamW defaults to 0.01).
optimizer = AdamW(model.parameters(),
                  lr=CONFIG['lr'],
                  eps=1e-8,
                  weight_decay=0.0)

epochs = 5
# One scheduler step per batch, so the linear decay spans the whole training.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

loss_function = nn.MSELoss()



In [None]:
from torch.nn.utils.clip_grad import clip_grad_norm



In [63]:
def clean(data, col):
    '''Normalize noisy text in ``data[col]`` and strip identifier-like tokens.

    The substitutions run in a fixed order: punctuation between two words is
    spaced out, runs of repeated characters are shortened, whitespace is
    collapsed, and finally IBAN-, e-mail-, URL-, dotted-number- and
    phone-number-like tokens as well as double quotes are removed.

    Args:
        data: DataFrame holding the text column; it is modified in place.
        col: name of the string column to clean.

    Returns:
        The same ``data`` object, with ``data[col]`` cleaned.
    '''
    import re

    # filter ibans (international bank account format)
    # filter email
    # filter websites
    # filter dotted numbers such as IP addresses
    # filter phone number
    # quotation marks
    # Every fragment is a raw string: in a plain string `\b` is a backspace
    # character, which kept the website alternative from ever matching.
    pattern = r'(fr\d{2}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{2}|fr\d{20}|fr[ ]\d{2}[ ]\d{3}[ ]\d{3}[ ]\d{3}[ ]\d{5})|' \
              r'((?:(?!.*?[.]{2})[a-zA-Z0-9](?:[a-zA-Z0-9.+!%-]{1,64}|)|\"[a-zA-Z0-9.+!% -]{1,64}\")@[a-zA-Z0-9][a-zA-Z0-9.-]+(.[a-z]{2,}|.[0-9]{1,}))|' \
              r'((https?:\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*))|' \
              r'([0-9]+.[0-9]+.[0-9]+.[0-9]+)|' \
              r'((?:(?:\+|00)33[\s.-]{0,3}(?:\(0\)[\s.-]{0,3})?|0)[1-9](?:(?:[\s.-]?\d{2}){4}|\d{2}(?:[\s.-]?\d{3}){2})|(\d{2}[ ]\d{2}[ ]\d{3}[ ]\d{3}))|' \
              r'\"'

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        # Put spaces around punctuation glued between two words.
        (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3'),
        # Shorten runs of four or more identical symbols to three.
        (r'([*!?\'])\1\1{2,}', r'\1\1\1'),
        # Add space around runs of these symbols.
        (r'([*!?\']+)', r' \1 '),
        # A letter repeated 3+ times at the end of a word becomes two.
        (r'([a-zA-Z])\1{2,}\b', r'\1\1'),
        # A letter repeated 4+ times inside a word becomes three.
        (r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1'),
        # Collapse runs of spaces and replace newlines with one space.
        (r'[ ]{2,}|\n', ' '),
        (pattern, ''),
    )

    text = data[col]
    for regex_source, replacement in substitutions:
        # regex=True is required: pandas >= 2.0 treats the pattern as a
        # literal string by default, which would leave the text untouched.
        text = text.str.replace(re.compile(regex_source), replacement, regex=True)
    data[col] = text

    return data