# BERT Regression 

In [28]:
import pandas as pd
import numpy as np
import random
import torch
import os

In [29]:
# TODO: persist this training config to a JSON file later.
# Central place for every hyper-parameter and path used by the notebook.
CONFIG = {
    'seed': 12345,
    'pretrained_model': 'bert-base-uncased',
    'output_dir': '../models/bert_regression_original',
    'train_file': '../data/4th/v0/train.csv',
    'dev_file': '../data/4th/v0/dev.csv',
    'train_batch_size': 128,
    'dev_batch_size': 64,
    'lr': 5e-5,
    'epochs': 5,
    # a single output unit: the model predicts one score per comment
    'num_classes': 1,
    # first CUDA device when one is visible, CPU otherwise
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
}

In [30]:
def set_seed(seed = 12345):
    '''Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and CUDA), switches
    cuDNN to deterministic mode and exports ``PYTHONHASHSEED`` so that
    repeated runs produce the same results.

    Args:
        seed: integer seed applied to all generators (default 12345).
    '''
    # Export the hash seed as a string, as environment values must be text.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The generators are independent of each other, so one loop seeds them all.
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_rng(seed)

    # cuDNN needs two extra switches: no auto-tuned kernel selection,
    # and deterministic algorithms only.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [4]:
## Dataset
from torch.utils.data import Dataset
from transformers import AutoTokenizer



class RegressionDataset(Dataset):
    '''Toxic-comment dataset for BERT regression, tokenized once and cached on disk.

    The CSV at ``file_path`` must provide a ``comment`` column (text) and a
    ``score`` column (regression target). All comments are tokenized in one
    call and the result is stored as ``{dir_path}/{mode}_X.pt`` (encodings)
    and ``{dir_path}/{mode}_Y.pt`` (labels) so later runs can skip tokenization.

    Args:
        tokenizer: callable tokenizer, e.g. the object returned by
            ``AutoTokenizer.from_pretrained``; called with a list of texts.
        file_path: path of the CSV file to read.
        dir_path: output directory that holds the cache files.
        mode: split name used as cache file prefix (e.g. ``'train'``, ``'dev'``).
        force: when True, ignore and overwrite any existing cache.
    '''
    def __init__(self, tokenizer: "AutoTokenizer", file_path, dir_path, mode, force=False) -> None:
        self.file_path = file_path
        self.dir_path = dir_path # output dir
        self.tokenizer = tokenizer
        self.mode = mode
        self.force = force
        self.inputs, self.labels = self.load_data()

    def load_data(self):
        '''Return ``(encodings, labels)`` from the cache, building the cache if needed.'''
        # makedirs creates missing parent directories too and tolerates an
        # already existing directory (os.mkdir does neither).
        os.makedirs(self.dir_path, exist_ok=True)

        x_path = os.path.join(self.dir_path, f"{self.mode}_X.pt")
        y_path = os.path.join(self.dir_path, f"{self.mode}_Y.pt")

        # The cache is only usable when BOTH files are present.
        cache_ready = os.path.isfile(x_path) and os.path.isfile(y_path)
        if cache_ready and not self.force:
            # NOTE(review): torch.load unpickles arbitrary objects; only load
            # caches this notebook wrote itself. Recent PyTorch releases may
            # need weights_only=False for these non-tensor payloads - confirm.
            encodings = torch.load(x_path)
            labels = torch.load(y_path)
            return encodings, labels

        # Drop whatever stale or partial cache is left before rebuilding it.
        for stale_path in (x_path, y_path):
            if os.path.isfile(stale_path):
                os.remove(stale_path)

        # read csv file
        data = pd.read_csv(self.file_path)
        encodings = self.tokenizer(text=data.comment.tolist(),
                                   padding='max_length',
                                   truncation=True)

        labels = data.score.to_numpy()

        # save the encodings and labels for the next run
        torch.save(encodings, x_path)
        torch.save(labels, y_path)

        return encodings, labels

    def __getitem__(self, idx):
        '''Return one sample as a dict of tensors: the tokenizer fields plus ``labels``.'''
        # self.inputs maps each field name to one sequence per comment.
        item = {key: torch.tensor(val[idx], dtype=torch.long) for key, val in self.inputs.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        '''Number of samples (one per label).'''
        return len(self.labels)
        


In [31]:
## Dataset
from torch.utils.data import Dataset
from transformers import AutoTokenizer



class RegressionDataset(Dataset):
    '''Toxic-comment dataset for BERT regression that tokenizes lazily.

    Only the CSV is held in memory; each comment is tokenized when it is
    requested, which avoids materializing every padded sequence up front.
    The CSV must provide a ``comment`` column (text) and a ``score`` column
    (regression target).

    Args:
        tokenizer: callable tokenizer, e.g. the object returned by
            ``AutoTokenizer.from_pretrained``; called with a single text.
        file_path: path of the CSV file to read.
        dir_path: output directory; stored but not used by this class.
        mode: split name; stored but not used by this class.
        force: stored but not used by this class.
    '''
    def __init__(self, tokenizer: "AutoTokenizer", file_path, dir_path, mode, force=False) -> None:
        self.file_path = file_path
        self.dir_path = dir_path # output dir
        self.tokenizer = tokenizer
        self.mode = mode
        self.force = force

        # read csv file
        self.data = pd.read_csv(self.file_path)
        self.labels = self.data.score.to_numpy()

    def __getitem__(self, idx):
        '''Tokenize comment ``idx`` and return it as a dict of tensors plus ``labels``.'''
        # .iloc keeps the lookup positional regardless of the frame's index.
        encodings = self.tokenizer(text=self.data.comment.iloc[idx],
                                   padding='max_length',
                                   truncation=True)

        # A single text yields one flat list per field, so the whole list is
        # the sample. Indexing it with idx would pick a single token id and
        # raise IndexError once idx exceeds the sequence length.
        item = {key: torch.tensor(val, dtype=torch.long) for key, val in encodings.items()}
        # NOTE(review): labels come from NumPy, so this is likely a float64
        # tensor; cast in the training loop if the loss expects float32.
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        '''Number of samples (one per label).'''
        return len(self.labels)
        


In [32]:
from torch.utils.data import DataLoader


# Seed all RNGs before any data is loaded so runs are repeatable.
set_seed(CONFIG['seed'])

tokenizer = AutoTokenizer.from_pretrained(CONFIG['pretrained_model'])

# Both splits share the tokenizer and output directory; only the CSV file and mode differ.
train_dataset = RegressionDataset(
    tokenizer=tokenizer,
    file_path=CONFIG['train_file'],
    dir_path=CONFIG['output_dir'],
    mode='train',
)
dev_dataset = RegressionDataset(
    tokenizer=tokenizer,
    file_path=CONFIG['dev_file'],
    dir_path=CONFIG['output_dir'],
    mode='dev',
)

# Training batches are reshuffled each epoch; dev batches keep file order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=CONFIG['train_batch_size'],
    shuffle=True,
)
dev_dataloader = DataLoader(
    dev_dataset,
    batch_size=CONFIG['dev_batch_size'],
    shuffle=False,
)

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from torch import nn
# transformers.AdamW is deprecated in favour of the PyTorch implementation,
# so the optimizer is imported from torch under the same name.
from torch.optim import AdamW


# Pretrained encoder plus a sequence-level head with CONFIG['num_classes'] outputs.
model = AutoModelForSequenceClassification.from_pretrained(CONFIG['pretrained_model'], num_labels=CONFIG['num_classes'])

model.to(CONFIG['device'])

# weight_decay=0.0 keeps the default of the former transformers.AdamW
# (torch.optim.AdamW would otherwise apply 0.01).
optimizer = AdamW(model.parameters(),
                  lr=CONFIG['lr'],
                  eps=1e-8,
                  weight_decay=0.0)

epochs = CONFIG['epochs']
# One scheduler step per batch: batches per epoch times number of epochs.
total_steps = len(train_dataloader) * epochs
# Linear decay of the learning rate over all steps, without warmup.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Mean squared error between predicted and target score.
loss_function = nn.MSELoss()

In [None]:
from torch.nn.utils.clip_grad import clip_grad_norm



In [63]:
def clean(data, col):
    '''
    Normalize the text column ``col`` of ``data`` with a fixed sequence of
    regular-expression substitutions, then drop rows containing NaN.

    Steps, in order: put spaces around ``/ ! ? .`` between two words, cap
    runs of ``* ! ? '`` at three, put spaces around such runs, shorten
    repeated letters, collapse repeated spaces and newlines, and finally
    strip IBAN-like, e-mail-like, URL-like, IP-like and phone-like
    substrings as well as double quotes.

    Every pattern is applied as a regular expression explicitly. Without
    ``regex=True`` recent pandas versions treat the pattern as a literal
    string and the substitutions silently do nothing.

    Args:
        data: pandas DataFrame holding the text column.
        col: name of the column to clean.

    Returns:
        A DataFrame without the rows that contain NaN in any column.
        Note that ``data[col]`` of the passed frame is also overwritten
        in place, as before.
    '''
    import re  # local import: the notebook does not import re at the top

    substitutions = (
        # Clean some punctuation: "word.word" -> "word . word"
        (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3'),
        # Replace characters repeated more than 3 times with exactly 3
        (r'([*!?\'])\1\1{2,}', r'\1\1\1'),
        # Add space around repeating characters
        (r'([*!?\']+)', r' \1 '),
        # Letters repeated 3+ times at the end of a word -> 2
        (r'([a-zA-Z])\1{2,}\b', r'\1\1'),
        # Letters repeated 4+ times inside a word -> 3
        (r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1'),
        # Collapse runs of spaces and replace newlines with one space
        (r'[ ]{2,}|\n', ' '),
    )

    # filter IBANs (international bank account format)
    # filter email
    # filter websites
    # filter IP-like numbers
    # filter phone number
    # quotation marks
    # Every piece is a raw string: in a normal string literal "\b" is a
    # backspace character, which kept the website alternative from matching.
    # NOTE(review): the dots in the IP and e-mail alternatives are unescaped
    # and therefore match any character - confirm this is intended.
    pattern = r'(fr\d{2}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{2}|fr\d{20}|fr[ ]\d{2}[ ]\d{3}[ ]\d{3}[ ]\d{3}[ ]\d{5})|' \
              r'((?:(?!.*?[.]{2})[a-zA-Z0-9](?:[a-zA-Z0-9.+!%-]{1,64}|)|\"[a-zA-Z0-9.+!% -]{1,64}\")@[a-zA-Z0-9][a-zA-Z0-9.-]+(.[a-z]{2,}|.[0-9]{1,}))|' \
              r'((https?:\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*))|' \
              r'([0-9]+.[0-9]+.[0-9]+.[0-9]+)|' \
              r'((?:(?:\+|00)33[\s.-]{0,3}(?:\(0\)[\s.-]{0,3})?|0)[1-9](?:(?:[\s.-]?\d{2}){4}|\d{2}(?:[\s.-]?\d{3}){2})|(\d{2}[ ]\d{2}[ ]\d{3}[ ]\d{3}))|' \
              r'\"'

    for regex_source, replacement in substitutions + ((pattern, ''),):
        # Compiled patterns force regular-expression semantics in pandas.
        data[col] = data[col].str.replace(re.compile(regex_source), replacement, regex=True)

    data = data.dropna(axis=0)

    return data

In [2]:
import pandas as pd
# Load the raw training CSV and preview its first rows.
train_data = pd.read_csv('../data/4th/v0/train.csv')
train_data.head()



Unnamed: 0,comment,score
0,"Above user has no en talk page, but does have...",0.0
1,Adelson Thanks for the tip ! Over at WP:RIGH...,0.0
2,YOU FAT SLIMY PIECE OF SHIT I HOPE YOU FALL ...,0.8
3,Lots More Abstracts Smithsonian / NASA Astroph...,0.0
4,1 May 2006 Please do not add nonsense to Wiki...,0.0


In [3]:
# Number of rows in the training set.
len(train_data)

1778810

In [4]:
from transformers import AutoTokenizer

# Tokenizer used below to inspect what a single encoding looks like.
t = AutoTokenizer.from_pretrained('bert-base-uncased')

In [11]:
# Encode only the first comment, padded/truncated to the maximum length,
# and check the container types of the result.
a = t(train_data['comment'][0], padding='max_length', truncation=True)
type(a), type(a['input_ids'])

(transformers.tokenization_utils_base.BatchEncoding, list)

In [13]:
# Element type of the token id list for a single text.
type(a['input_ids'][0])

int

In [14]:
# Length of one padded sequence.
len(a['input_ids'])

512

In [15]:
# Rough count of token ids if about 1.7M comments were each padded to 512
# (presumably to judge the cost of tokenizing the whole set up front).
1700000 * 512

870400000

In [12]:
# Display the full encoding of the first comment.
a

{'input_ids': [101, 2682, 5310, 2038, 2053, 4372, 2831, 3931, 1010, 2021, 2515, 2031, 2028, 2012, 2139, 1024, 3841, 20267, 2121, 9785, 17854, 3258, 1024, 21541, 10484, 2099, 1045, 2123, 1005, 1056, 2156, 1037, 3291, 2007, 2240, 1011, 7807, 1999, 1996, 3793, 1025, 2738, 1010, 2009, 1005, 1055, 2070, 5098, 10466, 2029, 2024, 4394, 1012, 1996, 18407, 2024, 3492, 2172, 2035, 2006, 7674, 1010, 2061, 2151, 4394, 2013, 4372, 2024, 3497, 4394, 2013, 2139, 2036, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [24]:
# Same comment encoded through encode_plus, for comparison with calling the tokenizer directly.
b = t.encode_plus(train_data['comment'][0], padding='max_length', truncation=True)

In [26]:
# Container type of the token ids returned by encode_plus.
type(b['input_ids'])

list

In [27]:
# First five raw comments as a plain Python list.
train_data['comment'][0:5].to_list()

[" Above user has no en talk page, but does have one at de:Benutzer Diskussion:Istiller I don ' t see a problem with line-breaks in the text; rather, it ' s some junction shapes which are missing. The icons are pretty much all on commons, so any missing from en are likely missing from de also. ",
 "Adelson   Thanks for the tip ! Over at WP:RIGHT we could really use your help. Please consider becoming a part of the fastest growing most influential ensemblages of editors in the entire wiki: WP:WikiProject Conservatism / About us. – Lionel (talk)    Republican Party presidential candidates, 2012   I for some reason still have this page on my watchilist, and see you ' ve been quite busy here lately. I subsequently have had the page protected for a while. Hopefully that helps. 1992  Heh, thanks... I ' ve been resisting the impulse to hit RFPP over the single intermittent but persistently disruptive IP I ' ve been (along with others) reverting, but get the feeling from other edits over const