In [1]:
import json
import math
import torch
import pickle
import transformers

import numpy as np
import pandas as pd

In [2]:
from pathlib import Path
from itertools import chain
from tqdm import tqdm

In [3]:
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import BertTokenizer
from transformers import BertModel

In [4]:
from tokenizer import NormalizerSGZChat
from tokenizer import TokenizerSGZChat

2021-04-06 16:36:47 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-06 16:36:47 DEBUG https://huggingface.co:443 "HEAD /xlm-roberta-base/resolve/main/config.json HTTP/1.1" 200 0
2021-04-06 16:36:47 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-06 16:36:48 DEBUG https://huggingface.co:443 "HEAD /xlm-roberta-base/resolve/main/sentencepiece.bpe.model HTTP/1.1" 200 0
2021-04-06 16:36:48 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-06 16:36:49 DEBUG https://huggingface.co:443 "HEAD /xlm-roberta-base/resolve/main/tokenizer.json HTTP/1.1" 200 0
Building model ...
2021-04-06 16:36:50 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-06 16:36:50 DEBUG https://huggingface.co:443 "HEAD /xlm-roberta-base/resolve/main/config.json HTTP/1.1" 200 0
2021-04-06 16:36:51 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-06 16:36:51 DEBUG https://huggingface.co:443 "HEAD /xlm-roberta-base/r

In [5]:
from torch import cuda

# Pick the device name once for the notebook: use the GPU when torch
# reports that CUDA is available, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [6]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its target.

    Args:
        X: Indexable collection of texts; each item is passed through ``str()``
            before tokenization.
        y: Indexable collection of targets aligned with ``X``; each item is
            converted to a float tensor.
        tokenizer: Object exposing an ``encode_plus`` method whose result can be
            indexed by ``'input_ids'``, ``'attention_mask'`` and
            ``'token_type_ids'``.
        max_len: Value forwarded as ``max_length``; the tokenizer is asked to
            pad to it (``padding='max_length'``) and to truncate.
    """

    def __init__(self, X, y, tokenizer, max_len):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in ``X``."""
        return len(self.X)

    def __getitem__(self, index):
        """Tokenize the text at ``index`` and return it with its target.

        Returns:
            dict with long tensors ``'ids'``, ``'mask'`` and
            ``'token_type_ids'`` plus a float tensor ``'targets'``.
        """
        encoded = self.tokenizer.encode_plus(
            str(self.X[index]),
            None,  # no second sentence: every item is a single text
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )

        # Output key -> tokenizer key.
        #   attention_mask: mask of the valid (non-padding) token positions.
        #   token_type_ids: mask telling the sentences apart when several
        #                   sentences are merged into one sequence.
        long_fields = (
            ('ids', 'input_ids'),
            ('mask', 'attention_mask'),
            ('token_type_ids', 'token_type_ids'),
        )
        item = {
            out_key: torch.tensor(encoded[src_key], dtype=torch.long)
            for out_key, src_key in long_fields
        }
        item['targets'] = torch.tensor(self.y[index], dtype=torch.float)
        return item

In [7]:
def get_dataset(model_name, X_train, y_train, X_test, y_test, MAX_LEN):
    """Create the training and testing ``CustomDataset`` objects around one tokenizer.

    The tokenizer is loaded with ``TokenizerSGZChat.from_pretrained(model_name)``
    and its ``vocab`` attribute is then overwritten with a mapping built here:
    the normalizer's special tokens plus every token produced by tokenizing
    the training texts, de-duplicated, sorted, and numbered from 0.

    Only ``X_train`` contributes tokens, so tokens that occur solely in
    ``X_test`` are absent from the mapping.
    NOTE(review): how TokenizerSGZChat treats such tokens is not visible
    here -- confirm.

    Args:
        model_name: Identifier forwarded to ``from_pretrained``.
        X_train, y_train: Training texts and their targets.
        X_test, y_test: Testing texts and their targets.
        MAX_LEN: Maximum sequence length handed to both datasets.

    Returns:
        Tuple ``(training_set, testing_set)``; both share the same tokenizer.
    """
    # Stock alternative, kept for reference:
    # tokenizer = BertTokenizer.from_pretrained(model_name)
    tokenizer = TokenizerSGZChat.from_pretrained(model_name)

    # Gather the distinct tokens: special tokens first, then the training corpus.
    seen_tokens = set(NormalizerSGZChat().special_tokens)
    for text in X_train:
        seen_tokens.update(tokenizer.tokenize(text))

    # Sorting makes the token -> id assignment deterministic.
    tokenizer.vocab = {
        token: idx for idx, token in enumerate(sorted(seen_tokens))
    }

    return (
        CustomDataset(X_train, y_train, tokenizer, MAX_LEN),
        CustomDataset(X_test, y_test, tokenizer, MAX_LEN),
    )