In [None]:
import json
import math
import torch
import pickle
import transformers

import numpy as np
import pandas as pd

In [None]:
from pathlib import Path
from itertools import chain
from tqdm import tqdm

In [None]:
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import BertTokenizer
from transformers import BertModel

In [None]:
from torch import cuda

# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## Dataset

In [None]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes one text row and pairs it with its label vector.

    The text is read from the '内容' column and the targets from the '标签'
    column of ``dataframe``. Rows are addressed by position, so the frame does
    not need a fresh 0..n-1 index (e.g. it may come straight from ``.sample()``).

    Args:
        dataframe: table providing the '内容' and '标签' columns.
        tokenizer: object exposing ``encode_plus`` as called in ``__getitem__``.
        max_len: length every encoded sequence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len

        self.comment_text = dataframe['内容']    # X: raw text
        self.targets = self.data['标签']    # y: label vector(s)

    def __len__(self):
        """Return the number of rows in the text column."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Encode the row at position ``index``.

        Returns:
            dict with long tensors 'ids', 'mask', 'token_type_ids' and a float
            tensor 'targets'.
        """
        # Positional lookup: plain [] on a Series is label-based and raises
        # KeyError (or returns the wrong row) when the index is not 0..n-1.
        comment_text = str(self.comment_text.iloc[index])
        # str.split() with no argument splits on any whitespace (spaces,
        # newlines, tabs, ...), so this collapses every run to a single space.
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # pad_to_max_length=True is deprecated; padding='max_length' replaces it
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']  # marks the positions holding real tokens
        token_type_ids = inputs["token_type_ids"]  # tells apart the sentences merged into one input

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),

            # The label value is converted as-is, with no extra processing.
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

### load

In [None]:
# Restore variables previously saved with IPython's %store magic.
# NOTE(review): presumably written by an earlier preprocessing notebook — confirm.
%store -r df_comments
%store -r df_comments_length
%store -r df_comments_topics

### Length threshold (mean + 1 std)

In [None]:
# Cut-off = mean + k standard deviations of the df_comments_length values.
k = 1
length_mean = df_comments_length.mean(axis=0)
length_std = df_comments_length.std(axis=0)
mu_1std = length_mean + k * length_std

# Shape of the full data, shape of the below-cut-off selection, and the ratio
# of their first dimensions.
below_cutoff = df_comments_length[df_comments_length < mu_1std]
print(df_comments_length.shape)
print(below_cutoff.shape)
print(below_cutoff.shape[0] / df_comments_length.shape[0])

### multi label

In [None]:
# Fit a binarizer on the label column and replace each entry with its 0/1
# indicator vector (stored as a plain Python list).
# This overwrites the column it reads from, so re-running the cell without
# reloading df_comments_topics would binarize the already-encoded vectors.
mlb = MultiLabelBinarizer()
label_matrix = mlb.fit_transform(df_comments_topics['标签'])
df_comments_topics['标签'] = label_matrix.tolist()
print(df_comments_topics.shape)
df_comments_topics.head(3)

In [None]:
# Preview rows that carry more than one label.
# After binarization every entry is a 0/1 vector of identical length, so len()
# is the same for all rows and cannot tell them apart; count the active
# entries instead. A bare int is still treated as a single label.
label_counts = df_comments_topics['标签'].apply(
    lambda vec: 1 if isinstance(vec, int) else sum(vec)
)
df_comments_topics[label_counts > 1].head(3)

In [None]:
# Show the classes learned by the fitted binarizer.
mlb.classes_

In [None]:
# Persist the fitted binarizer to disk.
# open(..., 'wb') does not create missing parent directories, so make sure
# ./model exists first; exist_ok keeps the cell safe to re-run.
Path('./model').mkdir(parents=True, exist_ok=True)
with open('./model/mlb.pkl', 'wb') as f:
    pickle.dump(mlb, f)

### to dataset

In [None]:
# Display the threshold as an integer.
# NOTE(review): presumably the candidate MAX_LEN for get_dataset — confirm.
int(mu_1std)

In [None]:
# Alias only (no copy): new_df and df_comments_topics refer to the same object.
new_df = df_comments_topics

In [None]:
# Split into train / test partitions.
train_size = 0.8

# Draw the training rows with a fixed seed; every remaining row goes to the
# test partition. Both partitions get a fresh 0..n-1 index.
sampled_rows = new_df.sample(frac=train_size, random_state=200)
test_dataset = new_df.drop(index=sampled_rows.index).reset_index(drop=True)
train_dataset = sampled_rows.reset_index(drop=True)

for template, frame in (
    ("FULL Dataset: {}", new_df),
    ("TRAIN Dataset: {}", train_dataset),
    ("TEST Dataset: {}", test_dataset),
):
    print(template.format(frame.shape))

### save

In [None]:
# Save the split frames and the fitted binarizer with IPython's %store magic
# so another notebook/kernel can restore them with `%store -r`.
%store train_dataset
%store test_dataset
%store mlb

### api

In [None]:
def get_dataset(model_name, train_dataset, test_dataset, MAX_LEN):
    """Wrap the train and test frames in ``CustomDataset`` objects.

    A single ``BertTokenizer`` is loaded from ``model_name`` and shared by
    both datasets. The original author noted that ``CustomDataset`` is used
    here in place of a ``MultiLabelDataset`` variant (not shown in this file).

    Args:
        model_name: identifier handed to ``BertTokenizer.from_pretrained``.
        train_dataset: frame wrapped as the training set.
        test_dataset: frame wrapped as the testing set.
        MAX_LEN: maximum sequence length forwarded to ``CustomDataset``.

    Returns:
        tuple ``(training_set, testing_set)``.
    """
    bert_tokenizer = BertTokenizer.from_pretrained(model_name)

    # Same construction for both partitions, training first.
    training_set, testing_set = (
        CustomDataset(frame, bert_tokenizer, MAX_LEN)
        for frame in (train_dataset, test_dataset)
    )
    return training_set, testing_set