In [1]:
!pip install -q transformers

In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [3]:
# Show every column when a DataFrame is rendered (None removes the column limit).
pd.options.display.max_columns = None

In [4]:
from torch import cuda

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [5]:
# Load the labelled training data.
# (An earlier revision read "../data/train_v2(clean)_fix.csv" instead.)
TRAIN_CSV_PATH = "../data/train_v3_relabel3.csv"
df = pd.read_csv(TRAIN_CSV_PATH)

In [6]:
# Preview the first row to check which columns the CSV provides.
df.head(1)

Unnamed: 0.1,Unnamed: 0,mobile,url,tm,title,content,source,location,type,tags,emotion,negative_cnt,comment_cnt,fee,speed,signal,5G,other,--,tag_num,one_tag,multi_tag
0,0,亞太,[https://www.facebook.com/162174107155573/post...,2021-11-15T20:00:01+08:00,請問振興券可以繳電話費嗎？,\n亞太帳單，我在門市繳振興?加現金，服務很好！\n,fans,,主文,資費,中立,0,0.0,0,0,0,0,1,0,1,其他,其他


In [7]:
# Collapse the per-label indicator columns into a single list-valued column
# (one label vector per row, in the column order given here).
LABEL_COLUMNS = ['fee', 'speed', 'signal', '5G', 'other']
df['list'] = df[LABEL_COLUMNS].to_numpy().tolist()

In [8]:
# Preview the first row again to confirm the new `list` column was added.
df.head(1)

Unnamed: 0.1,Unnamed: 0,mobile,url,tm,title,content,source,location,type,tags,emotion,negative_cnt,comment_cnt,fee,speed,signal,5G,other,--,tag_num,one_tag,multi_tag,list
0,0,亞太,[https://www.facebook.com/162174107155573/post...,2021-11-15T20:00:01+08:00,請問振興券可以繳電話費嗎？,\n亞太帳單，我在門市繳振興?加現金，服務很好！\n,fans,,主文,資費,中立,0,0.0,0,0,0,0,1,0,1,其他,其他,"[0, 0, 0, 0, 1]"


In [9]:
# Remove every match of the regex `\n` from title and content, then
# concatenate the two cleaned strings row by row into the model input text.
title_and_content = df[['title', 'content']].replace('\\n', '', regex=True)
df['comment_text'] = title_and_content.agg(''.join, axis=1)

In [10]:
# Keep only the model input text and its label vector.
new_df = df.filter(items=['comment_text', 'list'], axis='columns')

In [11]:
# Display the (text, label vector) frame that the model is trained on.
new_df

Unnamed: 0,comment_text,list
0,請問振興券可以繳電話費嗎？亞太帳單，我在門市繳振興?加現金，服務很好！,"[0, 0, 0, 0, 1]"
1,"台星的雙11沒吸引力,白等了.那u15跟60+那個好??我也是之前等了很久，把小孩用的台灣之...","[1, 1, 0, 0, 0]"
2,"亞太電信88元12MB悄悄的續約18個月方案感謝幾位前輩寶貴資訊亞太除了剛開始使用時,進入山...","[1, 0, 1, 0, 0]"
3,[方案] 該續約中華還是找其他家更便宜的？便宜就選亞太啊,"[1, 0, 0, 0, 0]"
4,[方案] 請問有推薦499↓吃到飽嗎？(不限流量)亞太啊199不限速不限流量吃到飽,"[1, 1, 0, 0, 0]"
...,...,...
403,有台灣大哥大的也跟我一樣嗎？我是5G方案，但我手機用哀鳳12，開低耗電會跑4G，兩種都跑很順欸,"[1, 1, 0, 1, 0]"
404,台哥大5G免費30天體驗台北也可以辦了台哥5G能試用到30天很讚耶，還是要多測試生活圈狀況，...,"[0, 0, 1, 1, 0]"
405,[新聞] 上半年5G 網速PK誰最快？Speedtest 公布台哥5G方案是要你再入坑它家的凱擘,"[0, 1, 0, 1, 0]"
406,台哥大4G這網速是一般的速度嗎？5G 是未來趨勢，現在4G一直被打壓，4G訊號只會愈來愈差…,"[0, 0, 1, 1, 0]"


In [12]:
# ---- Configuration ----
# Key hyper-parameters used by the dataset, data loaders and training loop below.
MAX_LEN = 200                                # fixed token length of every encoded example
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 8, 4
EPOCHS = 7
LEARNING_RATE = 1e-5

# Chinese BERT tokenizer (an earlier revision used 'bert-base-uncased'),
# extended with Latin-script terms so each is kept as a single token.
EXTRA_TOKENS = ['5G', '4G', '3G', '2G', 'NP', 'LTE', 'MB', 'WiFi', '2CA', '3CA', '4CA']
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
num_added_toks = tokenizer.add_tokens(EXTRA_TOKENS)

In [13]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes comments for multi-label classification.

    Every item is a dict of tensors: ``ids``, ``mask`` and ``token_type_ids``
    (the tokenizer output, requested padded/truncated to ``max_len``) plus
    ``targets`` (the row's label vector as floats).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """
        Args:
            dataframe: pandas DataFrame with a ``comment_text`` column (input
                text) and a ``list`` column (label vector per row).
            tokenizer: tokenizer exposing ``encode_plus``.
            max_len: length every encoded sequence is padded/truncated to.
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Encode the example at *position* ``index`` and return its tensors."""
        # Positional access: samplers hand out positions 0..len-1, which only
        # coincide with index labels when the frame's index was reset.
        raw_text = str(self.comment_text.iloc[index])
        # Collapse any run of whitespace (including newlines) to one space.
        comment_text = " ".join(raw_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            add_special_tokens=True,
            max_length=self.max_len,
            # `pad_to_max_length` is deprecated; request padding explicitly.
            padding='max_length',
            # Explicit truncation keeps every example at max_len and removes
            # the "Truncation was not explicitly activated" warning.
            truncation=True,
            return_token_type_ids=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float),
        }

In [14]:
# Split the data: a fixed-seed random 80% sample for training, the remaining
# rows for testing. Both frames get a fresh 0..n-1 index.
train_size = 0.8
train_dataset = new_df.sample(frac=train_size, random_state=200)
test_dataset = new_df.drop(index=train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print(f"FULL Dataset: {new_df.shape}")
print(f"TRAIN Dataset: {train_dataset.shape}")
print(f"TEST Dataset: {test_dataset.shape}")

# Wrap both splits as tokenizing datasets for the data loaders.
training_set, testing_set = (
    CustomDataset(split_frame, tokenizer, MAX_LEN)
    for split_frame in (train_dataset, test_dataset)
)

FULL Dataset: (408, 2)
TRAIN Dataset: (326, 2)
TEST Dataset: (82, 2)


In [15]:
# Loader settings: training batches are shuffled, test batches keep row order.
# num_workers=0 loads data in the main process.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [16]:
# Customized model: a dropout and a dense layer on top of BERT's pooled output
# produce one logit per label.

class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear multi-label head.

    The defaults reproduce the original fixed configuration
    ('bert-base-chinese', dropout 0.3, 5 output logits).
    """

    def __init__(self, num_labels=5, pretrained_name='bert-base-chinese',
                 dropout=0.3, vocab_size=None):
        """
        Args:
            num_labels: number of output logits (one per label).
            pretrained_name: checkpoint passed to ``BertModel.from_pretrained``.
            dropout: dropout probability applied to the pooled output.
            vocab_size: size to resize the token embeddings to; when None the
                length of the notebook-level ``tokenizer`` is used, so tokens
                added to it get embedding rows.
        """
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained(pretrained_name)
        if vocab_size is None:
            vocab_size = len(tokenizer)
        self.l1.resize_token_embeddings(vocab_size)
        self.l2 = torch.nn.Dropout(dropout)
        # Read the encoder width from its config instead of hard-coding 768.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return raw (un-sigmoided) logits for a batch of encoded inputs."""
        # return_dict=False makes the encoder return a tuple; the second
        # element is the pooled output fed to the classification head.
        _, pooled_output = self.l1(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.l3(self.l2(pooled_output))
    
# Instantiate the classifier and move its parameters to the selected device.
model = BERTClass()
# `.to(device)` is the cell's last expression, so the notebook also displays the module structure.
model.to(device)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21139, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (drop

In [17]:
# attrs = vars(model)
# # {'kids': 0, 'name': 'Dog', 'color': 'Spotted', 'age': 10, 'legs': 2, 'smell': 'Alot'}
# # now dump this in some way or another
# print(', '.join("%s: %s" % item for item in attrs.items()))
# # model.save_pretrained()


In [18]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits, averaged over all elements.

    Args:
        outputs: logits produced by the model.
        targets: float tensor of the same shape holding 0/1 labels.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [19]:
# Adam over every model parameter (encoder and classification head).
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [20]:
def train(epoch):
    """Run one optimisation pass over ``training_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn`` and
    ``device``. The loss is printed for every 50th batch, starting with the
    first one.

    Args:
        epoch: epoch number, used only in the progress message.
    """
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)

        if step % 50 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear stale gradients once, right before back-propagation.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [21]:
# Train for EPOCHS passes over the training loader.
for epoch in range(EPOCHS):
#     for param in model.l1.parameters():
#         param.requires_grad = False  # False means the pre-trained weights are not fine-tuned; the default True means they are trained
    train(epoch)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  0.7009071707725525
Epoch: 1, Loss:  0.6456233859062195
Epoch: 2, Loss:  0.45128241181373596
Epoch: 3, Loss:  0.3281286358833313
Epoch: 4, Loss:  0.30641645193099976
Epoch: 5, Loss:  0.260616660118103
Epoch: 6, Loss:  0.2326328307390213


In [22]:
def validation(verbose=True):
    """Run the model over ``testing_loader`` and collect predictions.

    Uses the notebook-level ``model``, ``tokenizer`` and ``device``.

    Args:
        verbose: when True (the default, matching the original behaviour),
            print the decoded text, target vector and predicted probabilities
            of the first example of every batch.

    Returns:
        (fin_outputs, fin_targets): two lists with one entry per example,
        holding the sigmoid probabilities and the target vectors.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for data in testing_loader:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            outputs = model(ids, mask, token_type_ids)
            # Convert logits to probabilities once and reuse the result for
            # both the debug print and the collected outputs.
            probabilities = torch.sigmoid(outputs).cpu().detach().numpy().tolist()

            if verbose:
                print(tokenizer.decode(ids[0]))
                print(targets[0])
                print(probabilities[0])

            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(probabilities)
    return fin_outputs, fin_targets

In [32]:
# Probability at or above which a label counts as predicted.
PREDICTION_THRESHOLD = 0.45

outputs, targets = validation()
# Keep the raw probabilities in `outputs`; the thresholded decisions get their
# own name so later cells can still use the probabilities.
predictions = np.array(outputs) >= PREDICTION_THRESHOLD

accuracy = metrics.accuracy_score(targets, predictions)
f1_score_micro = metrics.f1_score(targets, predictions, average='micro')
f1_score_macro = metrics.f1_score(targets, predictions, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

[CLS] 請 問 振 興 券 可 以 繳 電 話 費 嗎 ？ 亞 太 帳 單 ， 我 在 門 市 繳 振 興? 加 現 金 ， 服 務 很 好 ！ [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

In [24]:
# Collect the sigmoid probabilities and target vectors for the test split.
outputs, targets = validation()

[CLS] 請 問 振 興 券 可 以 繳 電 話 費 嗎 ？ 亞 太 帳 單 ， 我 在 門 市 繳 振 興? 加 現 金 ， 服 務 很 好 ！ [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

In [25]:
# Work on a copy so the prediction columns are not added to test_dataset itself.
my_predict = test_dataset.copy()

In [26]:
# Convert the collected per-example outputs to an array and show its shape.
outputs = np.array(outputs)
outputs.shape

(82, 5)

In [27]:
np.set_printoptions(precision=2)
# Join onto test_dataset rather than onto my_predict itself: `join` returns a
# new frame, and starting from the untouched split keeps this cell re-runnable
# (joining the same columns a second time raises on the overlap).
my_predict = test_dataset.join(pd.DataFrame(outputs).round(decimals=1))

In [28]:
# Display the test rows next to their rounded per-label outputs (columns 0-4).
my_predict

Unnamed: 0,comment_text,list,0,1,2,3,4
0,請問振興券可以繳電話費嗎？亞太帳單，我在門市繳振興?加現金，服務很好！,"[0, 0, 0, 0, 1]",0.9,0.1,0.1,0.1,0.3
1,"台星的雙11沒吸引力,白等了.那u15跟60+那個好??我也是之前等了很久，把小孩用的台灣之...","[1, 1, 0, 0, 0]",0.9,0.5,0.2,0.2,0.0
2,[方案] 該續約中華還是找其他家更便宜的？便宜就選亞太啊,"[1, 0, 0, 0, 0]",1.0,0.1,0.1,0.1,0.1
3,中華電稱冠全台 5G網速飆471.73Mbps連續兩次報告中華目前看來還是第一遠傳跟亞太這組...,"[0, 1, 0, 1, 0]",0.1,0.6,0.1,0.8,0.4
4,[心得] 台灣之星網路訊號不穩就叫你用亞太了 也沒有比較貴,"[0, 0, 1, 0, 0]",0.2,0.1,1.0,0.1,0.0
...,...,...,...,...,...,...,...
77,[問題] 台哥上傳網速不到個位數?升到5G(無誤,"[0, 1, 0, 1, 0]",0.2,0.9,0.3,0.1,0.1
78,[問題] 台哥店員說499吃到飽要辦要快?5G還超多地方收不到 4G要消失還早,"[1, 0, 1, 1, 0]",1.0,0.1,0.1,0.2,0.1
79,台哥大5G免費30天體驗台北也可以辦了台哥5G能試用到30天很讚耶，還是要多測試生活圈狀況，...,"[0, 0, 1, 1, 0]",0.2,0.3,0.1,0.8,0.5
80,[新聞] 上半年5G 網速PK誰最快？Speedtest 公布台哥5G方案是要你再入坑它家的凱擘,"[0, 1, 0, 1, 0]",0.2,0.9,0.0,0.5,0.2


In [29]:
# Reference: https://blog.csdn.net/Avrilzyx/article/details/114586729
# Save the weights of the wrapped module when the model exposes a `.module`
# attribute, otherwise the weights of the model itself.
model_to_save = getattr(model, 'module', model)
torch.save(model_to_save.state_dict(), 'multi_label_1222_relabel.pth')