In [68]:
!pip install transformers==3.0.0

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 23.0 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [69]:
import torch
import numpy as np
import pandas as pd
import transformers
from sklearn.model_selection import train_test_split
from transformers import AutoModel, BertTokenizerFast

In [70]:
torch.cuda.is_available()

True

# Extracting Data and combining headline and content

In [71]:
CVAP_all_SD_df = pd.read_csv('../EmoBank/ChineseEmoBank/CVAP_SD/CVAP_all_SD.csv', encoding= 'utf-8',  sep="\t", index_col= 0)
CVAP_all_SD_df.head()

Unnamed: 0_level_0,Phrase,Valence_Mean,Arousal_Mean,Valence_SD,Arousal_SD
No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,怪八卦,3.45,4.383,0.423,0.579
1,更加小心,5.111,7.188,0.875,0.242
2,格外小心,3.95,6.5,0.527,0.381
3,特別小心,5.0,7.0,0.5,0.0
4,極為小心,3.989,6.925,0.694,0.319


In [72]:
df = CVAP_all_SD_df.drop(['Valence_SD', 'Arousal_SD'], axis= 1)
df.head()

Unnamed: 0_level_0,Phrase,Valence_Mean,Arousal_Mean
No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,怪八卦,3.45,4.383
1,更加小心,5.111,7.188
2,格外小心,3.95,6.5
3,特別小心,5.0,7.0
4,極為小心,3.989,6.925


In [73]:
text = ['陆军领导机构火箭军战略支援部队成立大会在京举行 习近平向中国人民解放军陆军火箭军战略支援部队',
'中央军委印发《关于深化国防和军队改革的意见》. 经中央军委主席习近平批准，中央军委近日印发了',
'《习近平关于严明党的纪律和规矩论述摘编》出版发行. 由中共中央纪律检查委员会、中共中央文献研',
'以实际行动向党中央看齐 向高标准努力',
'廣大党员干部正在积极学习习近平总书记在中央政治局专题',
'关键之年 改革挺进深水区. 刚刚过去的2015年，是全面深化改革的关键之年',
'习近平关于严明党',
'近平总书记在中',
'將數據集分成訓練集和測試集，以 80:20 的比例分割']

tag = [0, 1, 2, 2, 1, 0, 2,0,1]
df = pd.DataFrame({
    'text':text,
    'tag': tag
})
df

Unnamed: 0,text,tag
0,陆军领导机构火箭军战略支援部队成立大会在京举行 习近平向中国人民解放军陆军火箭军战略支援部队,0
1,中央军委印发《关于深化国防和军队改革的意见》. 经中央军委主席习近平批准，中央军委近日印发了,1
2,《习近平关于严明党的纪律和规矩论述摘编》出版发行. 由中共中央纪律检查委员会、中共中央文献研,2
3,以实际行动向党中央看齐 向高标准努力,2
4,廣大党员干部正在积极学习习近平总书记在中央政治局专题,1
5,关键之年 改革挺进深水区. 刚刚过去的2015年，是全面深化改革的关键之年,0
6,习近平关于严明党,2
7,近平总书记在中,0
8,將數據集分成訓練集和測試集，以 80:20 的比例分割,1


In [74]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['tag'],
                                                    stratify=df['tag'])

# Preparing Data for Model

In [75]:
bert = AutoModel.from_pretrained('bert-base-chinese')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')

In [None]:
train_idx = x_train.dropna().index
test_idx = x_test.dropna().index

train_tokens = tokenizer.batch_encode_plus(x_train[train_idx].to_list(),
                                           max_length = 50,
                                           pad_to_max_length = True,
                                           truncation = True)
test_tokens = tokenizer.batch_encode_plus(x_test[test_idx].to_list(),
                                           max_length = 50,
                                           pad_to_max_length = True,
                                           truncation = True)

In [None]:
train_seq = torch.tensor(train_tokens['input_ids'])
train_mask = torch.tensor(train_tokens['attention_mask'])
train_y = torch.tensor(y_train[train_idx].to_list())

test_seq = torch.tensor(test_tokens['input_ids'])
test_mask = torch.tensor(test_tokens['attention_mask'])
test_y = torch.tensor(y_test[test_idx].to_list())

In [None]:
from torch.utils.data import TensorDataset, RandomSampler, DataLoader

In [None]:
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
trainloader = DataLoader(train_data, 
                         sampler = train_sampler,
                         batch_size = 32)

test_data = TensorDataset(test_seq, test_mask, test_y)
test_sampler = RandomSampler(test_data)
testloader = DataLoader(test_data, 
                         sampler = test_sampler,
                         batch_size = 32)

In [None]:
for param in bert.parameters():
    param.requires_grad = False

# Making our Model

In [None]:
from torch import nn
from transformers import AdamW
import torch.nn.functional as F
from sklearn.utils.class_weight import compute_class_weight

In [None]:
class BertClassifier(nn.Module):
    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.fc1 = nn.Linear(768,3)
    
    def forward(self, sent_id, mask):
        _ , cls_hs = self.bert(sent_id, attention_mask = mask)
        return F.log_softmax(self.fc1(cls_hs), dim = 1)

In [None]:
model = BertClassifier(bert)
model = model.cuda()

In [None]:
optimizer = AdamW(model.parameters(), lr = 1e-5)

In [None]:
class_weights = compute_class_weight(class_weight = "balanced",classes= np.unique(y_train), y= y_train)
# class_weights = compute_class_weight('balanced', np.unique(y_train), y_train)
class_weights

array([1., 1., 1.])

In [None]:
weights = torch.tensor(class_weights, dtype = torch.float)
weights = weights.cuda()

criterion = nn.NLLLoss(weight = weights)

# Fine Tuning our model

In [None]:
from tqdm import tqdm

In [None]:
epochs = 10

for e in range(epochs):   
    train_loss = 0.0
    for batch in tqdm(trainloader):
        batch = [i.cuda() for i in batch]
        sent_id, masks, labels = batch

        optimizer.zero_grad()
        preds = model(sent_id, masks)
        loss = criterion(preds, labels)
        train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
    print(f'Epoch:{e+1}\t\tTraining Loss: {train_loss / len(trainloader)}')

100%|██████████| 1/1 [00:00<00:00, 14.18it/s]


Epoch:1		Training Loss: 1.117079496383667


100%|██████████| 1/1 [00:00<00:00, 43.48it/s]


Epoch:2		Training Loss: 1.115987777709961


100%|██████████| 1/1 [00:00<00:00, 40.00it/s]


Epoch:3		Training Loss: 1.1149121522903442


100%|██████████| 1/1 [00:00<00:00, 45.46it/s]


Epoch:4		Training Loss: 1.113853096961975


100%|██████████| 1/1 [00:00<00:00, 50.00it/s]


Epoch:5		Training Loss: 1.112810730934143


100%|██████████| 1/1 [00:00<00:00, 51.27it/s]


Epoch:6		Training Loss: 1.1117850542068481


100%|██████████| 1/1 [00:00<00:00, 52.63it/s]


Epoch:7		Training Loss: 1.1107763051986694


100%|██████████| 1/1 [00:00<00:00, 51.26it/s]


Epoch:8		Training Loss: 1.1097840070724487


100%|██████████| 1/1 [00:00<00:00, 51.16it/s]


Epoch:9		Training Loss: 1.1088083982467651


100%|██████████| 1/1 [00:00<00:00, 52.66it/s]

Epoch:10		Training Loss: 1.1078490018844604





In [None]:
pred_label = []
true_label = []
for batch in tqdm(testloader):
    batch = [i.cuda() for i in batch]
    sent_id, masks, labels = batch

    preds = model(sent_id, masks)
    pred_label.extend(torch.argmax(preds, axis = 1).cpu())
    true_label.extend(labels.cpu())

100%|██████████| 1/1 [00:00<00:00, 66.67it/s]


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
confusion_matrix(true_label, pred_label)

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0]], dtype=int64)

In [None]:
print(classification_report(true_label, pred_label))

              precision    recall  f1-score   support

           0       0.33      1.00      0.50         1
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1

    accuracy                           0.33         3
   macro avg       0.11      0.33      0.17         3
weighted avg       0.11      0.33      0.17         3



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
