### 1. import packages

In [2]:
#pip install transformers -i https://pypi.tuna.tsinghua.edu.cn/simple
#pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple
#pip install torch -i https://pypi.tuna.tsinghua.edu.cn/simple
#pip install scikit_learn -i https://pypi.tuna.tsinghua.edu.cn/simple
#pip install emoji

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Note: you may need to restart the kernel to use updated packages.


In [5]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader
from transformers import AdamW
import pandas as pd
import torch
import random
import emoji
import numpy as np
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tqdm.notebook import tqdm
import json
from datetime import datetime
from collections import OrderedDict, defaultdict, Counter
import logging
import re
from string import punctuation
from torch.utils.tensorboard import SummaryWriter

# Send INFO-level records to bert.log; filemode='w' truncates the file each time this is applied.
# NOTE(review): logging.basicConfig does nothing when the root logger already has handlers,
# so re-running this cell in a live kernel will not reconfigure logging — confirm that is acceptable.
logging.basicConfig(level=logging.INFO,filename='bert.log',filemode='w', format='%(asctime)s - %(message)s')
# Alternative kept for reference: same format, but without writing to a log file.
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

In [2]:
# Seed every random number generator the notebook relies on so runs are repeatable.
SEED = 9999
torch.backends.cudnn.deterministic = True  # request deterministic cuDNN kernels
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
if torch.cuda.is_available():
    # Also seed the generators of every visible CUDA device.
    torch.cuda.manual_seed_all(SEED)

### 2.wordpiece

In [3]:
# Load the tokenizer for the "bert-base-uncased" checkpoint (the same checkpoint name
# is used for the classifier later in the notebook, so vocabulary and model match).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231508.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=28.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=570.0), HTML(value='')))




### 3.clean and load data

In [11]:
def remove_punc(string):
    """Return ``string`` with ASCII and Chinese punctuation deleted.

    The hash sign is deliberately preserved so hashtags such as ``#maga``
    survive cleaning. All other characters from ``string.punctuation`` and
    from the Chinese punctuation list below are removed (not replaced by a
    space), so ``"I'm"`` becomes ``"Im"``.
    """
    punctuation_zh = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~“”？，！【】（）、。：；’‘……￥·"""
    doomed = set(punctuation) | set(punctuation_zh)
    doomed.discard('#')  # keep#
    # Three-argument maketrans: every character of the third argument is deleted.
    deletion_table = str.maketrans('', '', ''.join(doomed))
    return string.translate(deletion_table)

In [19]:
def remove_emoji(string):
    """Drop every whitespace-separated token of ``string`` that contains an emoji.

    A token is removed entirely when any of its characters is an emoji, so
    ``"fire🔥"`` disappears as a whole. The surviving tokens are re-joined
    with single spaces, which also collapses runs of whitespace.

    Args:
        string: Text to clean.

    Returns:
        The cleaned text; an empty string if nothing survives.
    """
    # Newer releases of the ``emoji`` package no longer ship UNICODE_EMOJI and
    # expose is_emoji() instead; fall back to the legacy table only when
    # is_emoji() is unavailable so the function works with either API.
    is_emoji_char = getattr(emoji, 'is_emoji', None)
    if is_emoji_char is None:
        legacy_table = emoji.UNICODE_EMOJI['en']
        is_emoji_char = legacy_table.__contains__

    kept_tokens = [
        token for token in string.split()
        if not any(is_emoji_char(ch) for ch in token)
    ]
    return ' '.join(kept_tokens)

In [12]:
def read_data(file, extra_positive_copies=2):
    """Load labelled tweets from a CSV file and return cleaned texts with labels.

    Rows with a missing ``tidyTweet`` or ``Label`` are dropped. Each tweet is
    stripped, passed through ``remove_emoji`` and ``remove_punc``, and skipped
    if nothing is left either before or after cleaning. Rows whose label equals
    ``1.0`` are oversampled by appending ``extra_positive_copies`` additional
    identical copies.

    NOTE(review): with the default, duplicated positive rows exist *before* any
    train/validation split, so identical texts can end up in both splits.
    Pass ``extra_positive_copies=0`` and oversample the training split only if
    that leakage matters.

    Args:
        file: Path (or file-like object) accepted by ``pandas.read_csv``.
        extra_positive_copies: Number of additional copies appended for every
            row with ``Label == 1.0``. Defaults to 2 (three copies in total).

    Returns:
        ``(texts, labels)`` — two lists of equal length.

    Raises:
        ValueError: If ``extra_positive_copies`` is negative.
    """
    if extra_positive_copies < 0:
        raise ValueError('extra_positive_copies must be >= 0')

    # Work on a fresh frame instead of mutating in place.
    data = pd.read_csv(file).dropna(subset=['tidyTweet', 'Label'])

    texts = []
    labels = []
    for row in tqdm(data.itertuples(), total=len(data)):
        # str() guards against a column that pandas parsed as numeric.
        line = str(getattr(row, 'tidyTweet')).strip()
        if not line:
            continue
        line = remove_punc(remove_emoji(line)).strip()
        if not line:
            # The tweet consisted only of emoji/punctuation; an empty string
            # carries no signal for the classifier, so skip the row.
            continue
        label = getattr(row, 'Label')
        copies = 1 + extra_positive_copies if label == 1.0 else 1
        texts.extend([line] * copies)
        labels.extend([label] * copies)
    assert len(texts) == len(labels)
    return texts, labels

In [20]:
# Load and clean the labelled tweets; the CSV path is relative to the working directory.
texts, labels = read_data("733 label data - 2.csv")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12058.0), HTML(value='')))




In [21]:
# Class distribution as returned by read_data, i.e. after it duplicated the Label == 1.0 rows.
Counter(labels)

Counter({1.0: 8643, 0.0: 9177})

### 4. Separate train and validation sets

In [22]:
# Stratified 80/20 split with a fixed random_state so the partition is repeatable.
# NOTE(review): read_data has already appended extra copies of every Label == 1.0 row,
# so identical positive texts can land in both the training and validation splits,
# which would inflate validation metrics — consider splitting before oversampling.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, stratify=labels, test_size=0.2, random_state=43)

In [23]:
# Sanity check: within each split, the number of labels must equal the number of texts.
len(train_labels),len(val_labels),len(train_texts),len(val_texts)

(14256, 3564, 14256, 3564)

### 5. check text and label

In [24]:
# Peek at the first four cleaned training texts.
train_texts[:4]

['#Fauci is a FRAUD and a liar #FauciLied #FireFauci #FauciForPrison #FauciLiedMillionsDied #FauciIsALiar',
 'Im with him #markhamill #Donthecon #TrumpCrimeFamily #TraitorTrump',
 'Capo don the con can you say  Bye bye #maga',
 'If youre waiting to hear MSM report Vax adverse reactions then you dont know how the system works MSM news is FCC regulated and funded by the FEDS But if you want to take your chances without any independent Research Go for it That would make you a #COVIDIOT if you do']

In [25]:
# Show a bounded preview only: displaying the bare list would print every
# training text and bloat the notebook output.
train_texts[:10]

['#Fauci is a FRAUD and a liar #FauciLied #FireFauci #FauciForPrison #FauciLiedMillionsDied #FauciIsALiar',
 'Im with him #markhamill #Donthecon #TrumpCrimeFamily #TraitorTrump',
 'Capo don the con can you say  Bye bye #maga',
 'If youre waiting to hear MSM report Vax adverse reactions then you dont know how the system works MSM news is FCC regulated and funded by the FEDS But if you want to take your chances without any independent Research Go for it That would make you a #COVIDIOT if you do',
 '1',
 'Im not a conspiracy theorist but societal condition to me is a very real thing I also believe theres some conspiracies made to mislead the masses further #ConspiracyTheories',
 'Tennessee President Biden is bringing manufacturing jobs to your state Clean Energy high paying jobs #MadeInAmerica #AmericaFirst #BidenDelivers #BidensAmerica',
 'Thanks to Fauci and his ghouls including an ignorant hateful media #Ivermectin #Hydroxychloroquine #FauciLied #masksdontwork #LetsGoBrandon',
 'Come k

In [18]:
# Labels of the first four training examples.
train_labels[:4]

[1.0, 0.0, 0.0, 1.0]

### 6. calculate the maximum length

In [26]:
# Longest text in each split, measured in characters (len of the string),
# not in tokens. Printed for the training split first, then validation;
# max_len is left holding the validation value.
for _split_texts in (train_texts, val_texts):
    max_len = max(map(len, _split_texts))
    print(max_len)

286
281


### 7. map between label and ID

In [27]:
# Assign ids in ascending label order. De-duplicate first and sort afterwards:
# a set has no guaranteed iteration order, so set(sorted(...)) would discard
# the sort and make the label -> id assignment depend on hashing.
label2id = {item: idx for idx, item in enumerate(sorted(set(labels)))}
# Inverse mapping, used to turn predicted ids back into the original labels.
id2label = {v: k for k, v in label2id.items()}

In [28]:
# Display both mappings to confirm the label <-> id assignment.
label2id,id2label

({0.0: 0, 1.0: 1}, {0: 0.0, 1: 1.0})

### 8.tokenization

In [29]:
# Encode both splits with identical settings: truncate longer inputs and pad
# shorter ones to max_length=128.
_encode_kwargs = dict(truncation=True, padding='max_length', max_length=128)
train_encodings = tokenizer(train_texts, **_encode_kwargs)
val_encodings = tokenizer(val_texts, **_encode_kwargs)

### 9. generate Dataset

In [30]:
# PyTorch Dataset
class CuDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is a parallel sequence of raw labels, translated to integer
    ids through the notebook-level ``label2id`` mapping on access.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including ``'labels'``."""
        position = int(idx)
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[position])
        sample['labels'] = torch.tensor(label2id[self.labels[position]])
        return sample

In [31]:
# Wrap each split's encodings and labels in a CuDataset for use with a DataLoader.
train_dataset = CuDataset(train_encodings, train_labels)
val_dataset = CuDataset(val_encodings, val_labels)

### 10.generate Dataloader

In [32]:
# Mini-batch size shared by training and evaluation.
batch_size = 32
# Shuffle only the training data; evaluation keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

### 11. load the model

In [33]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Sequence classifier on top of "bert-base-uncased" with one output per label.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label2id),
).to(device)
# Switch to training mode (enables dropout); the call's return value is the cell output.
model.train()

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=440473133.0), HTML(value='')))




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

### 12. Calculate accuracy, precision, recall, and F1 score

In [34]:
def compute_metrics(labels, preds):
    """Score predictions against gold labels and log the results.

    Precision, recall and F1 are macro-averaged over the classes; accuracy
    is the plain fraction of matching entries. Each value is written to the
    log at INFO level.

    Args:
        labels: Gold class ids.
        preds: Predicted class ids, aligned with ``labels``.

    Returns:
        ``(accuracy, precision, recall, f1)``.
    """
    precision, recall, f1 = (
        scorer(labels, preds, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    )
    accuracy = accuracy_score(labels, preds)

    logging.info(f'accuracy: {accuracy}')
    logging.info(f'precision: {precision}')
    logging.info(f'recall: {recall}')
    logging.info(f'f1: {f1}\n')
    return accuracy, precision, recall, f1

### 13. evaluate the model

In [35]:
@torch.no_grad()
def eval_model(model, eval_loader):
    """Run ``model`` over ``eval_loader`` and return classification metrics.

    The model is put in eval mode for the duration of the pass and its
    previous train/eval mode is restored afterwards, even if a batch raises.
    Inputs are moved to the device the model's parameters live on, so the
    function does not depend on a notebook-level ``device`` variable.

    Args:
        model: Module whose forward pass accepts ``input_ids`` and returns a
            sequence whose first element holds the logits.
        eval_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'`` and ``'labels'`` tensors.

    Returns:
        ``(accuracy, precision, recall, f1)`` as produced by ``compute_metrics``.
    """
    was_training = model.training
    first_param = next(model.parameters(), None)
    target_device = (first_param.device
                     if first_param is not None else torch.device('cpu'))

    labels = []
    preds = []
    model.eval()
    try:
        for batch in eval_loader:
            input_ids = batch['input_ids'].to(target_device)
            # NOTE(review): only input_ids is forwarded. If the batches carry an
            # attention_mask (BertTokenizer normally returns one), padding
            # positions are not masked here. The training loop feeds the model
            # the same way, so change both together or not at all.
            outputs = model(input_ids)
            labels.extend(batch['labels'].cpu().numpy())
            preds.extend(torch.argmax(outputs[0], dim=-1).cpu().numpy())
    finally:
        # Restore whatever mode the caller had, rather than forcing train mode.
        model.train(was_training)

    accuracy, precision, recall, f1 = compute_metrics(labels, preds)
    return accuracy, precision, recall, f1

### 14. train the model

In [None]:
# --- Optimizer -----------------------------------------------------------
# Biases and LayerNorm weights are excluded from weight decay. The model's
# LayerNorm parameters are named '...LayerNorm.weight' / '...LayerNorm.bias'
# (not 'gamma' / 'beta'), and the optimizer reads the per-group option
# 'weight_decay' — a 'weight_decay_rate' key is silently ignored, which
# would leave every parameter without decay.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params':
    [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay':
    0.01
}, {
    'params':
    [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay':
    0.0
}]
optim = AdamW(optimizer_grouped_parameters, lr=1e-5)

# --- Training state ------------------------------------------------------
step = 0          # global optimisation step, used as the TensorBoard x-axis
best_acc = 0      # best validation accuracy seen so far
num_epochs = 10   # kept separate from the loop variable `epoch`
path = 'model_best_bert'  # checkpoint directory; TensorBoard events go here too
writer = SummaryWriter(log_dir=path)

try:
    for epoch in tqdm(range(num_epochs), desc='Epoch'):
        # len(train_loader) counts the final partial batch as well.
        for batch in tqdm(train_loader, total=len(train_loader), desc='Batch'):
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            # NOTE(review): only input_ids and labels are passed. If the batches
            # carry an attention_mask, padding positions are not masked here.
            # eval_model feeds the model the same way, so change both together.
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs[0]  # first output is the loss when labels are given
            logging.info(
                f'Epoch-{epoch}, Step-{step}, Loss: {loss.cpu().detach().numpy()}')
            step += 1
            loss.backward()
            optim.step()
            writer.add_scalar('train_loss', loss.item(), step)

        logging.info(
            f'Epoch {epoch}, present best acc: {best_acc}, start evaluating.')
        accuracy, precision, recall, f1 = eval_model(model, eval_loader)
        writer.add_scalar('dev_accuracy', accuracy, step)
        writer.add_scalar('dev_precision', precision, step)
        writer.add_scalar('dev_recall', recall, step)
        writer.add_scalar('dev_f1', f1, step)

        # Keep only the checkpoint with the best validation accuracy.
        if accuracy > best_acc:
            model.save_pretrained(path)
            tokenizer.save_pretrained(path)
            best_acc = accuracy
finally:
    # Flush pending events and release the file handle even if training is interrupted.
    writer.close()



HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(HTML(value='Batch'), FloatProgress(value=0.0, max=445.0), HTML(value='')))