In [1]:
import json
import pandas as pd
import numpy as np
import re
from sklearn import metrics
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from tqdm import tqdm
import math
import random
import os
import pickle
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
device = 'cuda' if torch.cuda.is_available() else 'cpu'   

In [2]:
# set up logging
import logging
logging.basicConfig(
        format="%(asctime)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
)
logger = logging.getLogger(__name__)
logging.getLogger('transformers').setLevel(logging.ERROR)

In [3]:
df = pd.read_csv('clean_dataset.csv')

In [4]:
df['tweets'] = df['tweets'].apply(lambda x:str(x).replace("nan",""))

In [5]:
df['race'] = df['race'].apply(lambda x:int(x)-1)

In [6]:
df

Unnamed: 0,user_id,race,age,tweets,name,screen_name,description,lang,img_path
0,12488,3,1,"YKAR, a futuristic sans serif font by #Freeb...",Chad Boyce,djsnipa1,"Multimedia Developer, Graphic Designer, DJ, an...",,profile pics/60147.jpeg
1,719703,3,1,"In other words, it’s good news about the vacci...",Tomato 😷,Tomato,🇭🇰Rise Up!,,profile pics/60148.jpeg
2,811618,2,1,Blah blah blah.I think RAPE is worst! RT Bein...,Mr.O,Putanginamo,http://t.co/UfipjuQ2Mw is a blog and talk show...,,profile pics/60152.jpeg
3,822540,3,1,❤️ 🙏. bonk. #FFXIV400kSweepstakes. Nice. Ed...,parker,parker,gotta go fast,,profile pics/60153.jpeg
4,865071,3,1,How about pizza dipped in water 🤦🏻‍♂️.Day 21 ...,Kevin Jones,kevinj,,,profile pics/60154.jpeg
...,...,...,...,...,...,...,...,...,...
3046,4892221799,3,1,What a beautiful thing to see this ...,Amanda,BohoGrlNxtDoor,Just trying to stand out in the crowd. Which i...,,profile pics/64265.jpeg
3047,4895390642,1,1,Inspired Attempt Tour: like when you first try...,Dalton's Eyeliner,FlyAwayEyeliner,"I'm Dalton's eyeliner, who are you? Are you Da...",,profile pics/64267.jpeg
3048,4921995243,3,1,yo,saved,memelady,,,profile pics/64269.jpeg
3049,4924158634,3,1,Those are amazing 😆.Looked over to see both b...,Winter da CoffeeCat,WinterStar21,"Big fan of cats, coffee, horror stuff and anim...",,profile pics/64270.jpeg


In [7]:
df['tweets']

0       YKAR, a futuristic sans serif font by   #Freeb...
1       In other words, it’s good news about the vacci...
2       Blah blah blah.I think RAPE is worst! RT  Bein...
3        ❤️ 🙏.  bonk. #FFXIV400kSweepstakes. Nice.  Ed...
4        How about pizza dipped in water 🤦🏻‍♂️.Day 21 ...
                              ...                        
3046               What a beautiful thing to see this ...
3047    Inspired Attempt Tour: like when you first try...
3048                                                   yo
3049     Those are amazing 😆.Looked over to see both b...
3050    Stop defaulting to You just cant see the good ...
Name: tweets, Length: 3051, dtype: object

In [8]:
df['race']

0       3
1       3
2       2
3       3
4       3
       ..
3046    3
3047    1
3048    3
3049    3
3050    3
Name: race, Length: 3051, dtype: int64

In [9]:
df['race'].value_counts()

3    2482
0     290
1     181
2      98
Name: race, dtype: int64

In [10]:
class TextDataSet(Dataset):
    def __init__(self, input_ids, attention_mask, label):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.label = label

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):   
        return (self.input_ids[idx], self.attention_mask[idx], self.label[idx])

In [11]:
def encode_data(text_list, labels, ratio=0.8):
    inputs = tokenizer.batch_encode_plus(
        text_list,                      
        add_special_tokens = True,             
        truncation=True,
        padding = 'max_length',     
        return_tensors = 'pt',
        max_length = 128
    )
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    labels = torch.Tensor(labels).long()
    
    dataset = TextDataSet(input_ids, attention_mask, labels)
    train_size = int(len(dataset)*ratio)
    valid_size = len(dataset) - train_size
    train_dataset, valid_dataset = random_split(dataset,[train_size,valid_size])
    print('Train samples: {}  Valid samples: {}'.format(len(train_dataset),len(valid_dataset)))
    
    return train_dataset, valid_dataset

In [12]:
def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove emoji
    tweet = re.sub(r'[\U00010000-\U0010ffff]', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    return tweet

In [13]:
class Trainer:

    def __init__(self, model, train_loader, valid_loader, config):
        self.model = model
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.config = config

        # take over whatever gpus are on the system
        self.device = 'cpu'
        if torch.cuda.is_available():
            self.device = torch.cuda.current_device()
#             self.model = torch.nn.DataParallel(self.model).to(self.device)
            self.model = self.model.to(self.device)

    def save_checkpoint(self):
        # DataParallel wrappers keep raw model object in .module attribute
        raw_model = self.model.module if hasattr(self.model, "module") else self.model
        os.makedirs(self.config.ckpt_path, exist_ok=True)
        save_path = os.path.join(self.config.ckpt_path, self.config.model_name)
        torch.save(raw_model.state_dict(), save_path)
        logger.info("Save model to {}".format(save_path))

    def train(self):
        model, config = self.model, self.config
        raw_model = model.module if hasattr(self.model, "module") else model
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=config.learning_rate, betas=config.betas)
        def run_epoch(split):
            is_train = (split == 'train')
            model.train(is_train)
            loader = self.train_loader if is_train else self.valid_loader
            
            losses = []
            all_y = []
            all_y_pred = []
            pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader)
            for it, (input_ids, attention_mask, y) in pbar:
                # place data on the correct device
#                 print(input_ids.shape,y.shape)
                input_ids = input_ids.to(self.device)
                attention_mask = attention_mask.to(self.device)
                y = y.to(self.device)
                # forward the model
                with torch.set_grad_enabled(is_train):
                    outputs = model(input_ids, attention_mask, labels=y)
                    logits = outputs.logits
                    loss = loss_fct(logits, y)
                    loss = loss.mean() # collapse all losses if they are scattered on multiple gpus
                    losses.append(loss.item())
                    y_pred = torch.argmax(logits, dim=1)
                    y = y.cpu().detach().numpy()
                    y_pred = y_pred.cpu().detach().numpy()
                    step_score = accuracy_score(y_pred, y)
                    all_y.extend(y)
                    all_y_pred.extend(y_pred)
                
                if is_train:

                    # backprop and update the parameters
                    model.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
                    optimizer.step()

                    # decay the learning rate based on our progress
                    if config.lr_decay:
                        self.tokens += batch_size # number of tokens processed this step (i.e. label is not -100)
                        if self.tokens < config.warmup_tokens:
                            # linear warmup
                            lr_mult = float(self.tokens) / float(max(1, config.warmup_tokens))
                        else:
                            # cosine learning rate decay
                            progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens))
                            lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))
                        lr = config.learning_rate * lr_mult
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr
                    else:
                        lr = config.learning_rate

                    # report progress
                    pbar.set_description("epoch {} iter {}: train loss {:.5f}, score {:.2f}%, lr {:e}".format(epoch+1,it,loss.item(),step_score*100,lr))

            if not is_train:
                valid_loss = float(np.mean(losses))
                all_y_pred = np.array(all_y_pred)
                all_y = np.array(all_y)
                valid_score = accuracy_score(all_y_pred, all_y)
                logger.info("valid loss: %f", valid_loss)
                logger.info("valid score: %f", valid_score)
                print(classification_report(y_true=all_y, y_pred=all_y_pred))
                return valid_loss

        self.tokens = 0 # counter used for learning rate decay
        best_loss = float('inf')
        valid_loss = run_epoch('valid')
        for epoch in range(config.max_epochs):
            
            run_epoch('train')
            if self.valid_loader is not None:
                valid_loss = run_epoch('valid')
            # supports early stopping based on the valid loss, or just save always if no valid set is provided
            good_model = self.valid_loader is None or valid_loss < best_loss
            if self.config.ckpt_path is not None and good_model:
                best_loss = valid_loss
                self.save_checkpoint()

In [14]:
class TrainerConfig:
    # optimization parameters
    max_epochs = 10
    learning_rate = 1e-5
    betas = (0.9, 0.95)
    grad_norm_clip = 1.0
    weight_decay = 0.1 # may useful optimize method
    # learning rate decay params: linear warmup followed by cosine decay to 10% of original
    lr_decay = False # optimize method
    warmup_tokens = 375e6 # use this to train model from a lower learning rate
    final_tokens = 260e9 # all tokens during whole training process
    # checkpoint settings
    ckpt_path = './models_dir' # save model path
    model_name = "model_race.pt"

    def __init__(self, **kwargs):
        for k,v in kwargs.items():
            print(k,v)
            setattr(self, k, v)

In [15]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [16]:
train_data = []
train_label = []
for t, r in zip(df['tweets'].to_list(),df['race'].to_list()):
    if r == 3:
#         if random.random() < 0.125:
        train_data.append(t)
        train_label.append(r)
    else:
        train_data.append(t)
        train_label.append(r)

In [17]:
train_dataset, valid_dataset = encode_data(train_data,train_label)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)

Train samples: 2440  Valid samples: 611


In [18]:
loss_fct = nn.CrossEntropyLoss()
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)
# print model all parameters and parameters need training
print('{} : all params: {:4f}M'.format(model._get_name(), sum(p.numel() for p in model.parameters()) / 1000 / 1000))

BertForSequenceClassification : all params: 109.485316M


In [19]:
max_epochs = 10
final_tokens = max_epochs * batch_size * len(train_loader)
warmup_tokens = final_tokens//10
tconf = TrainerConfig(max_epochs=max_epochs, learning_rate=1e-5, lr_decay=True, 
                      warmup_tokens=warmup_tokens, final_tokens=final_tokens)

max_epochs 10
learning_rate 1e-05
lr_decay True
warmup_tokens 2464
final_tokens 24640


In [None]:
trainer = Trainer(model, train_loader, valid_loader, tconf)
trainer.train()

12/07/2021 22:35:32 - valid loss: 1.664461
12/07/2021 22:35:32 - valid score: 0.037643
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        50
           1       0.06      0.16      0.09        32
           2       0.03      0.86      0.07        21
           3       0.00      0.00      0.00       508

    accuracy                           0.04       611
   macro avg       0.02      0.25      0.04       611
weighted avg       0.00      0.04      0.01       611



epoch 1 iter 76: train loss 0.53956, score 87.50%, lr 1.000000e-05: 100%|████████████████████████████████| 77/77 [00:21<00:00,  3.63it/s]
12/07/2021 22:35:55 - valid loss: 0.595232
12/07/2021 22:35:55 - valid score: 0.831424
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        50
           1       0.00      0.00      0.00        32
           2       0.00      0.00      0.00        21
           3       0.83      1.00      0.91       508

    accuracy                           0.83       611
   macro avg       0.21      0.25      0.23       611
weighted avg       0.69      0.83      0.75       611



12/07/2021 22:35:59 - Save model to ./models_dir/model_race.pt
epoch 2 iter 76: train loss 0.45668, score 87.50%, lr 9.698463e-06: 100%|████████████████████████████████| 77/77 [00:20<00:00,  3.71it/s]
12/07/2021 22:36:22 - valid loss: 0.571006
12/07/2021 22:36:22 - valid score: 0.831424
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        50
           1       0.00      0.00      0.00        32
           2       0.00      0.00      0.00        21
           3       0.83      1.00      0.91       508

    accuracy                           0.83       611
   macro avg       0.21      0.25      0.23       611
weighted avg       0.69      0.83      0.75       611



12/07/2021 22:36:26 - Save model to ./models_dir/model_race.pt
epoch 3 iter 76: train loss 0.52065, score 87.50%, lr 8.830222e-06: 100%|████████████████████████████████| 77/77 [00:20<00:00,  3.71it/s]
12/07/2021 22:36:48 - valid loss: 0.600868
12/07/2021 22:36:48 - valid score: 0.831424
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        50
           1       0.00      0.00      0.00        32
           2       0.00      0.00      0.00        21
           3       0.83      1.00      0.91       508

    accuracy                           0.83       611
   macro avg       0.21      0.25      0.23       611
weighted avg       0.69      0.83      0.75       611



epoch 4 iter 76: train loss 0.27814, score 87.50%, lr 7.500000e-06: 100%|████████████████████████████████| 77/77 [00:20<00:00,  3.72it/s]
12/07/2021 22:37:10 - valid loss: 0.576066
12/07/2021 22:37:10 - valid score: 0.826514
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        50
           1       0.00      0.00      0.00        32
           2       0.00      0.00      0.00        21
           3       0.83      0.99      0.91       508

    accuracy                           0.83       611
   macro avg       0.21      0.25      0.23       611
weighted avg       0.69      0.83      0.75       611



epoch 5 iter 76: train loss 0.99966, score 62.50%, lr 5.868241e-06: 100%|████████████████████████████████| 77/77 [00:20<00:00,  3.74it/s]
12/07/2021 22:37:32 - valid loss: 0.589041
12/07/2021 22:37:32 - valid score: 0.829787
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.40      0.04      0.07        50
           1       0.00      0.00      0.00        32
           2       0.00      0.00      0.00        21
           3       0.83      0.99      0.91       508

    accuracy                           0.83       611
   macro avg       0.31      0.26      0.24       611
weighted avg       0.73      0.83      0.76       611



epoch 6 iter 76: train loss 0.86954, score 62.50%, lr 4.131759e-06: 100%|████████████████████████████████| 77/77 [00:22<00:00,  3.41it/s]
12/07/2021 22:37:56 - valid loss: 0.660893
12/07/2021 22:37:56 - valid score: 0.828151
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.36      0.10      0.16        50
           1       0.00      0.00      0.00        32
           2       0.00      0.00      0.00        21
           3       0.84      0.99      0.91       508

    accuracy                           0.83       611
   macro avg       0.30      0.27      0.27       611
weighted avg       0.73      0.83      0.77       611



epoch 7 iter 76: train loss 0.56075, score 75.00%, lr 2.500000e-06: 100%|████████████████████████████████| 77/77 [00:20<00:00,  3.70it/s]
12/07/2021 22:38:18 - valid loss: 0.611969
12/07/2021 22:38:18 - valid score: 0.823241
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.35      0.16      0.22        50
           1       0.00      0.00      0.00        32
           2       0.00      0.00      0.00        21
           3       0.84      0.97      0.90       508

    accuracy                           0.82       611
   macro avg       0.30      0.28      0.28       611
weighted avg       0.73      0.82      0.77       611



epoch 8 iter 76: train loss 0.34358, score 87.50%, lr 1.169778e-06: 100%|████████████████████████████████| 77/77 [00:20<00:00,  3.72it/s]
12/07/2021 22:38:40 - valid loss: 0.623000
12/07/2021 22:38:40 - valid score: 0.824877
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.35      0.14      0.20        50
           1       0.00      0.00      0.00        32
           2       0.00      0.00      0.00        21
           3       0.84      0.98      0.90       508

    accuracy                           0.82       611
   macro avg       0.30      0.28      0.28       611
weighted avg       0.73      0.82      0.77       611



epoch 9 iter 20: train loss 0.35779, score 87.50%, lr 1.000000e-06:  27%|████████▋                       | 21/77 [00:05<00:15,  3.68it/s]