In [1]:
import json
import pandas as pd
import numpy as np
import re
from sklearn import metrics
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from tqdm import tqdm
import math
import random
import os
import pickle
# Default to GPU index 1, but do not clobber a device selection that the launcher
# (shell, scheduler, container) already exported. This must run before CUDA is first
# initialised for the setting to take effect.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "1")
# String device tag for ad-hoc use in the notebook: 'cuda' when a GPU is visible, else 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
# Logging configuration for the notebook.
import logging

# Root logger: INFO level, each record prefixed with a month/day/year timestamp.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(message)s",
)
# Only let errors through from the transformers library's logger.
logging.getLogger('transformers').setLevel(logging.ERROR)
# Logger used by the training code in this notebook.
logger = logging.getLogger(__name__)

In [3]:
# Load the labelled-user table; later cells read its 'tweets' (text) and 'age' (label) columns.
df = pd.read_csv('clean_dataset_1145.csv')

In [4]:
# Turn missing tweets (NaN after read_csv) into empty strings and make every entry a str.
# Only whole missing cells are touched: stripping the substring "nan" from every row would
# also damage ordinary words such as "banana" or "finance".
df['tweets'] = df['tweets'].fillna('').astype(str)

In [5]:
# Preview the loaded table (the rendered output elides the middle rows).
df

Unnamed: 0,age,screen_name,name,lang,description,tweets,img_path
0,1,_____zac_____,zac ¢,en,_____Û___È_Ü´Ù,".The owner of drip doesnt even have 100 mill,...",labeled_users_1145/profile pics test/0.jpeg
1,0,___aleia,_æ___ dad ___æ_,en,BLACK. LIVES. MATTER.,I haven’t talked to this girl since my sophomo...,labeled_users_1145/profile pics test/1.jpeg
2,0,___schaeffer___,Brenden Schaeffer,en,Culver-Stockton College '20 ¢ Ô_Ô_Ô KM 1548...,☝🏼👋🏼 ://t.co/7NcaO1fyc5 ://t.co/bkhrNcvp6Q 37...,labeled_users_1145/profile pics test/3.jpeg
3,0,__EmilyRice__,em,en,#TXST22,yes but come to san marcos and live with me 🥰...,labeled_users_1145/profile pics test/9.jpeg
4,0,__ginaaaa__,Gina Marano,en,WVU Nursing 20,small :).Go get ready for dinner. JACK are ...,labeled_users_1145/profile pics test/10.jpeg
...,...,...,...,...,...,...,...
1036,0,Zgs_Apollo,Anthony Sharp,en,22 Youtuber & Twitch Streamer trying to live t...,"Goodnight my friends, much love.Drop a ❤️ if t...",labeled_users_1145/profile pics test/3266.jpeg
1037,0,zmeadows_18,Z Meadows,und,|OUCÈ23__|,ROLL BOBBIES ROLL💚🖤💚🖤💚.We Are Texans! Im takin...,labeled_users_1145/profile pics test/3268.jpeg
1038,0,ZoeCalamaco,Zoe _,no,Angelo state,one person followed me // automatically checke...,labeled_users_1145/profile pics test/3272.jpeg
1039,1,ZoPeachy,Zobella Thee Alpha __ê_____´Ù__ ...,en,Harlot for hire. FinDom. 27. Nonbinary. they/t...,Good morning! Say it back ♡ Friday! Send to p...,labeled_users_1145/profile pics test/3274.jpeg


In [6]:
# Peek at the text column that is later tokenized as model input.
df['tweets']

0        .The owner of drip doesnt even have 100 mill,...
1       I haven’t talked to this girl since my sophomo...
2       ☝🏼👋🏼 ://t.co/7NcaO1fyc5 ://t.co/bkhrNcvp6Q  37...
3        yes but come to san marcos and live with me 🥰...
4        small :).Go get ready for dinner.   JACK are ...
                              ...                        
1036    Goodnight my friends, much love.Drop a ❤️ if t...
1037    ROLL BOBBIES ROLL💚🖤💚🖤💚.We Are Texans! Im takin...
1038    one person followed me // automatically checke...
1039    Good morning! Say it back ♡  Friday! Send to p...
1040    i just know willows next album is gonna set me...
Name: tweets, Length: 1041, dtype: object

In [7]:
# Peek at the label column used as the classification target.
df['age']

0       1
1       0
2       0
3       0
4       0
       ..
1036    0
1037    0
1038    0
1039    1
1040    0
Name: age, Length: 1041, dtype: int64

In [8]:
# Class balance of the target: the output below shows about 70% class 0 and 30% class 1,
# so plain accuracy is close to 0.70 for a model that always predicts class 0.
df['age'].value_counts()

0    729
1    312
Name: age, dtype: int64

In [9]:
class TextDataSet(Dataset):
    """Index-aligned container of token ids, attention masks and labels.

    The three arguments are stored as given; item ``i`` of the dataset is the tuple
    ``(input_ids[i], attention_mask[i], label[i])`` and the dataset length is
    ``len(input_ids)``.
    """

    def __init__(self, input_ids, attention_mask, label):
        self.input_ids, self.attention_mask, self.label = input_ids, attention_mask, label

    def __len__(self):
        # The number of samples is defined by the token-id container.
        return len(self.input_ids)

    def __getitem__(self, idx):
        ids = self.input_ids[idx]
        mask = self.attention_mask[idx]
        target = self.label[idx]
        return ids, mask, target

In [10]:
def encode_data(text_list, labels, ratio=0.8, max_length=128, seed=None):
    """Tokenize texts and randomly split them into train / validation datasets.

    Uses the notebook-level ``tokenizer`` and the ``TextDataSet`` class.

    Args:
        text_list: list of strings to encode.
        labels: integer class ids, one per entry of ``text_list``.
        ratio: fraction of samples assigned to the training split (the rest is validation).
        max_length: every sequence is truncated / padded to exactly this many tokens.
        seed: optional int. When given, the split is drawn from a dedicated generator
            seeded with it, so the same inputs always produce the same split. When None,
            the global torch RNG is used.

    Returns:
        ``(train_dataset, valid_dataset)`` as produced by ``random_split``.

    Raises:
        ValueError: if ``text_list`` and ``labels`` have different lengths.
    """
    if len(text_list) != len(labels):
        raise ValueError(
            'text_list and labels must have the same length, got {} and {}'.format(
                len(text_list), len(labels)))

    inputs = tokenizer.batch_encode_plus(
        text_list,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
        max_length=max_length,
    )
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    # Build an integer tensor directly instead of round-tripping through float32.
    labels = torch.as_tensor(labels, dtype=torch.long)

    dataset = TextDataSet(input_ids, attention_mask, labels)
    train_size = int(len(dataset) * ratio)
    valid_size = len(dataset) - train_size
    sizes = [train_size, valid_size]
    if seed is None:
        train_dataset, valid_dataset = random_split(dataset, sizes)
    else:
        # A private generator keeps the split reproducible without touching the global RNG.
        split_rng = torch.Generator().manual_seed(seed)
        train_dataset, valid_dataset = random_split(dataset, sizes, generator=split_rng)
    print('Train samples: {}  Valid samples: {}'.format(len(train_dataset), len(valid_dataset)))

    return train_dataset, valid_dataset

In [11]:
def process_tweet(tweet):
    """Strip links, emoji, hash signs, stock tickers and a leading "RT" from a tweet.

    Input:
        tweet: a string containing a tweet (or several tweets joined together)
    Output:
        the cleaned text as a single string; surrounding whitespace is left untouched

    """
    # remove hyperlinks: only the URL token itself, not the text that follows it
    tweet = re.sub(r'https?://\S+', '', tweet)
    # remove emoji (every code point outside the Basic Multilingual Plane)
    tweet = re.sub(r'[\U00010000-\U0010ffff]', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" at the very start of the string
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    return tweet

In [12]:
class Trainer:
    """Training / evaluation loop for a sequence-classification model.

    The model is called as ``model(input_ids, attention_mask, labels=y)`` and its result
    must expose ``.logits``. Every batch from either loader must unpack to
    ``(input_ids, attention_mask, y)``.

    Besides the constructor arguments, ``train`` relies on these notebook-level names:
    ``loss_fct``, ``logger``, ``tqdm``, ``accuracy_score`` and ``classification_report``.
    """

    def __init__(self, model, train_loader, valid_loader, config):
        """Store the collaborators and move the model to the GPU when one is available.

        Args:
            model: the network to train.
            train_loader: iterable of training batches.
            valid_loader: iterable of validation batches, or None to skip validation.
            config: object providing max_epochs, learning_rate, betas, grad_norm_clip,
                lr_decay, warmup_tokens, final_tokens, ckpt_path and model_name.
        """
        self.model = model
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.config = config

        # Single-device setup: current CUDA device when available, otherwise CPU.
        # (The model is not wrapped in DataParallel.)
        self.device = 'cpu'
        if torch.cuda.is_available():
            self.device = torch.cuda.current_device()
            self.model = self.model.to(self.device)

    def save_checkpoint(self):
        """Write the model's state_dict to ``config.ckpt_path / config.model_name``."""
        # DataParallel wrappers keep raw model object in .module attribute
        raw_model = self.model.module if hasattr(self.model, "module") else self.model
        os.makedirs(self.config.ckpt_path, exist_ok=True)
        save_path = os.path.join(self.config.ckpt_path, self.config.model_name)
        torch.save(raw_model.state_dict(), save_path)
        logger.info("Save model to {}".format(save_path))

    def train(self):
        """Run a baseline validation pass, then ``config.max_epochs`` train/validate epochs.

        After each epoch a checkpoint is saved when the validation loss is lower than the
        best one seen in earlier epochs, or unconditionally when there is no validation
        loader. Nothing is saved when ``config.ckpt_path`` is None. Returns None.
        """
        model, config = self.model, self.config
        # NOTE: config.weight_decay is not forwarded here, so AdamW uses its own default.
        optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate, betas=config.betas)

        def run_epoch(split, epoch=None):
            """Run one pass over the 'train' or 'valid' loader.

            Returns the mean validation loss for 'valid' and None for 'train'.
            ``epoch`` is only used for the training progress message.
            """
            is_train = (split == 'train')
            model.train(is_train)
            loader = self.train_loader if is_train else self.valid_loader

            losses = []
            all_y = []
            all_y_pred = []
            pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader)
            for it, (input_ids, attention_mask, y) in pbar:
                # place data on the correct device
                input_ids = input_ids.to(self.device)
                attention_mask = attention_mask.to(self.device)
                y = y.to(self.device)
                # Samples actually in this batch; the last batch may be smaller than the
                # loader's nominal batch size.
                n_samples = y.size(0)
                # forward the model
                with torch.set_grad_enabled(is_train):
                    outputs = model(input_ids, attention_mask, labels=y)
                    logits = outputs.logits
                    loss = loss_fct(logits, y)
                    loss = loss.mean()  # collapse all losses if they are scattered on multiple gpus
                    losses.append(loss.item())
                    y_pred = torch.argmax(logits, dim=1)
                    y_true = y.cpu().detach().numpy()
                    y_pred = y_pred.cpu().detach().numpy()
                    step_score = accuracy_score(y_true, y_pred)
                    all_y.extend(y_true)
                    all_y_pred.extend(y_pred)

                if is_train:

                    # backprop and update the parameters
                    model.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
                    optimizer.step()

                    # decay the learning rate based on our progress
                    if config.lr_decay:
                        # The schedule counter advances by the number of samples processed.
                        self.tokens += n_samples
                        if self.tokens < config.warmup_tokens:
                            # linear warmup
                            lr_mult = float(self.tokens) / float(max(1, config.warmup_tokens))
                        else:
                            # cosine learning rate decay, floored at 10% of the base rate
                            progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens))
                            lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))
                        lr = config.learning_rate * lr_mult
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr
                    else:
                        lr = config.learning_rate

                    # report progress
                    pbar.set_description("epoch {} iter {}: train loss {:.5f}, score {:.2f}%, lr {:e}".format(epoch+1,it,loss.item(),step_score*100,lr))

            if not is_train:
                valid_loss = float(np.mean(losses))
                all_y_pred = np.array(all_y_pred)
                all_y = np.array(all_y)
                valid_score = accuracy_score(all_y, all_y_pred)
                logger.info("valid loss: %f", valid_loss)
                logger.info("valid score: %f", valid_score)
                print(classification_report(y_true=all_y, y_pred=all_y_pred))
                return valid_loss

        self.tokens = 0  # counter used for learning rate decay
        best_loss = float('inf')
        has_valid = self.valid_loader is not None
        if has_valid:
            # Baseline metrics before any parameter update; not used for checkpoint selection.
            run_epoch('valid')
        for epoch in range(config.max_epochs):

            run_epoch('train', epoch)
            if has_valid:
                valid_loss = run_epoch('valid', epoch)
                good_model = valid_loss < best_loss
            else:
                # No validation set: every epoch counts as an improvement.
                valid_loss = None
                good_model = True
            # Keep the best model by validation loss, or always save without a validation set.
            if config.ckpt_path is not None and good_model:
                if valid_loss is not None:
                    best_loss = valid_loss
                self.save_checkpoint()

In [13]:
class TrainerConfig:
    """Hyper-parameters and checkpoint settings read by ``Trainer``.

    The class attributes are the defaults. Any keyword passed to the constructor is
    echoed with ``print`` and then set on the instance, overriding the default.
    """

    # --- optimization ---
    max_epochs = 10
    learning_rate = 1e-5
    betas = (0.9, 0.95)
    grad_norm_clip = 1.0
    # Declared for completeness; Trainer.train does not pass it to the optimizer.
    weight_decay = 0.1

    # --- learning-rate schedule: linear warmup, then cosine decay down to 10% ---
    lr_decay = False
    # Counter value at which warmup ends.
    warmup_tokens = 375e6
    # Counter value at which the cosine decay reaches its floor.
    final_tokens = 260e9

    # --- checkpointing ---
    # Directory the model is saved into.
    ckpt_path = './models_dir'
    model_name = "model_race.pt"

    def __init__(self, **kwargs):
        for name in kwargs:
            value = kwargs[name]
            print(name, value)
            setattr(self, name, value)

In [14]:
# Tokenizer for the bert-base-uncased checkpoint; encode_data reads this notebook-level name.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [15]:
# Use every row as-is: texts come from 'tweets', labels from 'age'.
# No per-class subsampling is applied.
train_data = df['tweets'].to_list()
train_label = df['age'].to_list()

In [16]:
# Seed torch's global RNG so the random train/valid split and the training shuffle
# are the same on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

train_dataset, valid_dataset = encode_data(train_data, train_label)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Validation metrics do not depend on batch order, so keep the order fixed.
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

Train samples: 832  Valid samples: 209


In [17]:
# Loss used by Trainer (read as a notebook-level name) and the BERT classifier with a 2-class head.
loss_fct = nn.CrossEntropyLoss()
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Report the total parameter count, in millions.
n_params = sum(p.numel() for p in model.parameters())
print('{} : all params: {:4f}M'.format(model._get_name(), n_params / 1000 / 1000))

BertForSequenceClassification : all params: 109.483778M


In [18]:
# Schedule length in the units Trainer counts: epochs x batch size x batches per epoch.
max_epochs = 10
steps_per_epoch = len(train_loader)
final_tokens = max_epochs * batch_size * steps_per_epoch
# Warm up over the first 10% of the schedule.
warmup_tokens = final_tokens // 10
tconf = TrainerConfig(
    max_epochs=max_epochs,
    learning_rate=1e-5,
    lr_decay=True,
    warmup_tokens=warmup_tokens,
    final_tokens=final_tokens,
)

max_epochs 10
learning_rate 1e-05
lr_decay True
warmup_tokens 832
final_tokens 8320


In [19]:
# Wire the model, both loaders and the config into the trainer, then run the full schedule.
trainer = Trainer(model=model, train_loader=train_loader, valid_loader=valid_loader, config=tconf)
trainer.train()

12/07/2021 22:40:40 - valid loss: 0.703308
12/07/2021 22:40:40 - valid score: 0.397129


              precision    recall  f1-score   support

           0       0.74      0.19      0.31       144
           1       0.32      0.85      0.47        65

    accuracy                           0.40       209
   macro avg       0.53      0.52      0.39       209
weighted avg       0.61      0.40      0.36       209



epoch 1 iter 25: train loss 0.46862, score 81.25%, lr 1.000000e-05: 100%|████████████████████████████████| 26/26 [00:07<00:00,  3.49it/s]
12/07/2021 22:40:48 - valid loss: 0.617577
12/07/2021 22:40:48 - valid score: 0.688995
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.69      1.00      0.82       144
           1       0.00      0.00      0.00        65

    accuracy                           0.69       209
   macro avg       0.34      0.50      0.41       209
weighted avg       0.47      0.69      0.56       209



12/07/2021 22:40:52 - Save model to ./models_dir/model_race.pt
epoch 2 iter 25: train loss 0.53344, score 78.12%, lr 9.698463e-06: 100%|████████████████████████████████| 26/26 [00:07<00:00,  3.68it/s]
12/07/2021 22:40:59 - valid loss: 0.584370
12/07/2021 22:40:59 - valid score: 0.688995
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.69      1.00      0.82       144
           1       0.00      0.00      0.00        65

    accuracy                           0.69       209
   macro avg       0.34      0.50      0.41       209
weighted avg       0.47      0.69      0.56       209



12/07/2021 22:41:03 - Save model to ./models_dir/model_race.pt
epoch 3 iter 25: train loss 0.51128, score 81.25%, lr 8.830222e-06: 100%|████████████████████████████████| 26/26 [00:06<00:00,  3.72it/s]
12/07/2021 22:41:11 - valid loss: 0.574160
12/07/2021 22:41:11 - valid score: 0.684211


              precision    recall  f1-score   support

           0       0.69      0.99      0.81       144
           1       0.00      0.00      0.00        65

    accuracy                           0.68       209
   macro avg       0.34      0.50      0.41       209
weighted avg       0.47      0.68      0.56       209



12/07/2021 22:41:14 - Save model to ./models_dir/model_race.pt
epoch 4 iter 25: train loss 0.59815, score 68.75%, lr 7.500000e-06: 100%|████████████████████████████████| 26/26 [00:07<00:00,  3.66it/s]
12/07/2021 22:41:22 - valid loss: 0.582817
12/07/2021 22:41:22 - valid score: 0.693780


              precision    recall  f1-score   support

           0       0.72      0.92      0.80       144
           1       0.52      0.20      0.29        65

    accuracy                           0.69       209
   macro avg       0.62      0.56      0.55       209
weighted avg       0.66      0.69      0.64       209



epoch 5 iter 25: train loss 0.44846, score 78.12%, lr 5.868241e-06: 100%|████████████████████████████████| 26/26 [00:07<00:00,  3.69it/s]
12/07/2021 22:41:30 - valid loss: 0.586911
12/07/2021 22:41:30 - valid score: 0.708134


              precision    recall  f1-score   support

           0       0.75      0.86      0.80       144
           1       0.55      0.37      0.44        65

    accuracy                           0.71       209
   macro avg       0.65      0.62      0.62       209
weighted avg       0.69      0.71      0.69       209



epoch 6 iter 25: train loss 0.33576, score 87.50%, lr 4.131759e-06: 100%|████████████████████████████████| 26/26 [00:07<00:00,  3.69it/s]
12/07/2021 22:41:37 - valid loss: 0.590936
12/07/2021 22:41:37 - valid score: 0.708134


              precision    recall  f1-score   support

           0       0.76      0.85      0.80       144
           1       0.54      0.40      0.46        65

    accuracy                           0.71       209
   macro avg       0.65      0.62      0.63       209
weighted avg       0.69      0.71      0.69       209



epoch 7 iter 25: train loss 0.38200, score 87.50%, lr 2.500000e-06: 100%|████████████████████████████████| 26/26 [00:07<00:00,  3.69it/s]
12/07/2021 22:41:45 - valid loss: 0.659406
12/07/2021 22:41:45 - valid score: 0.693780


              precision    recall  f1-score   support

           0       0.75      0.84      0.79       144
           1       0.51      0.37      0.43        65

    accuracy                           0.69       209
   macro avg       0.63      0.60      0.61       209
weighted avg       0.67      0.69      0.68       209



epoch 8 iter 25: train loss 0.32676, score 87.50%, lr 1.169778e-06: 100%|████████████████████████████████| 26/26 [00:07<00:00,  3.71it/s]
12/07/2021 22:41:52 - valid loss: 0.636753
12/07/2021 22:41:52 - valid score: 0.679426


              precision    recall  f1-score   support

           0       0.77      0.77      0.77       144
           1       0.48      0.48      0.48        65

    accuracy                           0.68       209
   macro avg       0.62      0.62      0.62       209
weighted avg       0.68      0.68      0.68       209



epoch 9 iter 25: train loss 0.25552, score 96.88%, lr 1.000000e-06: 100%|████████████████████████████████| 26/26 [00:07<00:00,  3.71it/s]
12/07/2021 22:42:00 - valid loss: 0.613113
12/07/2021 22:42:00 - valid score: 0.688995


              precision    recall  f1-score   support

           0       0.75      0.82      0.78       144
           1       0.50      0.40      0.44        65

    accuracy                           0.69       209
   macro avg       0.63      0.61      0.61       209
weighted avg       0.67      0.69      0.68       209



epoch 10 iter 25: train loss 0.32243, score 90.62%, lr 1.000000e-06: 100%|███████████████████████████████| 26/26 [00:07<00:00,  3.71it/s]
12/07/2021 22:42:07 - valid loss: 0.640485
12/07/2021 22:42:07 - valid score: 0.698565


              precision    recall  f1-score   support

           0       0.77      0.80      0.78       144
           1       0.52      0.48      0.50        65

    accuracy                           0.70       209
   macro avg       0.64      0.64      0.64       209
weighted avg       0.69      0.70      0.70       209

