In [21]:
!pip install -q tensorflow-text
!pip install -q tf-models-official

In [22]:
# Make Google Drive visible to this Colab runtime under /content/drive.
from google.colab import drive

drive.mount(mountpoint="/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device exposed by the Colab runtime
# and stop right away when it is not the expected first GPU.
device_name = tf.test.gpu_device_name()

if device_name != '/device:GPU:0':
  raise SystemError('No  GPU')

print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [24]:
import torch

# Prefer the CUDA device when PyTorch can see one, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
  print('There are %d GPU(s) available.' % torch.cuda.device_count())
  print('We wil use the GPU: ', torch.cuda.get_device_name(0))
else:
  print('you have no gpu')

There are 1 GPU(s) available.
We wil use the GPU:  Tesla T4


In [25]:
!pip install transformers



In [26]:
import pandas as pd
import numpy as np
import numpy as np
import time
import datetime
import random
import matplotlib.pyplot as plt
import seaborn as sns
import os

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [27]:
# Load the cleaned tweets from Google Drive. The clean data is also in this
# repository, so change the path accordingly when running elsewhere.
# The `class` column is renamed to `label` on the way in.
df = (
    pd.read_csv('/content/drive/My Drive/SJSU/Sem 8/CMPE 188/clean_v1.csv')
    .rename(columns = {'class' : 'label'})
)
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,category,label
0,0,As a woman you shouldn't complain about clean...,neither,2
1,1,boy dats cold tyga dwn bad for cuffin dat hoe...,offensive_lang,1
2,2,Dawg You ever fuck a bitch and she start to c...,offensive_lang,1
3,3,she look like a tranny,offensive_lang,1
4,4,The shit you hear about me might be true or i...,offensive_lang,1


In [28]:
# Load the tokenizer that belongs to the uncased base BERT checkpoint.
print('Loading BERT tokenizer')
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True,
)

Loading BERT tokenizer


In [29]:
# Tokenize every tweet with the BERT tokenizer, including the special tokens.
tweets = df.tweet.values
label = df.label.values

# One list of token ids per tweet, in the same order as `tweets`.
input_id = [
    tokenizer.encode(text, add_special_tokens = True)
    for text in tweets
]

# Show the first tweet before and after tokenization as a sanity check.
print('Original tweet: ',  tweets[0])
print('Tokenized tweet: ', input_id[0])

Original tweet:   As a woman you shouldn't complain about cleaning up your house as a man you should always take the trash out 
Tokenized tweet:  [101, 2004, 1037, 2450, 2017, 5807, 1005, 1056, 17612, 2055, 9344, 2039, 2115, 2160, 2004, 1037, 2158, 2017, 2323, 2467, 2202, 1996, 11669, 2041, 102]


In [30]:
# Length (in tokens) of the longest tokenized tweet.
print('Max tweet length: ', max(map(len, input_id)))

Max tweet length:  68


In [31]:
# Every sequence is padded/truncated to MAX_LEN tokens. The value is
# arbitrarily chosen, just over the length of the longest tokenized tweet.
MAX_LEN = 75

print('\nPadding/truncating all sentences')
print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Pad with the tokenizer's own pad id (instead of a hard-coded 0) so the padded
# ids always agree with the padding token reported above. Both padding and
# truncation are applied at the end ("post") of each sequence.
input_id = pad_sequences(input_id, maxlen=MAX_LEN, dtype="long",
                         value=tokenizer.pad_token_id, truncating="post", padding="post")

print('\n Done')


Padding/truncating all sentences

Padding token: "[PAD]", ID: 0

 Done


In [32]:
# Build one attention mask per padded tweet: 1 marks a real token and 0 marks
# a padding position, so the model can tell the two apart.
# Positions are compared against the tokenizer's pad id rather than assuming
# that padding is always id 0.
attention_masks = [
    [int(token_id != tokenizer.pad_token_id) for token_id in tweet]
    for tweet in input_id
]

In [33]:
# Hold out 10% of the data for validation. Inputs, attention masks and labels
# are split in ONE call so the three stay row-aligned by construction, instead
# of relying on two separate calls sharing the same seed and test size.
train_input, val_input, train_mask, val_mask, train_label, val_label = train_test_split(
    input_id, attention_masks, label, random_state=2018, test_size=0.1
)

In [34]:
# Convert each train/validation pair to torch tensors.
train_input, val_input = (torch.tensor(ids) for ids in (train_input, val_input))
train_label, val_label = (torch.tensor(lbl) for lbl in (train_label, val_label))
train_mask, val_mask = (torch.tensor(msk) for msk in (train_mask, val_mask))

In [35]:
batch_size = 32 #use batch size of 32 but 16 was an alternative here

# Training batches are drawn in random order.
train_data = TensorDataset(train_input, train_mask, train_label)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size=batch_size)

# Validation only measures performance, so shuffling buys nothing: iterate the
# held-out set sequentially to keep every evaluation pass in the same order.
val_data = TensorDataset(val_input, val_mask, val_label)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

In [36]:
# Load the pretrained BERT checkpoint with a sequence-classification head
# that has three output labels.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 3,
    output_attentions = False,      # do not return attention weights
    output_hidden_states = False    # do not return all hidden states
)

# Move the model to the device selected earlier (instead of forcing CUDA), so
# it always lives on the same device the batches are sent to with `.to(device)`.
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [37]:
# One scheduler step is taken per training batch, so the total number of
# steps is batches-per-epoch times the number of epochs.
epochs = 4 #we are using the maximum recommended number of epochs
total_steps = epochs * len(train_dataloader)

optimizer = AdamW(
    model.parameters(),
    lr = 5e-5,   #this is the largest recommended learning rate from the paper
    eps = 1e-8,  #used to prevent division by zero during implementation
)

# Learning-rate schedule with zero warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [38]:
def flat_accuracy(preds, labels):
  """Return the fraction of rows whose arg-max prediction equals the label.

  preds:  2-D array of per-class scores, one row per example.
  labels: array of true class ids; it is flattened before comparison.
  """
  predicted_ids = np.argmax(preds, axis=1).flatten()
  true_ids = labels.flatten()
  n_correct = (predicted_ids == true_ids).sum()
  return n_correct / true_ids.size

def format_time(elapsed):
  """Format a duration given in seconds as an `h:mm:ss` string.

  The duration is rounded to the nearest whole second first.
  """
  whole_seconds = int(round(elapsed))
  return str(datetime.timedelta(seconds=whole_seconds))

In [39]:
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Seed every random number generator used below so runs are repeatable.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch, kept so it can be plotted later.
loss_values = []

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================
    # One full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Start of the training epoch, used for the elapsed-time reports.
    t0 = time.time()

    # Running sum of the batch losses for this epoch.
    total_loss = 0

    # `train()` only switches the model into training mode (dropout behaves
    # differently in training vs. evaluation); it does not train anything.
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches (but not on the very first one).
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` holds three tensors, copied to the target device here:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Gradients accumulate across backward passes unless cleared, so
        # reset them before each new step.
        model.zero_grad()

        # Forward pass. Because `labels` is supplied, the first element of
        # the output is the loss.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs[0]

        # `.item()` extracts the Python number from the single-value tensor.
        total_loss += loss.item()

        # Backward pass to compute the gradients.
        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Apply the parameter update, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

    # Average loss over all training batches of this epoch.
    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    # After each training epoch, measure performance on the validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode: dropout layers behave differently than in training.
    model.eval()

    # Accuracy is accumulated per EXAMPLE, not per batch: averaging the
    # per-batch accuracies would give the (usually smaller) final batch the
    # same weight as a full batch and skew the reported number.
    eval_correct, nb_eval_examples = 0, 0

    for batch in val_dataloader:

        # Move the batch to the target device and unpack it.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for validation; skipping them saves memory
        # and time.
        with torch.no_grad():
            # No labels are passed here, so the first output element is the
            # logits rather than the loss.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Logits are the raw scores before any softmax.
        logits = outputs[0]

        # Move logits and labels to the CPU as numpy arrays.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight this batch's accuracy by its number of examples.
        batch_examples = len(label_ids)
        eval_correct += flat_accuracy(logits, label_ids) * batch_examples
        nb_eval_examples += batch_examples

    # Report the accuracy over the whole validation set (the max() guard
    # avoids a division by zero if the validation loader is empty).
    print("  Accuracy: {0:.2f}".format(eval_correct / max(nb_eval_examples, 1)))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch    40  of    697.    Elapsed: 0:00:16.
  Batch    80  of    697.    Elapsed: 0:00:33.
  Batch   120  of    697.    Elapsed: 0:00:49.
  Batch   160  of    697.    Elapsed: 0:01:07.
  Batch   200  of    697.    Elapsed: 0:01:24.
  Batch   240  of    697.    Elapsed: 0:01:41.
  Batch   280  of    697.    Elapsed: 0:01:59.
  Batch   320  of    697.    Elapsed: 0:02:16.
  Batch   360  of    697.    Elapsed: 0:02:34.
  Batch   400  of    697.    Elapsed: 0:02:52.
  Batch   440  of    697.    Elapsed: 0:03:10.
  Batch   480  of    697.    Elapsed: 0:03:27.
  Batch   520  of    697.    Elapsed: 0:03:45.
  Batch   560  of    697.    Elapsed: 0:04:03.
  Batch   600  of    697.    Elapsed: 0:04:21.
  Batch   640  of    697.    Elapsed: 0:04:38.
  Batch   680  of    697.    Elapsed: 0:04:56.

  Average training loss: 0.29
  Training epcoh took: 0:05:04

Running Validation...
  Accuracy: 0.91
  Validation took: 0:00:13

Training...
  Batch    40  of    697.    Elapsed: 0:00:18.

In [40]:
# Export the fine-tuned model and its tokenizer to a local directory.
output_dir = './model_save/'

# exist_ok=True creates the directory only when it is missing, without a
# separate "check, then create" step.
os.makedirs(output_dir, exist_ok=True)

# If the model is wrapped in an object exposing `.module`, save the wrapped
# model; otherwise save the model itself.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./model_save/tokenizer_config.json',
 './model_save/special_tokens_map.json',
 './model_save/vocab.txt',
 './model_save/added_tokens.json')