In [5]:
!pip install pytorch-transformers
!pip install pytorch-pretrained-bert pytorch-nlp
!pip install pytorch_pretrained_bert
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

Collecting pytorch-transformers
  Downloading pytorch_transformers-1.2.0-py3-none-any.whl (176 kB)
[K     |████████████████████████████████| 176 kB 5.0 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 49.8 MB/s 
Collecting boto3
  Downloading boto3-1.18.43-py3-none-any.whl (131 kB)
[K     |████████████████████████████████| 131 kB 66.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.6 MB/s 
Collecting s3transfer<0.6.0,>=0.5.0
  Downloading s3transfer-0.5.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 6.4 MB/s 
[?25hCollecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Collecting botocore<1.22.0,>=1.21.43
  Downloading botocore-1.21.43-py3-none-any.whl (7.9 MB)
[K     |███████████████

Collecting pytorch-pretrained-bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 5.1 MB/s 
[?25hCollecting pytorch-nlp
  Downloading pytorch_nlp-0.5.0-py3-none-any.whl (90 kB)
[K     |████████████████████████████████| 90 kB 8.0 MB/s 
Installing collected packages: pytorch-pretrained-bert, pytorch-nlp
Successfully installed pytorch-nlp-0.5.0 pytorch-pretrained-bert-0.6.2


# Load Twitter dataset

In [2]:
# Mount Google Drive at /content/drive (Colab only); the dataset is read from there below
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Do not truncate long cell contents when a frame is displayed
pd.set_option('display.max_colwidth', None)

# Read the spreadsheet, keep only the columns used in this notebook,
# and discard every row that has a missing value in any of them
columns_needed = ['preprocess_tweets', 'object', 'Emoji']
data = pd.read_excel("/content/drive/MyDrive/final_data.xlsx")[columns_needed].dropna()
print('We have', len(data), 'tweets in the dataset')

# Classification targets: the values of the Emoji column
labels = data.Emoji.values

We have 21238 tweets in the dataset


In [7]:
# Number of tweets per emoji class (to check that the classes are roughly balanced)
print(data['Emoji'].value_counts())

1    2426
7    2392
0    2280
2    2199
5    2150
8    2103
3    2042
4    1979
9    1843
6    1824
Name: Emoji, dtype: int64


# Train Topic Model (LDA)

In [8]:
# Convert the text data (tweets and objects) into count vectors over ONE shared vocabulary
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = CountVectorizer(analyzer='word',                  # word-level tokenization
                             min_df=2,                         # minimum required occurrences of a word
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # tokens of at least 3 alphanumeric chars
                             max_features=1800,                # max number of unique words
                            )

# Fit exactly once, on tweets and objects together. Calling fit_transform separately
# on each column would rebuild the vocabulary the second time, so column j of the two
# matrices would refer to different words and the stacked matrix fed to LDA (as well
# as later vectorizer.transform calls) would be inconsistent.
vectorizer.fit(list(data['preprocess_tweets']) + list(data['object']))

data_vectorized1 = vectorizer.transform(data['preprocess_tweets'])
data_vectorized2 = vectorizer.transform(data['object'])
print(data_vectorized1.shape)
print(data_vectorized2.shape)

(21238, 1800)
(21238, 1800)


In [9]:
# Fit the LDA topic model on the tweet and object count matrices stacked row-wise
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(
    n_components=90,            # number of topics
    learning_method='online',
    random_state=0,             # fixed seed
    n_jobs=-1,                  # use all available CPUs
)

# Both count matrices are densified and concatenated along the first axis
# (tweet rows first, then object rows) before fitting
lda_output = lda_model.fit_transform(
    np.concatenate(
        (data_vectorized1.toarray(), data_vectorized2.toarray())
    )
)

In [106]:
# calculate LDA prob for all tweets and objects in dataset
def predict_topic(text):
    """Return the LDA topic scores for every document in `text`.

    The documents are first turned into count vectors by the module-level
    `vectorizer`, and those vectors are then passed to `lda_model.transform`.

    text: iterable of documents accepted by `vectorizer.transform`.
    Returns whatever `lda_model.transform` returns for those vectors.
    """
    return lda_model.transform(vectorizer.transform(text))

# Topic scores for the tweets (q1) and for the objects (q2)
prob_scores_q1 = predict_topic(data['preprocess_tweets'])
prob_scores_q2 = predict_topic(data['object'])
for topic_scores in (prob_scores_q1, prob_scores_q2):
    print(topic_scores.shape)

(21238, 90)
(21238, 90)


# Preparing data to enter the BERT network

In [107]:
# Load the tokenizer (vocabulary) of the pre-trained 'bert-base-uncased' model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# function to tokenize and generate input ids for the tokens
# returns a list of input ids
def prep_data(ques1, ques2, return_segment_ids=False):
  """Tokenize text pairs and convert them to vocabulary ids.

  Each pair is laid out as ``[CLS] first [SEP] second [SEP]`` and tokenized with
  the module-level `tokenizer`.

  ques1: iterable of first texts (strings).
  ques2: iterable of second texts (strings), paired with `ques1` via zip.
  return_segment_ids: when True, also return one segment-id list per pair
    (0 for every token of the first part including [CLS] and its [SEP],
    1 for every token of the second part). Previously these ids were computed
    and thrown away; the default keeps the original return value.

  Returns a list of input-id lists, or the tuple
  (input-id lists, segment-id lists) when `return_segment_ids` is True.
  """
  all_input_ids = []
  all_segment_ids = []

  for (q1, q2) in zip(ques1, ques2):

    # first text is wrapped in [CLS] ... [SEP], second text is closed by [SEP]
    tokens_first = tokenizer.tokenize('[CLS] ' + q1 + ' [SEP] ')
    tokens_second = tokenizer.tokenize(q2 + ' [SEP] ')

    # ids for the concatenated token sequence of this pair
    all_input_ids.append(tokenizer.convert_tokens_to_ids(tokens_first + tokens_second))

    if return_segment_ids:
      all_segment_ids.append([0] * len(tokens_first) + [1] * len(tokens_second))

  if return_segment_ids:
    return all_input_ids, all_segment_ids
  return all_input_ids

# Token ids for every (tweet, object) pair in the dataset
all_input_ids = prep_data(data['preprocess_tweets'].values, data['object'].values)

In [109]:
# Pad every id sequence to the length of the longest one so they all have equal size
max_len = max((len(ids) for ids in all_input_ids), default=0)

# n is the common sequence length used for padding and, later, for slicing
n = max_len
print(max_len)
pad_input_ids = pad_sequences(
    all_input_ids,
    maxlen=n,
    dtype="long",
    truncating="post",
    padding="post",
)

280


In [110]:
# Attention masks: 1.0 where the token id is greater than 0, 0.0 elsewhere
# (i.e. 0.0 on the zero-valued padding positions)
attention_masks = [
    [float(token_id > 0) for token_id in seq]
    for seq in pad_input_ids
]

In [111]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

cuda


In [76]:
import torch.nn as nn
import torch.nn.functional as F
from pytorch_pretrained_bert import  BertModel

    
class my_BERT(nn.Module):
    '''BERT encoder whose pooled output is concatenated with topic features
    and mapped to class scores by a single bias-free linear layer.

    emb_size:    size of BERT's pooled output (768 for bert-base-uncased).
    topic_num:   number of topic features per text; the linear layer expects
                 2 * topic_num topic features in total (tweet + object).
    num_classes: number of output classes (defaults to the 10 emoji classes).
    '''
    def __init__(self, emb_size, topic_num, num_classes=10):

        super().__init__()
        self.tbert  = BertModel.from_pretrained('bert-base-uncased') # pretrained BERT
        # linear layer (948, num_classes)
        # (948 = 768:BERT   90:Tweet_LDA  90:object_LDA)
        self.linear = nn.Linear(emb_size + topic_num + topic_num, num_classes, bias=False)

    def forward(self, b_input_ids, attention_mask, topics, token_type_ids=None):
        '''Return unnormalised class scores (logits), one row per input sequence.

        b_input_ids:    token ids passed to BERT.
        attention_mask: mask passed to BERT as its attention mask.
        topics:         topic features concatenated to BERT's pooled output
                        along the last dimension.
        token_type_ids: optional segment ids forwarded to BERT.

        The scores are NOT passed through softmax: nn.CrossEntropyLoss applies
        log-softmax itself, and argmax over logits equals argmax over the
        softmax probabilities. Apply softmax on the result if probabilities
        are needed.
        '''
        # Keyword arguments are required here: the second positional parameter of
        # BertModel.forward is token_type_ids, so passing the mask positionally
        # would use it as segment ids and leave padding unmasked.
        _, pooled_layer = self.tbert(b_input_ids,
                                     token_type_ids=token_type_ids,
                                     attention_mask=attention_mask,
                                     output_all_encoded_layers=False)
        return self.linear(torch.cat((pooled_layer, topics), -1))


In [77]:
# Build the classifier (768-dim BERT output, 90 topic features per text) and move it to the selected device
model = my_BERT(emb_size=768,topic_num=90)
model.to(device)

my_BERT(
  (tbert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
          

In [112]:
# Split the data into training and validation sets (80% / 20%).
# Token ids and both topic-score matrices are concatenated column-wise so they are
# split together; the attention masks are split by a second call that uses the same
# random_state and test_size.
split_options = dict(random_state=2021, test_size=0.2)

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    np.concatenate((pad_input_ids, prob_scores_q1, prob_scores_q2), axis=-1),
    labels,
    **split_options,
)
train_masks, validation_masks, _, _ = train_test_split(
    attention_masks,
    pad_input_ids,
    **split_options,
)

In [113]:
# The first n columns of each split array are token ids; the remaining columns are
# the topic scores. Both slices are taken from the original array before the name
# train_inputs / validation_inputs is rebound to the id tensor.
train_topics, train_inputs = (
    torch.tensor(train_inputs[:, n:]),
    torch.tensor(train_inputs[:, :n]),
)
validation_topics, validation_inputs = (
    torch.tensor(validation_inputs[:, n:]),
    torch.tensor(validation_inputs[:, :n]),
)

# Labels and attention masks become tensors as well
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)

In [114]:
# Wrap the tensors in TensorDatasets and iterate over them in shuffled mini-batches.
# Each batch yields (input ids, attention mask, labels, topic scores), in that order.
batch_size = 10

train_data = TensorDataset(train_inputs, train_masks, train_labels, train_topics)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_topics)

train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(dataset=validation_data, batch_size=batch_size, shuffle=True)


In [115]:
import torch.nn as nn
import torch.optim as optim

# Multi-class cross-entropy loss, optimised with Adam over all model parameters
# NOTE(review): 3e-4 looks high for fine-tuning a pretrained BERT encoder — confirm it is intended
learning_rate = 3e-4
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [116]:
# Release unused cached GPU memory before training starts
torch.cuda.empty_cache()

In [119]:
# Function to calculate the accuracy of our predictions vs labels
def accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring column equals the label.

    preds:  2-D array of per-class scores; the prediction is the argmax over axis 1.
    labels: array of class indices, compared element-wise after flattening.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    hits = (predicted == expected).sum()
    return hits / len(expected)

In [None]:
# Per-batch training loss and accuracy, accumulated over all epochs
train_loss_set = []
train_acc_set = []


epochs = 50
from tqdm import trange
from sklearn.metrics import f1_score


# trange is a tqdm wrapper around the normal python range
for epoch in trange(epochs, desc="Epoch"):
  print('-'*8+"epoch:"+str(epoch)+'-'*8)

  # ---------------- Training ----------------
  model.train()

  # Running totals for this epoch
  tr_loss = 0
  train_accuracy = 0
  nb_tr_steps = 0

  for step, batch in enumerate(train_dataloader):

    # Batch layout follows the TensorDataset: ids, mask, labels, topics
    b_input_ids, b_input_mask, b_labels, b_topics = batch

    # Ids, mask and labels are cast to int64 and topics to float32,
    # then everything is moved to the model's device
    b_input_ids = b_input_ids.long().to(device)
    b_input_mask = b_input_mask.long().to(device)
    b_labels = b_labels.long().to(device)
    b_topics = b_topics.float().to(device)

    # Clear gradients left over from the previous step
    optimizer.zero_grad()

    # Forward pass and loss
    outputs = model(b_input_ids, b_input_mask, b_topics)
    loss = criterion(outputs, b_labels)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    # Book-keeping: loss and accuracy of this batch
    batch_loss = loss.item()
    train_loss_set.append(batch_loss)
    tr_loss += batch_loss

    tmp_train_accuracy = accuracy(outputs.detach().cpu().numpy(),
                                  b_labels.detach().cpu().numpy())
    train_acc_set.append(tmp_train_accuracy)
    train_accuracy += tmp_train_accuracy
    nb_tr_steps += 1

  # ---------------- Validation ----------------
  # Evaluation mode for the validation pass
  model.eval()

  # Running totals for this epoch
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps = 0

  # no_grad: do not build the autograd graph during validation, which saves
  # memory and time (the original loop evaluated with gradients enabled)
  with torch.no_grad():
    for batch in validation_dataloader:

      val_input_ids, val_input_mask, val_labels, val_topics = batch

      # Same dtype/device handling as in the training loop
      val_input_ids = val_input_ids.long().to(device)
      val_input_mask = val_input_mask.long().to(device)
      val_labels = val_labels.long().to(device)
      val_topics = val_topics.float().to(device)

      # Forward pass and loss
      val_output = model(val_input_ids, val_input_mask, val_topics)
      val_loss = criterion(val_output, val_labels)
      eval_loss += val_loss.item()

      # Accuracy is computed on CPU numpy copies
      tmp_eval_accuracy = accuracy(val_output.cpu().numpy(),
                                   val_labels.cpu().numpy())
      eval_accuracy += tmp_eval_accuracy
      nb_eval_steps += 1


  # Epoch summary: means over the batches of this epoch
  print("\nepoch train loss:\t"+ str(round((tr_loss/nb_tr_steps),6))    + "\tepoch train acc:\t" + str(round((train_accuracy/nb_tr_steps),6)))
  print("epoch validation loss:\t" + str(round((eval_loss/nb_eval_steps),6)) + "\tepoch validation acc:\t"  + str(round((eval_accuracy/nb_eval_steps),6)))
  torch.cuda.empty_cache()


  