In [None]:
import torch
import tensorflow
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
pd.set_option('display.max_colwidth', None)

In [None]:
# Read the training spreadsheet, keep the three columns used below and drop
# every row that has a missing value in any of them.
data = (
    pd.read_excel("preprocess_data.xlsx")[['preprocess_tweets', 'object', 'Emoji']]
    .dropna()
)
print('We have', len(data), 'tweets in the dataset')

# Class labels as a NumPy array, aligned row-by-row with `data`.
labels = data['Emoji'].values

In [None]:
# Optional: work with only the 5 most frequent emoji
#n = 5
#top_5_emoji = data['Emoji'].value_counts()[:n].index.tolist()
#mask = data['Emoji'].isin(top_5_emoji)
#data = data[mask]
#data = data.reset_index(drop=True)
#print(top_5_emoji)

#dict={1:0,7:1,0:2,2:3,5:4}
#data = data.replace({"Emoji": dict})
#labels = data.Emoji.values
#print('We have',len(data), 'not nan tweet in the data')

# LDA topic modeling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-words vectorizer shared by the tweet text and the object text.
vectorizer = CountVectorizer(analyzer='word',
                             min_df=1,                         # minimum required occurrences of a word
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # tokens of at least 3 alphanumeric chars
                             max_features=1400,                # max number of unique words
                            )

# Fit ONE vocabulary on both text columns and then only transform. Calling
# fit_transform once per column re-fits the vectorizer, so the two matrices
# would use different column meanings and `vectorizer` would end up knowing
# only the vocabulary of the last column it was fitted on.
vectorizer.fit(pd.concat([data['preprocess_tweets'], data['object']], ignore_index=True))
data_vectorized1 = vectorizer.transform(data['preprocess_tweets'])
data_vectorized2 = vectorizer.transform(data['object'])
print(data_vectorized1.shape)
print(data_vectorized2.shape)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Training matrix for LDA: the dense tweet counts stacked on top of the dense
# object counts (rows are documents, columns are vocabulary entries).
lda_training_counts = np.concatenate(
    (data_vectorized1.toarray(), data_vectorized2.toarray())
)

# 90 topics, online learning, fixed seed, all available CPUs.
lda_model = LatentDirichletAllocation(
    n_components=90,
    learning_method='online',
    random_state=0,
    n_jobs=-1,
)
lda_output = lda_model.fit_transform(lda_training_counts)

In [None]:
def predict_topic(text):
    """Return the LDA topic scores for every document in ``text``.

    The documents are first converted to counts with the module-level
    ``vectorizer`` and the result is passed to ``lda_model.transform``.
    """
    return lda_model.transform(vectorizer.transform(text))


# Topic scores of the training set: q1 = tweet text, q2 = object text.
prob_scores_q1 = predict_topic(text=data['preprocess_tweets'])
prob_scores_q2 = predict_topic(text=data['object'])
print(prob_scores_q1.shape)
print(prob_scores_q2.shape)

# Data preparation for BERT

In [None]:
from pytorch_pretrained_bert import BertTokenizer, BertConfig

# Load the pre-trained tokenizer (vocabulary) published under the name
# 'bert-base-uncased'. prep_data() below uses it to tokenize text and to map
# the tokens to vocabulary ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def prep_data(ques1):
    """Tokenize every text and convert its tokens to vocabulary ids.

    Each text is wrapped as ``[CLS] text [SEP]``, the single-sequence input
    format BERT was pre-trained with, before it is handed to the module-level
    ``tokenizer``.

    Args:
        ques1: iterable of strings, one per example.

    Returns:
        A list with one entry per input text; each entry is whatever
        ``tokenizer.convert_tokens_to_ids`` returns for that text. The
        entries are not padded, so their lengths may differ.
    """
    all_input_ids = []

    for q1 in ques1:
        # Mark the sequence start and end with BERT's special tokens.
        tokens = tokenizer.tokenize('[CLS] ' + q1 + ' [SEP]')
        all_input_ids.append(tokenizer.convert_tokens_to_ids(tokens))

    return all_input_ids

# Token ids for every training tweet: one list per tweet, not yet padded.
all_input_ids = prep_data(data['preprocess_tweets'].values)

In [None]:
# Pad every id sequence to one common length.
# Longest tokenized tweet (0 when there are no tweets at all).
max_len = max((len(ids) for ids in all_input_ids), default=0)
print(max_len)

# Common sequence length; it is reused later to pad the test set. It is capped
# at 512 because the pre-trained bert-base model has position embeddings for
# at most 512 tokens; longer sequences are truncated at the end.
n = min(max_len, 512)
pad_input_ids = pad_sequences(all_input_ids, maxlen=n, dtype="long", truncating="post", padding="post")

In [None]:
# Attention masks: 1.0 where the padded sequence holds a token id greater
# than 0 and 0.0 where it holds the padding id 0.
attention_masks = [
    [float(token_id > 0) for token_id in padded_ids]
    for padded_ids in pad_input_ids
]

In [None]:
def accuracy(out, labels):
    """Count the rows of ``out`` whose arg-max equals the matching label.

    This returns the NUMBER of correct predictions, not a ratio; callers
    divide by the number of examples themselves.
    """
    return np.sum(np.argmax(out, axis=1) == labels)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Multimodal model

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from pytorch_pretrained_bert import  BertModel

    
class my_BERT(nn.Module):
    '''BERT encoder combined with topic features for classification.

    The pooled BERT output is concatenated with a topic-feature vector and
    passed through a single linear layer that produces one logit per class.

    Args:
        emb_size: width of the pooled BERT output fed to the classifier.
        topic_num: number of topics per text field. The classifier expects
            two such topic vectors concatenated, i.e. ``2 * topic_num``
            topic features per example.
        num_labels: number of output classes (default 10; e.g. 5 when only
            the 5 most frequent emoji are used).
    '''
    def __init__(self, emb_size, topic_num, num_labels=10):

        super().__init__()
        # Pre-trained BERT encoder.
        self.tbert = BertModel.from_pretrained('bert-base-uncased')
        # Linear head over [pooled BERT output | topic features].
        self.classifier = torch.nn.Linear(emb_size + 2 * topic_num, num_labels)

    def forward(self, b_input_ids, attention_mask, topics, token_type_ids=None):
        '''Return the class logits for a batch.

        Args:
            b_input_ids: token ids of the batch.
            attention_mask: mask with 1 for real tokens and 0 for padding.
            topics: topic features, concatenated to the pooled BERT output
                along the last dimension.
            token_type_ids: optional segment ids forwarded to BERT.
        '''
        # The mask must be passed by keyword: BertModel.forward's second
        # positional parameter is token_type_ids, so a positional mask would
        # be read as segment ids and padding would never be masked out.
        _, pooled = self.tbert(b_input_ids,
                               token_type_ids=token_type_ids,
                               attention_mask=attention_mask,
                               output_all_encoded_layers=False)
        return self.classifier(torch.cat((pooled, topics), -1))


In [None]:
# emb_size=768 is the width of the BERT output given to the classifier;
# topic_num=90 matches n_components=90 of the LDA model fitted above.
model = my_BERT(emb_size=768,topic_num=90)
# Move the model to the selected device (GPU when available, else CPU).
model.to(device)

In [None]:
# Topic features per example: tweet topics followed by object topics.
topic_features = np.concatenate((prob_scores_q1, prob_scores_q2), axis=-1)

# Split ids, masks, topics and labels in ONE call so that all of them are
# guaranteed to receive the same train/validation partition. This also keeps
# the token ids as integers instead of concatenating them with the float topic
# scores and slicing them apart again.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_topics, validation_topics,
 train_labels, validation_labels) = train_test_split(
    pad_input_ids, attention_masks, topic_features, labels,
    random_state=2021, test_size=0.2)

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_topics = torch.tensor(train_topics)
validation_topics = torch.tensor(validation_topics)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Wrap the tensors in DataLoaders that yield mini-batches of
# (input ids, attention mask, label, topic features).
batch_size = 10
train_data = TensorDataset(train_inputs, train_masks, train_labels, train_topics)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Validation data is only evaluated, so it does not need to be shuffled.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_topics)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, shuffle=False)


# define loss function and optimizer

In [None]:
import torch.nn as nn
import torch.optim as optim
from pytorch_pretrained_bert.optimization import BertAdam
from torch.optim.lr_scheduler import ExponentialLR,StepLR


param_optimizer = list(model.named_parameters())

# Parameters whose name contains one of these fragments get no weight decay.
# The test must be a substring match: full parameter names such as
# 'classifier.bias' are never *equal* to 'bias', so an exact `name in no_decay`
# check would leave the no-decay group empty.
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
        {'params': [p for name, p in param_optimizer if not any(nd in name for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for name, p in param_optimizer if any(nd in name for nd in no_decay)],
         'weight_decay_rate': 0.0}
        ]

# NOTE(review): newer pytorch_pretrained_bert releases appear to read the
# group key 'weight_decay' rather than 'weight_decay_rate' — confirm against
# the installed version.
# NOTE(review): lr=0.005 is far above the 2e-5..5e-5 range usually used to
# fine-tune BERT, and no t_total is given, which BertAdam's warmup schedule
# seems to require — confirm both are intended.
optimizer = BertAdam(optimizer_grouped_parameters, lr=0.005, warmup=0.1)
criterion = nn.CrossEntropyLoss()
# Multiply the learning rate by 0.1 every 3 scheduler steps (one per epoch).
scheduler = StepLR(optimizer, step_size = 3, gamma=0.1)


# Train

In [None]:
from tqdm import trange 
from sklearn.metrics import f1_score

# Per-epoch history: mean loss per example and accuracy for both splits.
train_loss_set = []
train_acc_set = []
val_loss_set = []
val_acc_set = []
epochs = 20

# trange is a tqdm wrapper around the normal python range
for epoch in trange(epochs, desc="Epoch"):
    print('-'*8+"epoch:"+str(epoch)+'-'*8)

    # ---------------- Training ----------------
    model.train()

    train_loss = 0.0       # sum of per-example losses over the epoch
    train_accuracy = 0     # number of correct predictions over the epoch
    nb_tr_examples = 0

    for step, batch in enumerate(train_dataloader):
        # Unpack the batch and move it to the device with the expected dtypes.
        b_input_ids, b_input_mask, b_labels, b_topics = batch
        b_input_ids = b_input_ids.long().to(device)
        b_input_mask = b_input_mask.long().to(device)
        b_labels = b_labels.long().to(device)
        b_topics = b_topics.float().to(device)

        optimizer.zero_grad()
        outputs = model(b_input_ids, b_input_mask, b_topics)
        loss = criterion(outputs, b_labels)

        # backward() takes no argument for a scalar loss; passing the loss
        # itself would multiply every gradient by the loss value.
        loss.backward()
        optimizer.step()

        # criterion returns the batch MEAN, so weight it by the batch size
        # before accumulating; dividing by the example count afterwards then
        # gives the true mean loss per example.
        n_batch = b_input_ids.size(0)
        train_loss += loss.item() * n_batch
        train_accuracy += accuracy(outputs.detach().cpu().numpy(),
                                   b_labels.cpu().numpy())
        nb_tr_examples += n_batch

    scheduler.step()
    train_loss_set.append(train_loss / nb_tr_examples)
    train_acc_set.append(train_accuracy / nb_tr_examples)

    # ---------------- Validation ----------------
    model.eval()

    eval_loss, eval_accuracy = 0.0, 0
    nb_eval_examples = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in validation_dataloader:
            val_input_ids, val_input_mask, val_labels, val_topics = batch
            val_input_ids = val_input_ids.long().to(device)
            val_input_mask = val_input_mask.long().to(device)
            # CrossEntropyLoss needs integer class indices.
            val_labels = val_labels.long().to(device)
            val_topics = val_topics.float().to(device)

            val_output = model(val_input_ids, val_input_mask, val_topics)
            tmp_eval_loss = criterion(val_output, val_labels)

            n_batch = val_input_ids.size(0)
            eval_loss += tmp_eval_loss.item() * n_batch
            eval_accuracy += accuracy(val_output.cpu().numpy(),
                                      val_labels.cpu().numpy())
            nb_eval_examples += n_batch

    val_loss_set.append(eval_loss / nb_eval_examples)
    val_acc_set.append(eval_accuracy / nb_eval_examples)

    # Checkpoint whenever this epoch matches or beats the best validation
    # accuracy seen so far. The whole model object is pickled.
    if val_acc_set[-1] >= max(val_acc_set):
        print('saving model ...')
        torch.save(model, "my_checkpoint2.pth.tar")

    print("epoch train loss:\t"+ str(round((train_loss_set[-1]),6))    + "\tepoch train acc:\t" + str(round((train_acc_set[-1]),6)))
    print("epoch validation loss:\t" + str(round((val_loss_set[-1]),6)) + "\tepoch validation acc:\t"  + str(round((val_acc_set[-1]),6)))

In [None]:
import matplotlib.pyplot as plt

def plot_hist():
    """Plot the per-epoch training and validation accuracy in one figure."""
    for history in (train_acc_set, val_acc_set):
        plt.plot(history)
    plt.xlabel("epoch")
    plt.ylabel("accuracy")
    plt.title("model accuracy")
    plt.legend(["train", "validation"], loc="upper left")
    plt.show()


plot_hist()

In [None]:
import matplotlib.pyplot as plt


def plot_hist():
    """Plot the per-epoch training and validation loss in one figure."""
    for history in (train_loss_set, val_loss_set):
        plt.plot(history)
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.title("model loss")
    plt.legend(["train", "validation"], loc="upper left")
    plt.show()


plot_hist()

# Test

In [None]:
# Read the test spreadsheet, keep the three columns used below and drop every
# row that has a missing value in any of them.
test_data = (
    pd.read_excel("preprocess_test_data.xlsx")[['preprocess_tweets', 'object', 'Emoji']]
    .dropna()
)
print('We have', len(test_data), 'tweets in the test dataset')

# From here on `labels` holds the TEST labels.
labels = test_data['Emoji'].values

In [None]:
# Topic scores of the test set: q1 = tweet text, q2 = object text.
prob_scores_q1 = predict_topic(text=test_data['preprocess_tweets'])
prob_scores_q2 = predict_topic(text=test_data['object'])
print(prob_scores_q1.shape)
print(prob_scores_q2.shape)

In [None]:
# Token ids for every test tweet, built with the same helper as the training
# data; this overwrites the training ids held in all_input_ids.
all_input_ids = prep_data(test_data['preprocess_tweets'].values)

In [None]:
# Pad/truncate the test ids to the same length n that was computed from the
# training data.
pad_input_ids = pad_sequences(all_input_ids, maxlen=n, dtype="long", truncating="post", padding="post")

In [None]:
# Attention masks for the test set: 1.0 where the padded sequence holds a
# token id greater than 0 and 0.0 where it holds the padding id 0.
attention_masks = [
    [float(token_id > 0) for token_id in padded_ids]
    for padded_ids in pad_input_ids
]

In [None]:
# Tensors for the test set, in the order the dataset yields them.
test_inputs = torch.tensor(pad_input_ids)
test_masks = torch.tensor(attention_masks)
test_labels = torch.tensor(labels)
# Topic features: tweet topics followed by object topics.
test_topics = torch.from_numpy(
    np.concatenate((prob_scores_q1, prob_scores_q2), axis=-1)
)

# Mini-batches of (input ids, attention mask, label, topic features), served
# in their original order.
test_dataset = TensorDataset(test_inputs, test_masks, test_labels, test_topics)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Restore the best checkpoint onto the device the test batches are sent to
# (loading onto the CPU while the inputs live on the GPU fails with a device
# mismatch), and switch to evaluation mode.
# NOTE: torch.load unpickles a full model object, which can execute arbitrary
# code; only load checkpoint files you created yourself.
# NOTE(review): recent PyTorch releases may need weights_only=False to load a
# fully pickled model — confirm for the installed version.
model = torch.load("my_checkpoint2.pth.tar", map_location=device)
model = model.to(device).eval()


In [None]:
# Evaluate the restored model on the test set.
test_loss_set = []
test_acc_set = []

test_loss, test_acc = 0.0, 0   # summed per-example loss / number of correct predictions
nb_test_examples = 0

# Make sure the model sits on the same device as the batches and is in
# evaluation mode.
model.to(device)
model.eval()

# No gradients are needed for evaluation.
with torch.no_grad():
    for batch in test_dataloader:
        # Batch-local names, so the full test_labels / test_topics tensors
        # are not overwritten by the loop.
        b_input_ids, b_input_mask, b_labels, b_topics = batch
        b_input_ids = b_input_ids.long().to(device)
        b_input_mask = b_input_mask.long().to(device)
        # CrossEntropyLoss needs integer class indices.
        b_labels = b_labels.long().to(device)
        b_topics = b_topics.float().to(device)

        test_output = model(b_input_ids, b_input_mask, b_topics)
        tmp_test_loss = criterion(test_output, b_labels)

        # criterion returns the batch mean; weight it by the batch size so
        # that the final division yields the mean loss per example.
        n_batch = b_input_ids.size(0)
        test_loss += tmp_test_loss.item() * n_batch
        # Score the TEST predictions against the TEST labels.
        test_acc += accuracy(test_output.cpu().numpy(), b_labels.cpu().numpy())
        nb_test_examples += n_batch

test_loss_set.append(test_loss / nb_test_examples)
test_acc_set.append(test_acc / nb_test_examples)

print("test loss:\t" + str(round(test_loss_set[-1], 6)) + "\t test acc:\t" + str(round(test_acc_set[-1], 6)))
