In [None]:
!pip3 install transformers==3.4.0
!pip3 install emoji
!pip3 install keras
!pip3 install tensorflow

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaModel
from transformers import BertForSequenceClassification, BertTokenizer, BertModel
from transformers import AutoModel, AutoTokenizer
from sklearn import metrics
import torch.nn as nn 
import torch.nn.functional as F
from transformers import AdamW
from sklearn.utils import class_weight

In [None]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
n_gpu = torch.cuda.device_count()
#torch.cuda.get_device_name(0)

In [None]:
# linguistic features
df = pd.read_csv('./LIWC2015_Results_1.csv', header=None)
features = np.array(df.loc[:, 1:])
print(features.shape) # (6696, 93)

(6696, 93)


In [None]:
# Set the maximum sequence length
MAX_LEN = 50

In [None]:
df = pd.read_csv('../bragging_data/bragging_data.csv', header=0, names=['id', 'text', 'sampling', 'round_no', 'label'])

text_training = []
text_testing = []
label_training = []
label_testing = []
labels_number = []
index_training = []
index_testing = []
features_training = []
features_testing = []

text_testing_print = []

texts = df.text.values
sampling_ways = df.sampling.values
round_nos = df.round_no.values
labels = df.label.values

for i in labels:
    if i == "not":
        labels_number.append(0)
    else:
        labels_number.append(1)

for i in range(len(sampling_ways)):
    if sampling_ways[i] == 'keyword':
        index_training.append(i)
        #text_training.append(texts[i])
        label_training.append(labels_number[i])
        features_training.append(features[i])
    else:
        index_testing.append(i)
        #text_testing.append(texts[i])
        text_testing_print.append(texts[i])
        label_testing.append(labels_number[i])
        features_testing.append(features[i]) 

In [None]:
texts = ['[CLS] ' + str(sentence) + ' [SEP]' for sentence in texts]

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

tokenized_texts = [tokenizer.tokenize(sent) for sent in texts]

# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# Pad input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype='long', truncating='post', padding='post')
print(input_ids)
# Create attention masks
attention_masks = []

for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

[[10975  7454   104 ...     0     0     0]
 [10975  7454   104 ...     0     0     0]
 [10975  7454   104 ...     0     0     0]
 ...
 [10975  7454   104 ...     0     0     0]
 [10975  7454   104 ... 38022     8 29804]
 [10975  7454   104 ...     0     0     0]]


In [None]:
mask_training = []
mask_testing = []
for i in index_training:
    text_training.append(input_ids[i])
    mask_training.append(attention_masks[i])
for i in index_testing:
    text_testing.append(input_ids[i])
    mask_testing.append(attention_masks[i])

In [23]:
hidden_size = 768
embedding_size = 400
beta = 0.001
dropout_prob = 0.5

class AttnGating(nn.Module):
  def __init__(self):
    super(AttnGating, self).__init__()
   
    self.linear = nn.Linear(93, embedding_size)
    self.relu = nn.ReLU(inplace=True)

    self.weight_emotion_W1 = nn.Parameter(torch.Tensor(hidden_size+embedding_size, hidden_size))
    self.weight_emotion_W2 = nn.Parameter(torch.Tensor(embedding_size, hidden_size))
 
    
    nn.init.uniform_(self.weight_emotion_W1, -0.1, 0.1)
    nn.init.uniform_(self.weight_emotion_W2, -0.1, 0.1)

    self.LayerNorm = nn.LayerNorm(hidden_size)
    self.dropout = nn.Dropout(dropout_prob)

  def forward(self, embeddings_roberta, linguistic_feature):
     
     # Project linguistic representations into vectors with comparable size
     linguistic_feature = self.linear(linguistic_feature)
     emotion_feature = linguistic_feature.repeat(MAX_LEN, 1, 1) # (50, bs, 200) 
     emotion_feature = emotion_feature.permute(1, 0, 2) # (bs, 50, 200)

     # Concatnate word and linguistic representations  
     features_combine = torch.cat((emotion_feature, embeddings_roberta), axis=2) # (bs, 50, 968)
     
     g_feature = self.relu(torch.matmul(features_combine, self.weight_emotion_W1))

     # Attention gating
     H = torch.mul(g_feature, torch.matmul(emotion_feature, self.weight_emotion_W2))
     alfa = min(beta * (torch.norm(embeddings_roberta)/torch.norm(H)), 1)
     E = torch.add(torch.mul(alfa, H), embeddings_roberta)

     # Layer normalization and dropout 
     embedding_output = self.dropout(self.LayerNorm(E)) 

     return E

In [24]:
class RobertaClassificationModel(nn.Module):
    def __init__(self):
        super(RobertaClassificationModel, self).__init__()

        self.embedding_roberta = AutoModel.from_pretrained("vinai/bertweet-base", output_hidden_states=True)
        #self.attn_gate = AttnGating()

        self.roberta = AutoModel.from_pretrained("vinai/bertweet-base", return_dict=True)

        self.dropout = nn.Dropout(0.2)
        #self.relu = nn.ReLU(inplace=True)

        self.num_labels_bragging = 2
      
        self.classifier_bragging = nn.Linear(768, 2)
        

    def forward(self, input_ids, input_feature, attention_mask, labels=None):

        embedding_roberta = self.embedding_roberta(input_ids)
        #last_hidden_state, pooler_output, all_hidden_states = self.embedding_roberta(input_ids)
        last_hidden_state = embedding_roberta['last_hidden_state']
        pooler_output = embedding_roberta['pooler_output']
        all_hidden_states = embedding_roberta['hidden_states']
        roberta_embed = all_hidden_states[0]
        #combine_embed = self.attn_gate(roberta_embed, input_feature)

        outputs = self.roberta(input_ids=None, inputs_embeds=roberta_embed)
        sequence_output  = outputs.last_hidden_state

        x = sequence_output[:, 0, :]
        x = self.dropout(x)
        x = torch.tanh(x)
        x = self.dropout(x)

        logits_bragging = self.classifier_bragging(x)


        # Initialize loss of bragging 
        loss_bragging = None 

        # Training on bragging
        if labels is not None:
          if self.num_labels_bragging == 1:
            loss_fct_bragging = nn.MSELoss()
            loss_bragging = loss_fct_bragging(logits_bragging.view(-1), labels.view(-1))
          else:
            loss_fct_bragging = nn.CrossEntropyLoss()
            loss_bragging = loss_fct_bragging(logits_bragging.view(-1, self.num_labels_bragging), labels.view(-1))


        output = (logits_bragging,) +outputs[2:]

      
        return ((loss_bragging,) + output) if loss_bragging is not None else output


In [25]:
batch_size = 32
n_epoch = 12

x_train = text_training
y_train = label_training


print(len(x_train))
x_dev, x_testing, y_dev, y_testing, features_dev, features_testing, mask_dev, mask_testing = train_test_split(text_testing, label_testing, features_testing, mask_testing, test_size=0.8, random_state=0, stratify=label_testing)


# sklearn
weight_loss = class_weight.compute_class_weight( class_weight ='balanced', classes =  np.unique(y_train), y = y_train)
weight_loss = torch.tensor(weight_loss, dtype=torch.float32).cpu()

#model_roberta = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).cuda()
#model_roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2).cuda()
model_roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2).cpu()

#model_roberta = RobertaClassificationModel().cpu()
#model_roberta.cuda()

param_optimizer = list(model_roberta.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']        
    
optimizer_grouped_parameters = [
        {'params': [p for n, p in model_roberta.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model_roberta.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

optimizer = AdamW(optimizer_grouped_parameters, lr=3e-6)
criterion = nn.CrossEntropyLoss(weight=weight_loss, size_average=None, ignore_index=-100, reduce=None, reduction='mean')

x_train = torch.LongTensor(x_train)
x_dev = torch.LongTensor(x_dev)

y_train = torch.LongTensor(y_train)
y_dev = torch.LongTensor(y_dev)

features_train = torch.FloatTensor(features_training)
features_dev = torch.FloatTensor(features_dev)

mask_train = torch.LongTensor(mask_training)
mask_dev = torch.LongTensor(mask_dev)

# Pack to dataLoader
train_data = TensorDataset(x_train, features_train, mask_train, y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    
dev_data = TensorDataset(x_dev, features_dev, mask_dev, y_dev)
dev_sampler = RandomSampler(dev_data)
dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=batch_size) 

# Initialize previous dev loss
previous_valid_loss = 1000

for epoch in range(n_epoch):
    print(epoch)

    # Training
    model_roberta.train()

    train_losses = []
    valid_losses = []

    for step, batch in enumerate(train_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from dataloader
        b_input_ids, b_input_features, b_input_mask, b_labels = batch

        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        
        # Generate combined representations      

        
        outputs = model_roberta(input_ids=b_input_ids, input_feature=b_input_features, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        logits = outputs[1]
        #loss = criterion(logits, b_labels)


        # Backward pass
        loss.backward()

        # track train loss
        train_losses.append(loss.item())

        # Update parameters and take a step using the computed gradient
        optimizer.step()
        
    train_loss = np.average(train_losses)
    print('train loss: {}'.format(train_loss))

    # Validation
    model_roberta.eval()

    predictions = []
    targets = []


    # Evaluate data for one epoch
    for batch in dev_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from dataloader
        b_input_ids, b_input_features, b_input_mask, b_labels = batch
        
      
        with torch.no_grad():

            # Generate combined representations
            
            outputs = model_roberta(input_ids=b_input_ids, input_feature=b_input_features, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]
            logits = outputs[1]
            #loss = criterion(logits, b_labels)
          

        valid_losses.append(loss.item())

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        
        labels = b_labels.to('cpu').numpy() 

        predictions = np.append(predictions, np.argmax(logits, axis=1))
        targets = np.append(targets, labels) 

    # Calculate total dev loss 
    valid_loss = np.average(valid_losses)
    print('valid loss: {}'.format(valid_loss))

    # Calculate dev f1 of sverity
    dev_f1 = metrics.f1_score(targets, predictions, average='macro', zero_division=1)
    print("dev_f1:", dev_f1)


    # Save the best model based on dev lossr
    torch.save(model_roberta, './bragging.pkl')
    print("saved")             
  
x_testing = torch.LongTensor(x_testing)
y_testing = torch.LongTensor(y_testing)
features_testing = torch.FloatTensor(features_testing)
mask_testing = torch.LongTensor(mask_testing)
    
test_data = TensorDataset(x_testing, features_testing, mask_testing, y_testing) 
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Testing
bragging_model = torch.load('./bragging.pkl')

test_predictions = []
test_targets = []
bragging_model.eval()

for batch in test_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from dataloader
    b_input_ids, b_input_features, b_input_mask, b_labels = batch
    
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = bragging_model(input_ids=b_input_ids, input_feature=b_input_features, attention_mask=b_input_mask)
        logits = outputs[0]
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    labels = b_labels.to('cpu').numpy()

    test_predictions = np.append(test_predictions, np.argmax(logits, axis=1))
    test_targets = np.append(test_targets, labels)

test_acc = metrics.accuracy_score(test_targets, test_predictions)
test_precision = metrics.precision_score(test_targets, test_predictions, average="macro", zero_division=1)
test_recall = metrics.recall_score(test_targets, test_predictions, average="macro", zero_division=1)
test_f1 = metrics.f1_score(test_targets, test_predictions, average="macro", zero_division=1)
print("test_acc:", test_acc)
print("test_precision:", test_precision)
print("test_recall:", test_recall)
print("test_f1:", test_f1)

target_names = ['class 0', 'class 1']
print(metrics.classification_report(test_targets, test_predictions, target_names=target_names))

3382


Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

0


TypeError: RobertaForSequenceClassification.forward() got an unexpected keyword argument 'input_feature'