### Data preprocessing

* load data

In [1]:
import glob
import os

import html
def clean_text(text):
    text = text.rstrip()

    if '""' in text:
        if text[0] == text[-1] == '"':
            text = text[1:-1]
        text = text.replace('\\""', '"')
        text = text.replace('""', '"')

    text = text.replace('\\""', '"')

    text = html.unescape(text)
    text = ' '.join(text.split())
    return text
  
def label2categories(label):
  codetable={'positive':0, 'neutral':1, 'negative':2}
  return codetable[label]

def read_files(files):
  data={}
  for file in files:
    with open(file, 'r') as f:
      for line in f:
        columns=line.rstrip().split('\t')
        tweet_id, sentiment=columns[0], columns[1]
        text=clean_text(" ".join(columns[2:]))
        data[tweet_id]=(label2categories(sentiment), text)
  return data

In [2]:
#load traning dataset
folder="/home/s2465922/tm/semeval-2017-tweets_Subtask-A/downloaded"
files=glob.glob(os.path.join(folder, "*.tsv"))

data=read_files(files)
      
#load test dataset
gold_file="/home/s2465922/tm/semeval-2017-tweets_Subtask-A/SemEval2017-task4-test.subtask-A.english.txt"
gold=read_files([gold_file])

In [36]:
len(gold)

12284

* split data

In [3]:
def split_data(dataset, ratio):
  train_ratio = ratio
  train_split_index = int(len(dataset) * train_ratio)

  train = dataset[:train_split_index]
  test = dataset[train_split_index:]

  return train, test

train, val=split_data(list(data.values()), 0.9)
X_train, y_train=list(zip(*train))[1], list(zip(*train))[0]
X_val, y_val=list(zip(*val))[1], list(zip(*val))[0]


X_test, y_test=list(zip(*gold.values()))[1], list(zip(*gold.values()))[0]

* text processing

In [4]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

text_processor=TextPreProcessor(
  normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
           'time', 'url', 'date', 'number'],
  include_tags={"hashtag", "allcaps", "elongated", "repeated",
                'emphasis', 'censored'},
  fix_html=True,
  segmenter="twitter",
  corrector="twitter",
  unpack_hashtags=True,
  unpack_contractions=True,
  spell_correct_elong=False,
  tokenizer=SocialTokenizer(lowercase=True).tokenize,
  dicts=[emoticons])

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


### Baseline: Naive Nayes

* convert text to bag of words

In [5]:
import numpy as np

X_train_full=X_train+X_val
y_train_full=y_train+y_val

# preprcocess text
X_train_preprocessed=np.array([' '.join(text_processor.pre_process_doc(sent)) for sent in X_train_full])
X_test_preprocessed=np.array([' '.join(text_processor.pre_process_doc(sent)) for sent in X_test])

#calculate tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf=TfidfVectorizer(ngram_range=(1, 3),
                      binary=True,
                      smooth_idf=False)
X_train_tfidf=tf_idf.fit_transform(X_train_preprocessed)
X_test_tfidf=tf_idf.transform(X_test_preprocessed)

* use cross-validation to find the best alpha

In [6]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

def get_auc_CV(model):
  kf=StratifiedKFold(10, shuffle=True, random_state=1)
  
  #get AUC scores
  auc=cross_val_score(model, X_train_tfidf, y_train_full, 
                      scoring="accuracy", cv=kf, n_jobs=-1)
  
  return auc.mean()

from sklearn.naive_bayes import MultinomialNB
import pandas as pd
res=pd.Series([get_auc_CV(MultinomialNB(alpha=i))
              for i in np.arange(1, 10, 0.1)],
             index=np.arange(1, 10, 0.1))

best_alpha = np.round(res.idxmax(), 2)
print('Best alpha: ', best_alpha)

Best alpha:  1.0


* evaluate on the test set

In [17]:
from sklearn.metrics import f1_score, precision_score
from sklearn.metrics import recall_score
def evaluate_measures(y_pred, y_true):
  y_pred=np.array(y_pred)
  y_true=np.array(y_true)
  accu=np.mean(y_pred==y_true)
  
  recs=recall_score(y_true, y_pred, average=None)
  mean_rec=recall_score(y_true, y_pred, average='macro')
  
  precs=precision_score(y_true, y_pred, average=None)
  mean_prec=precision_score(y_true, y_pred, average='macro')
  
  f1_pn=f1_score(y_true, y_pred, average='macro', labels=[0,2])
  scores={'accuracy': accu, 
          'recall': {'pos': recs[0], 'neu': recs[1], 'neg': recs[2], 'mean': mean_rec},
          'precision': {'pos': precs[0], 'neu': precs[1], 'neg': precs[2], 'mean': mean_prec},
          'f1_pn': f1_pn}
  return scores

In [18]:
nb_model=MultinomialNB(alpha=best_alpha)
nb_model.fit(X_train_tfidf, y_train_full)
nb_preds=np.argmax(nb_model.predict_proba(X_test_tfidf), axis=1)
nb_scores=evaluate_measures(nb_preds, y_test)

In [35]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, nb_preds)

array([[1228, 1147,    0],
       [ 664, 5273,    0],
       [ 285, 3686,    1]])

In [43]:
2*0.564*0.517/(0.564+0.517)/2

0.2697391304347826

In [19]:
nb_scores

{'accuracy': 0.5293064148485835,
 'recall': {'pos': 0.5170526315789473,
  'neu': 0.888159002863399,
  'neg': 0.00025176233635448137,
  'mean': 0.46848779892623355},
 'precision': {'pos': 0.5640790078089113,
  'neu': 0.5217692459924798,
  'neg': 1.0,
  'mean': 0.6952827512671304},
 'f1_pn': 0.2700232279662767}

### Bert model

In [23]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda:1")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
    

There are 2 GPU(s) available.
Device name: GeForce RTX 2080 Ti


* convert text to a list of token ids

In [24]:
from transformers import BertTokenizer
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
MAX_LEN=140

def tokenize(data):
  input_ids=[]
  attention_masks=[]
  
  for sent in data:
    encoded_sent=tokenizer.encode_plus(
      text=' '.join(text_processor.pre_process_doc(sent)),
      add_special_tokens=True,
      max_length=MAX_LEN,
      padding='max_length',
      truncation=True,
      return_attention_mask=True
    )
    input_ids.append(encoded_sent.get('input_ids'))
    attention_masks.append(encoded_sent.get('attention_mask'))
    
  input_ids=torch.tensor(input_ids)
  attention_masks=torch.tensor(attention_masks)
  
  return input_ids, attention_masks


In [25]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#get the input to the bert model
train_inputs, train_masks = tokenize(X_train)
val_inputs, val_masks=tokenize(X_val)

#convert other data types to torch.Tensor
train_labels=torch.tensor(y_train)
val_labels=torch.tensor(y_val)

batch_size=32

#create the dataloader for the training set
train_data=TensorDataset(train_inputs, train_masks, train_labels)
train_sampler=RandomSampler(train_data)
train_dataloader=DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

#create the dataloader for the validation set
val_data=TensorDataset(val_inputs, val_masks, val_labels)
val_sampler=RandomSampler(val_data)
val_dataloader=DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

* fine-tune bert model

In [26]:
import torch.nn as nn
from transformers import BertModel

class BertClassifier(nn.Module):
  def __init__(self, freeze_bert=False):
    super(BertClassifier, self).__init__()
    #specify hidden size of BERT, hidden size or classifier, and number of labels
    D_in, H, D_out=768, 50, 3
    
    self.bert=BertModel.from_pretrained('bert-base-uncased')
    
    self.classifier=nn.Sequential(
      nn.Linear(D_in, H),
      nn.ReLU(),
      nn.Dropout(0.5),
      nn.Linear(H, D_out)
    )
    
    if freeze_bert:
      for param in self.bert.parameters():
        param.requires_grad=False
        
  def forward(self, input_ids, attention_mask):
    outputs=self.bert(input_ids=input_ids,
                     attention_mask=attention_mask)
    last_hidden_state_cls=outputs[0][:,0,:]
    logits=self.classifier(last_hidden_state_cls)

    return logits
    

In [27]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
  bert_classifier=BertClassifier(freeze_bert=False)
  
  bert_classifier.to(device)
  
  optimizer=AdamW(bert_classifier.parameters(),
                 lr=1e-6,
                 eps=1e-8)
  
  total_steps=len(train_dataloader)*epochs
  
  scheduler=get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps=0,
                                           num_training_steps=total_steps)
  return bert_classifier, optimizer, scheduler

import random
import numpy as np
import time

loss_fn=nn.CrossEntropyLoss()

def set_seed(seed_value=34):
  random.seed(seed_value)
  np.random.seed(seed_value)
  torch.manual_seed(seed_value)
  torch.cuda.manual_seed_all(seed_value)
  
def train(model, train_dataloader, val_dataloade=None, epochs=4, evaluation=False):
  print("Starting training...\n")
  
  for epoch_i in range(epochs):
    print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
    print("-"*70)
    
    #Measure the elapsed time of each epoch
    t0_epoch, t0_batch=time.time(), time.time()
    
    total_loss, batch_loss, batch_counts=0, 0, 0
    
    model.train()
    
    for step, batch in enumerate(train_dataloader):
      batch_counts+=1
      b_input_ids, b_attn_mask, b_labels=tuple(t.to(device) for t in batch)
      
      model.zero_grad()
      
      logits=model(b_input_ids, b_attn_mask)
      
      loss=loss_fn(logits, b_labels)
      batch_loss+=loss.item()
      total_loss+=loss.item()
      
      loss.backward()
      
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
      
      optimizer.step()
      scheduler.step()
      
      if (step%100==0 and step!=0) or (step==len(train_dataloader)-1):
        time_elapsed=time.time()-t0_batch
        
        print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")
        
        batch_loss, batch_counts=0, 0
        t0_batch=time.time()
        
    avg_train_loss=total_loss/len(train_dataloader)
    print("-"*70)
    
    if evaluation==True:
      val_loss, val_accuracy=evaluate(model, val_dataloader)
                  # Print performance over the entire training data
      time_elapsed = time.time() - t0_epoch
            
      print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
      print("-"*70)
      print("\n")
    
  print("Training complete!")
    
def evaluate(model, val_dataloader):
  model.eval()
  
  val_accuracy=[]
  val_loss=[]
  
  for batch in val_dataloader:
    #load batch to gpu
    b_input_ids, b_attn_mask, b_labels=tuple(t.to(device) for t in batch)
    
    #compute logits
    with torch.no_grad():
      logits=model(b_input_ids, b_attn_mask)
    
    #compute loss
    loss=loss_fn(logits, b_labels)
    val_loss.append(loss.item())
    
    #get the predictions
    preds=torch.argmax(logits, dim=1).flatten()
    
    #calculate the accuracy rate
    accuracy=(preds==b_labels).cpu().numpy().mean()*100
    val_accuracy.append(accuracy)
    
  #compute the average accuracy and loss over the validation set
  val_loss=np.mean(val_loss)
  val_accuracy=np.mean(val_accuracy)
  
  return val_loss, val_accuracy

In [28]:
set_seed(34)
bert_classifier, optimizer, scheduler=initialize_model(epochs=5)
train(bert_classifier, train_dataloader, val_dataloader, epochs=5, evaluation=True)

Starting training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   100   |   1.079502   |     -      |     -     |   39.93  
   1    |   200   |   1.030324   |     -      |     -     |   39.81  
   1    |   300   |   1.028618   |     -      |     -     |   40.05  
   1    |   400   |   1.004597   |     -      |     -     |   40.14  
   1    |   500   |   0.997979   |     -      |     -     |   40.19  
   1    |   600   |   0.974735   |     -      |     -     |   40.24  
   1    |   700   |   0.954767   |     -      |     -     |   40.10  
   1    |   800   |   0.950345   |     -      |     -     |   40.29  
   1    |   900   |   0.911628   |     -      |     -     |   39.83  
   1    |  1000   |   0.893920   |     -      |     -     |   42.85  
   1    |  1100   |   0.845568   |     -      |     -     |   44.10  
   1    |  1200   |   0.843783   |     -      |     -     |   45.01

* evaluate model on test set

In [29]:
def bert_predict(model, test_dataloader):
  model.eval()
  
  preds=[]
  
  for batch in test_dataloader:
    #load batch to gpu
    b_input_ids, b_attn_mask=tuple(t.to(device) for t in batch)[:2]
    
    #compute logits
    with torch.no_grad():
      logits=model(b_input_ids, b_attn_mask)
    
    #get the predictions
    preds.append(torch.argmax(logits, dim=1).flatten())

  preds=torch.cat(preds, dim=0).cpu().numpy()

  return preds


In [30]:
# Run `preprocessing_for_bert` on the test set
print('Tokenizing data...')
test_inputs, test_masks = tokenize([' '.join(text_processor.pre_process_doc(sent)) for sent in X_test])

# Create the DataLoader for our test set
test_dataset = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=128)

Tokenizing data...


In [32]:
bert_preds=bert_predict(bert_classifier, test_dataloader)
bert_scores=evaluate_measures(bert_preds, y_test)

In [34]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, bert_preds)

array([[1695,  623,   57],
       [ 807, 4294,  836],
       [ 163, 1292, 2517]])

In [33]:
bert_scores

{'accuracy': 0.6924454575056985,
 'recall': {'pos': 0.7136842105263158,
  'neu': 0.7232609061815732,
  'neg': 0.6336858006042296,
  'mean': 0.6902103057707062},
 'precision': {'pos': 0.6360225140712945,
  'neu': 0.6915767434369464,
  'neg': 0.7381231671554253,
  'mean': 0.6885741415545553},
 'f1_pn': 0.6772740320728672}