In [None]:
# set up the GPU
import tensorflow as tf
import torch

device_name = tf.test.gpu_device_name()
if torch.cuda.is_available():     
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
!pip install transformers    

In [None]:
import pandas as pd

from google.colab import files
# Call Colab's files.upload() and keep its return value in `uploaded`.
# NOTE(review): presumably this is where train.jsonl is provided, since the
# next cell opens that file by name — confirm.
uploaded = files.upload()

In [None]:
import io
import json

# Parse train.jsonl: one JSON object per line, collected into `tweets`.
tweets = []
# Explicit UTF-8 so decoding does not depend on the platform default encoding.
with open('train.jsonl', encoding='utf-8') as train_file:
    for line in train_file:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        if line.strip():
            tweets.append(json.loads(line))
print(tweets[0])
print(len(tweets))
# One row per record; the JSON keys become the column names.
df = pd.DataFrame(tweets)

In [None]:
import numpy as np
# The model input is the `response` field of each record.
df['text'] = df['response']

# Encode every distinct label as its position in order of first appearance.
# The lookup table is built once instead of recomputing `unique()` for each row.
# NOTE(review): the submission cell treats index 0 as SARCASM, which only holds
# if the first training row carries that label — confirm against the data.
label_classes = df['label'].unique()
label_to_index = {name: index for index, name in enumerate(label_classes)}
df['label'] = df['label'].apply(lambda name: label_to_index[name])
print(df['text'][0])

In [None]:
import re 
# Patterns removed by function_clean, applied in this order. Raw strings keep
# `\s` / `\S` as regex escapes rather than invalid Python string escapes.
_CLEAN_PATTERNS = (
    r"http\S+",             # URLs
    r"@[^\s]*",             # @-mentions (this already covers the "@USER" placeholder)
    r"#[^\s]*",             # hashtags
    # Digit runs. NOTE(review): `+-:` inside the class is a character RANGE
    # ('+' through ':'), so ',', '.', '/' and digits match as well — kept as-is
    # so the cleaned text is unchanged; confirm whether only + - : was intended.
    r"[0-9]*[+-:]*[0-9]+",
    r"'s",                  # the two-character sequence 's
)


def function_clean(text):
    """Strip URLs, @-mentions, hashtags, digit runs and "'s" from a string.

    Each pattern in ``_CLEAN_PATTERNS`` is replaced by the empty string, in
    order. Surrounding whitespace is left untouched, so removed tokens can
    leave double spaces behind.

    The former final ``@USER`` substitution is omitted: the ``@[^\\s]*`` pass
    removes every ``@`` together with the token that follows it, so that step
    could never match anything.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    for pattern in _CLEAN_PATTERNS:
        text = re.sub(pattern, "", text)
    return text
# Run the cleaner over every row, then display the two columns used for training.
df['text'] = df['text'].apply(function_clean)
df[['text', 'label']]

In [None]:
# Pull the cleaned texts and their encoded labels out of the frame as arrays.
text = df['text'].values
label = df['label'].values
print(label)

In [None]:
from transformers import BertTokenizer

# Load the tokenizer of the 'bert-base-uncased' checkpoint, with lower-casing on.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [None]:
# Token count of every text (special tokens included); the maximum is reported
# below. An empty corpus yields 0.
token_counts = [len(tokenizer.encode(t, add_special_tokens=True)) for t in text]
max_len = max(token_counts, default=0)

print('Max text length: ', max_len)

In [None]:
# Tokenise the whole training corpus in one batched call. Every text is padded
# or truncated to exactly 120 tokens, so the results are already stacked
# (n_texts, 120) tensors and no per-row torch.cat is needed.
encoded = tokenizer(
    list(text),
    add_special_tokens=True,
    max_length=120,
    padding='max_length',        # replaces the deprecated pad_to_max_length=True
    truncation=True,             # request truncation explicitly for long texts
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']
# Labels go from the extracted array to a tensor aligned row-for-row with input_ids.
label = torch.tensor(label)

In [152]:
from sklearn.model_selection import train_test_split

# A single call splits token ids, attention masks and labels together, so all
# three share the same 80/20 partition (fixed by random_state).
(X_train, X_test,
 train_masks, validation_masks,
 y_train, y_test) = train_test_split(
    input_ids, attention_masks, label,
    test_size=0.2,
    random_state=12,
)

In [None]:
# torch.as_tensor accepts both tensors and arrays. Unlike torch.tensor() it does
# not copy-construct an existing tensor, which is what triggers PyTorch's
# "use sourceTensor.clone().detach()" UserWarning.
train_inputs = torch.as_tensor(X_train)
validation_inputs = torch.as_tensor(X_test)

train_labels = torch.as_tensor(y_train)
validation_labels = torch.as_tensor(y_test)

train_masks = torch.as_tensor(train_masks)
validation_masks = torch.as_tensor(validation_masks)

In [155]:
#create iterator
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 16

# Training set: batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation set: read in a fixed order. Shuffling adds nothing to evaluation
# and makes the collected logits/labels differ in order from run to run.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Pretrained 'bert-base-uncased' with a 2-way classification head; attention
# maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

# Move the model to the device chosen in the setup cell. A bare model.cuda()
# fails on a CPU-only runtime even though `device` falls back to "cpu".
model.to(device)

In [158]:
# Use PyTorch's own AdamW; the AdamW class shipped with transformers is
# deprecated in favour of it. weight_decay=0.0 keeps the decay used by the
# previous optimiser's defaults (torch.optim.AdamW would otherwise apply 0.01).
# NOTE(review): the `AdamW` name imported from transformers in the model cell
# is then unused — consider dropping it from that import.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)

In [177]:
from transformers import get_linear_schedule_with_warmup
epochs = 2

# The scheduler is stepped once per training batch, so its horizon is
# (batches per epoch) x (number of epochs).
total_steps = epochs * len(train_dataloader)

# Linear schedule with no warm-up steps, spanning the whole run.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [178]:
import time
import datetime

def format_time(elapsed):
    """Render a duration in seconds as the string form of a ``timedelta``.

    Args:
        elapsed: Duration in seconds; rounded to the nearest whole second.

    Returns:
        ``str(datetime.timedelta(...))`` of the rounded value, e.g. ``'1:01:01'``.
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

In [179]:
# Turn per-batch logits into one flat list of predicted class ids for flat_f1.
def logits_to_preds(logits):
  """Flatten a sequence of logit batches into predicted class indices.

  Args:
      logits: Sequence of 2-D arrays, one per batch; the argmax is taken
          along axis 1 of each.

  Returns:
      A list with one predicted index per row, in batch order.
  """
  preds = []
  for batch_logits in logits:
    # Index of the largest logit in every row of this batch.
    preds.extend(np.argmax(batch_logits, axis=1).flatten())
  return preds

In [180]:
# Join the per-batch label arrays into one flat list for flat_f1.
def flatten_label(label):
  """Concatenate a sequence of label batches into a single list.

  Args:
      label: Sequence of iterables, one per batch.

  Returns:
      A list of every label, in batch order.
  """
  return [value for batch_labels in label for value in batch_labels]

In [181]:
# Target is for SARCASM
from sklearn.metrics import precision_recall_fscore_support
def flat_f1(logits_list, true_label_list):
  """Score batched predictions against batched true labels.

  Args:
      logits_list: Per-batch logits, flattened with logits_to_preds.
      true_label_list: Per-batch true labels, flattened with flatten_label.

  Returns:
      Whatever precision_recall_fscore_support returns for
      (true labels, predictions).
  """
  y_pred = logits_to_preds(logits_list)
  y_true = flatten_label(true_label_list)
  return precision_recall_fscore_support(y_true, y_pred)

In [None]:
# Training code
import random
seed_val = 42

# Seed every random number generator used here (Python, NumPy, PyTorch CPU and CUDA).
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch, in order.
loss_values = []

for epoch_i in range(0, epochs):
    # Validation outputs are collected afresh for every epoch, so after the
    # loop these lists hold the last epoch's results.
    logits_list = []
    true_label_list = []
    print("")
    print('====== Epoch {:} / {:} ======'.format(epoch_i + 1, epochs))
    print('training...')

    t0 = time.time()
    total_loss = 0

    model.train()

    # ---- training: one optimiser step per batch ----
    for step, batch in enumerate(train_dataloader):

        # Progress update every 100 batches (skipping the very first one).
        if step % 100 == 0 and not step == 0:
            # Elapsed time since the epoch started, formatted by format_time.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Batch layout follows the TensorDataset: (input ids, attention mask, labels).
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # With labels supplied, the first output is used as the loss.
        loss = outputs[0]

        total_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0. This previously read `1,0`, which
        # passed max_norm=1 and norm_type=0 as two positional arguments
        # instead of the intended max_norm of 1.0.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Advance the learning-rate schedule once per batch.
        scheduler.step()

    avg_train_loss = total_loss/len(train_dataloader)

    loss_values.append(avg_train_loss)

    print("")
    print(" Avg training loss: {0:.2f}".format(avg_train_loss))
    print(" Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # ---- validation ----
    print("")
    print("Running validation...")

    t0 = time.time()

    model.eval()

    for batch in validation_dataloader:

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        # No labels are passed and no gradients are tracked during evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Without labels, the first output is taken as the logits.
        logits = outputs[0]

        # Move results to the CPU as NumPy arrays for the metric helpers.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        logits_list.append(logits)
        true_label_list.append(label_ids)

    # Score this epoch's validation predictions. The result used to be bound
    # to `eval`, which shadowed the Python builtin of that name.
    val_metrics = flat_f1(logits_list, true_label_list)

    print(val_metrics)
    print("  validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete")

In [None]:
# Show the first validation batch collected during the last epoch (the lists
# are reset at the start of every epoch): its true labels and its raw logits.
print("true label: ",  true_label_list[0])
print("logits:",  logits_list[0])

In [None]:
# Import the test data
import pandas as pd

from google.colab import files
# Call Colab's files.upload() again; this overwrites the earlier `uploaded` value.
# NOTE(review): presumably this is where test.jsonl is provided, since the
# next cell opens that file by name — confirm.
uploaded = files.upload()

In [None]:
import io
import json
# Parse test.jsonl: one JSON object per line, collected into `test`.
test = []
# Explicit UTF-8 so decoding does not depend on the platform default encoding.
with open('test.jsonl', encoding='utf-8') as fl2:
    for line in fl2:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        if line.strip():
            test.append(json.loads(line))
print(len(test))
# One row per record; the JSON keys become the column names.
test_df = pd.DataFrame(test)

In [191]:
# Prepare the test tweets like the training tweets: take the `response` field,
# run it through function_clean, and extract the values as an array.
test_df['text'] = test_df['response'].apply(function_clean)
test_text = test_df['text'].values

In [None]:
# Tokenise the whole test set in one batched call, with the same settings as
# the training data (padded/truncated to 120 tokens). This replaces the
# per-text loop whose header, `for  in test_text:`, had lost its loop variable
# and was a SyntaxError.
test_encoded = tokenizer(
    list(test_text),
    add_special_tokens=True,
    max_length=120,
    padding='max_length',        # replaces the deprecated pad_to_max_length=True
    truncation=True,             # request truncation explicitly for long texts
    return_attention_mask=True,
    return_tensors='pt',
)

# Already stacked as (n_texts, 120) tensors, in the same order as test_text.
test_input_ids = test_encoded['input_ids']
test_attention_masks = test_encoded['attention_mask']

print('Original: ', test_text[0])
print('Token IDs:', test_input_ids[0])

In [None]:
# torch.as_tensor passes existing tensors through without the copy (and
# UserWarning) that torch.tensor() produces when handed a tensor.
test_inputs = torch.as_tensor(test_input_ids)
test_masks = torch.as_tensor(test_attention_masks)

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 16

test_data = TensorDataset(test_inputs, test_masks)
# The test set MUST be read in its original order: the submission cell numbers
# predictions by position ("twitter_1", "twitter_2", ...). A RandomSampler here
# shuffled the rows, so predictions were attached to the wrong tweets.
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [None]:
# Run the fine-tuned model over the test set and collect the logits per batch.

print("Predicting labels for {:,} test sentences...".format(len(test_inputs)))

model.eval()
test_logits_list = []
t0 = time.time()

for step, batch in enumerate(test_dataloader):
    # Progress line every 100 batches, skipping the very first batch.
    if step != 0 and step % 100 == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

    # Each batch holds (input ids, attention mask); move both to the device.
    b_input_ids, b_input_mask = (t.to(device) for t in batch)

    # Inference only: no gradient tracking.
    with torch.no_grad():
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
        )

    # The first output is taken as the logits; keep them on the CPU as NumPy.
    logits = outputs[0].detach().cpu().numpy()
    test_logits_list.append(logits)

print("   DONE.")

In [175]:
# Build the submission file: one "twitter_<n>,<label>" line per test row.
final_predictions = logits_to_preds(test_logits_list)
df_pred = pd.DataFrame({"prediction": final_predictions})

# Ids are 1-based positions, so row order must match the order of the test file.
df_pred["twitter"] = df_pred.index + 1
df_pred["twitter_index"] = "twitter_" + df_pred["twitter"].astype(str)

# Class 0 is written as SARCASM, everything else as NOT_SARCASM.
# NOTE(review): this assumes the training label encoding assigned 0 to the
# sarcastic class — confirm against the label-encoding cell.
is_class_zero = df_pred["prediction"] == 0
df_pred["result"] = is_class_zero.apply(lambda flag: 'SARCASM' if flag else 'NOT_SARCASM')

# Two columns, no header row and no index column, comma-separated.
submission = df_pred[["twitter_index", "result"]]
submission.to_csv("answer.txt", header=None, index=None, sep=',', mode='w')
files.download("answer.txt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Reference: https://colab.research.google.com/drive/1pTuQhug6Dhl9XalKB0zUGf4FIdYFlpcX#scrollTo=EKOTlwcmxmej 