In [166]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, random, sys, copy
import torch, torch.nn as nn, numpy as np
from tqdm.notebook import tqdm
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from nltk.tokenize import word_tokenize
import statistics
from sklearn.model_selection import train_test_split
import nltk
import math
from sklearn.metrics import f1_score

# Load the Data

In [167]:
data = pd.read_csv('train.En.csv')
test = pd.read_csv('task_A_En_test.csv')

In [168]:
data = data[['tweet', 'sarcastic']]
data.rename(columns={'tweet': 'text'}, inplace=True)
data['text'] = data['text'].astype('string')
data['sarcastic'] = data['sarcastic'].astype('int')
data.dropna(inplace=True)
data

Unnamed: 0,text,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1
...,...,...
3463,The population spike in Chicago in 9 months is...,0
3464,You'd think in the second to last English clas...,0
3465,I’m finally surfacing after a holiday to Scotl...,0
3466,Couldn't be prouder today. Well done to every ...,0


# Glove Dict Setup

In [169]:
glove_file = 'glove.6B.50d.txt'

word2id = {}
embedding_matrix = []

with open(glove_file, 'r', encoding='utf8') as f:
  for i, line in enumerate(f):
    word, *embed = line.strip().split()
    word2id[word] = i
    embedding_matrix.append(embed)

embedding_matrix = np.array(embedding_matrix, dtype=np.float32)
n_words, embed_dim = embedding_matrix.shape

print('Loaded {} words with {}-dimensional embeddings from glove'.format(n_words, embed_dim))

word2id['<pad>'] = n_words
embedding_matrix = np.concatenate((embedding_matrix, np.zeros((1, embed_dim))), axis=0)

Loaded 400000 words with 50-dimensional embeddings from glove


# Classification Dataset

In [170]:
import re
alpha = re.compile('[^a-z ]')

In [171]:
class SarcasmDataset(torch.utils.data.Dataset):
  def __init__(self, X, y, max_length=256):
    self.max_length = max_length
    self.X = self.tokenize(X)
    self.y = np.array(y)
  
  def tokenize(self, sentences):
    tokenized = []
    for s in sentences:
      tokens = word_tokenize(s)
      ids = [word2id[tok] if tok in word2id else word2id['unk'] for tok in tokens]
      if len(ids) >= self.max_length:
        ids = ids[:self.max_length]
      else:
        ids.extend([word2id['<pad>']]*(self.max_length - len(ids)))
        tokenized.append(torch.tensor(ids))
    return tokenized

  def __len__(self):
    return len(self.y)

  def __getitem__(self, i):
    return self.X[i], self.y[i]

x_train, x_valid, y_train, y_valid = train_test_split(data['text'], data['sarcastic'], test_size = 0.1)
train_ds = SarcasmDataset(x_train, y_train)
valid_ds = SarcasmDataset(x_valid, y_valid)

# Creating a Glove Model

In [172]:
class GloveModel(nn.Module):

    def __init__(self, pretrained_embedding, hidden_dim, num_hidden_layers, max_length=256):
        super().__init__()

        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(pretrained_embedding))
        self.hidden_layer_1 = nn.Linear(pretrained_embedding.shape[1], hidden_dim)
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(hidden_dim, hidden_dim) for _ in range(num_hidden_layers - 1)]
        )
        self.output_layer = nn.Linear(hidden_dim, 1)

        self.relu = nn.ReLU()

    def forward(self, input):

        embedding = self.embedding(input).squeeze(1)
        embedding = torch.sum(embedding, dim=1)

        hidden = self.relu(self.hidden_layer_1(embedding))
        for layer in self.hidden_layers:
            hidden = self.relu(layer(hidden))

        output = self.output_layer(hidden)

        return output

In [173]:
def predict(model, valid_dataloader):
    sigmoid = nn.Sigmoid()
    outputs = []
    Y = []
    for x,y in valid_dataloader:
        x = x.unsqueeze(1)
        outputs.append(np.round(float(sigmoid(model(x)))))
        Y.append(int(y))
    return outputs, Y

In [174]:
def train_classification(model, train_dataset, valid_dataset, accuracyArray, epochs=100, batch_size=32, print_frequency=100):
    criteria = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.AdamW(model.parameters())            
    

    epochs = epochs
    batch_size = batch_size
    print_frequency = print_frequency

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=1, shuffle=False)

    for i in range(epochs):
        print('### Epoch: ' + str(i+1) + ' ###')

        model.train()
        avg_loss = 0

        for step, data in enumerate(train_dataloader):

            x, y = data
            x = x.unsqueeze(1)

            optimizer.zero_grad()

            model_output = model(x)

            loss = criteria(model_output.squeeze(1), y.float())

            loss.backward()
            optimizer.step()

            avg_loss += loss.item()
            if step % print_frequency == 1:
                print('epoch: {} batch: {} loss: {}'.format(
                    i,
                    step,
                    avg_loss / print_frequency
                ))
                avg_loss = 0

        model.eval()
        with torch.no_grad():
            accuracyArray.append(predict(model, valid_dataloader))
            predict(model, valid_dataloader)
            


In [175]:
print('Loaded {} train examples'.format(train_ds.__len__()))
print('Loaded {} validation examples'.format(valid_ds.__len__()))

Loaded 3120 train examples
Loaded 347 validation examples


In [176]:
accuracyArray = []
glove_model = GloveModel(embedding_matrix, 100, 5)
train_classification(glove_model, train_ds, valid_ds, accuracyArray)

### Epoch: 1 ###
epoch: 0 batch: 1 loss: 0.014434148073196412
### Epoch: 2 ###
epoch: 1 batch: 1 loss: 0.010641739368438721
### Epoch: 3 ###
epoch: 2 batch: 1 loss: 0.009007288217544556
### Epoch: 4 ###
epoch: 3 batch: 1 loss: 0.011633729934692383
### Epoch: 5 ###
epoch: 4 batch: 1 loss: 0.011921521425247193
### Epoch: 6 ###
epoch: 5 batch: 1 loss: 0.010031480193138122
### Epoch: 7 ###
epoch: 6 batch: 1 loss: 0.010972358584403992
### Epoch: 8 ###
epoch: 7 batch: 1 loss: 0.010448537170886993
### Epoch: 9 ###
epoch: 8 batch: 1 loss: 0.011831493973731995
### Epoch: 10 ###
epoch: 9 batch: 1 loss: 0.0102605801820755
### Epoch: 11 ###
epoch: 10 batch: 1 loss: 0.01034174919128418
### Epoch: 12 ###
epoch: 11 batch: 1 loss: 0.010510811805725098
### Epoch: 13 ###
epoch: 12 batch: 1 loss: 0.009871867001056672
### Epoch: 14 ###
epoch: 13 batch: 1 loss: 0.010184126198291779
### Epoch: 15 ###
epoch: 14 batch: 1 loss: 0.007484771311283112
### Epoch: 16 ###
epoch: 15 batch: 1 loss: 0.00808545559644699

In [180]:
test_ds = SarcasmDataset(test['text'], test['sarcastic'])
test_dataloader = torch.utils.data.DataLoader(test_ds, batch_size=1, shuffle=False)

print('Glove model accuracy: ')
submitted, truths = predict(glove_model, test_dataloader)
f1_sarcastic = f1_score(truths,submitted, average = "binary", pos_label = 1)
f1_sarcastic

Glove model accuracy: 


0.17204301075268816

In [178]:
submitted

[0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [179]:
truths

[0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
