In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, random, sys, copy
import torch, torch.nn as nn, numpy as np
from tqdm.notebook import tqdm
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from nltk.tokenize import word_tokenize
import statistics
from sklearn.model_selection import train_test_split
import nltk
import math
from sklearn.metrics import f1_score

# Load the Data

In [2]:
train = pd.read_csv('train.En.csv')
test = pd.read_csv('task_A_En_test.csv')

In [3]:
train = train[['tweet', 'sarcastic']]
train.rename(columns={'tweet': 'text'}, inplace=True)
train

Unnamed: 0,text,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1
...,...,...
3463,The population spike in Chicago in 9 months is...,0
3464,You'd think in the second to last English clas...,0
3465,I’m finally surfacing after a holiday to Scotl...,0
3466,Couldn't be prouder today. Well done to every ...,0


In [4]:
test

Unnamed: 0,text,sarcastic
0,"Size on the the Toulouse team, That pack is mo...",0
1,Pinball!,0
2,So the Scottish Government want people to get ...,1
3,villainous pro tip : change the device name on...,0
4,I would date any of these men 🥺,0
...,...,...
1395,I’ve just seen this and felt it deserved a Ret...,0
1396,Omg how an earth is that a pen !!! 🤡,0
1397,Bringing Kanye and drake to a tl near you,0
1398,"I love it when women are referred to as ""girl ...",1


# Glove Dict Setup

In [5]:
glove_file = 'glove.6B.50d.txt'

embeddings_dict = {}

with open(glove_file, 'r', encoding='utf8') as f:
    for i, line in enumerate(f):
        if i == 0:
            print(line)
        line = line.strip().split(' ')
        word = line[0]
        embed = np.asarray(line[1:], "float")

        embeddings_dict[word] = embed

print('Loaded {} words from glove'.format(len(embeddings_dict)))

embedding_matrix = np.zeros((len(embeddings_dict)+1, 50)) #add 1 for padding

word2id = {}
for i, word in enumerate(embeddings_dict.keys()):

    word2id[word] = i                                #Map each word to an index
    embedding_matrix[i] = embeddings_dict[word]      #That index holds the Glove embedding in the embedding matrix

# Our joint vocabulary for both models / sanity check to see if we've loaded it correctly:
print(word2id['the'])
print(embedding_matrix[word2id['the']])

word2id['<pad>'] = embedding_matrix.shape[0] - 1
print(embedding_matrix[word2id['<pad>']])


the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581

Loaded 400000 words from glove
0
[ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
 -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
 -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
 -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
 -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
  4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
  1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
 -3.47

# Classification Dataset

In [6]:
class SarcasmDataset(torch.utils.data.Dataset):

    def __init__(self, sarcastic=None, nonSarcastic=None, split=None, word2id=None, finalized_data=None, data_limit=250, max_length=256):
        """
        :param sarcastic: The sarcastic dataset
        :param nonSarcastic: The non-sarcastic dataset
        :param split: Train or test
        :param word2id: The generated glove word2id dictionary
        :param finalized_data: We'll use this to initialize a validation set without reloading the data.
        :param data_limit: Limiter on the number of examples we load
        :param max_length: Maximum length of the sequence
        """

        self.data_limit = data_limit
        self.max_length = max_length
        self.word2id = word2id

        if finalized_data:
            self.data = finalized_data

        else:

            pos_examples = sarcastic
            neg_examples = nonSarcastic

            pos_examples_tokenized = [(ids, 1) for ids in self.tokenize(pos_examples)]
            neg_examples_tokenized = [(ids, 0) for ids in self.tokenize(neg_examples)]

            self.data = pos_examples_tokenized + neg_examples_tokenized

            random.shuffle(self.data)

    def tokenize(self, examples):

        example_ids = []
        misses = 0              # Count the number of tokens in our dataset which are not covered by glove -- i.e. percentage of unk tokens
        total = 0
        for example in examples:
            tokens = word_tokenize(example)
            ids = []
            for tok in tokens:
                if tok in word2id:
                    ids.append(word2id[tok])
                else:
                    misses += 1
                    ids.append(word2id['unk'])
                total += 1

            if len(ids) >= self.max_length:
                ids = ids[:self.max_length]
            else:
                ids = ids + [word2id['<pad>']]*(self.max_length - len(ids))
            example_ids.append(torch.tensor(ids))
        print('Missed {} out of {} words -- {:.2f}%'.format(misses, total, misses/total))
        return example_ids

    def generate_validation_split(self, ratio=0.8):

        split_idx = int(ratio * len(self.data))

        # Take a chunk of the processed data, and return it in order to initialize a validation dataset
        validation_split = self.data[split_idx:]

        #We'll remove this data from the training data to prevent leakage
        self.data = self.data[:split_idx]

        return validation_split


    def __getitem__(self, item):
        return self.data[item]

    def __len__(self):
        return len(self.data)


# Creating a Glove Model

In [7]:
class GloveModel(nn.Module):

    def __init__(self, pretrained_embedding, hidden_dim, num_hidden_layers, max_length=256):
        super().__init__()

        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(pretrained_embedding))
        self.hidden_layer_1 = nn.Linear(pretrained_embedding.shape[1], hidden_dim)
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(hidden_dim, hidden_dim) for _ in range(num_hidden_layers - 1)]
        )
        self.output_layer = nn.Linear(hidden_dim, 1)

        self.relu = nn.ReLU()

    def forward(self, input):

        embedding = self.embedding(input).squeeze(1)
        embedding = torch.sum(embedding, dim=1)

        hidden = self.relu(self.hidden_layer_1(embedding))
        for layer in self.hidden_layers:
            hidden = self.relu(layer(hidden))

        output = self.output_layer(hidden)

        return output

In [37]:
def predict(model, valid_dataloader):
    sigmoid = nn.Sigmoid()
    outputs = []
    Y = []
    for x,y in valid_dataloader:
        x = x.unsqueeze(1)
        outputs.append(np.round(float(sigmoid(model(x)))))
        Y.append(int(y))
    return outputs, Y

In [32]:
def train_classification(model, train_dataset, valid_dataset, accuracyArray, epochs=100, batch_size=32, print_frequency=100):
    criteria = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.AdamW(model.parameters())            
    

    epochs = epochs
    batch_size = batch_size
    print_frequency = print_frequency

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=1, shuffle=False)

    for i in range(epochs):
        print('### Epoch: ' + str(i+1) + ' ###')

        model.train()
        avg_loss = 0

        for step, data in enumerate(train_dataloader):

            x, y = data
            x = x.unsqueeze(1)

            optimizer.zero_grad()

            model_output = model(x)

            loss = criteria(model_output.squeeze(1), y.float())

            loss.backward()
            optimizer.step()

            avg_loss += loss.item()
            if step % print_frequency == 1:
                print('epoch: {} batch: {} loss: {}'.format(
                    i,
                    step,
                    avg_loss / print_frequency
                ))
                avg_loss = 0

        model.eval()
        with torch.no_grad():
            accuracyArray.append(predict(model, valid_dataloader))
            predict(model, valid_dataloader)
            


In [33]:
train[train["sarcastic"] == 1].to_csv("trainSarcastic.csv", index=False)
train[train["sarcastic"] == 0].to_csv("trainNonSarcastic.csv", index=False)
test.to_csv("testDataset.csv", index=False)

In [34]:
train_dataset = SarcasmDataset("trainSarcastic.csv", "trainNonSarcastic.csv", 'train', word2id)
validation_examples = train_dataset.generate_validation_split()
print('Loaded {} train examples'.format(train_dataset.__len__()))

valid_dataset = SarcasmDataset(finalized_data=validation_examples, word2id=word2id)
print('Loaded {} validation examples'.format(valid_dataset.__len__()))

Missed 1 out of 18 words -- 0.06%
Missed 2 out of 21 words -- 0.10%
Loaded 31 train examples
Loaded 8 validation examples


In [35]:
accuracyArray = []
glove_model = GloveModel(embedding_matrix, 100, 5)
train_classification(glove_model, train_dataset, valid_dataset, accuracyArray)

### Epoch: 1 ###
### Epoch: 2 ###
### Epoch: 3 ###
### Epoch: 4 ###
### Epoch: 5 ###
### Epoch: 6 ###
### Epoch: 7 ###
### Epoch: 8 ###
### Epoch: 9 ###
### Epoch: 10 ###
### Epoch: 11 ###
### Epoch: 12 ###
### Epoch: 13 ###
### Epoch: 14 ###
### Epoch: 15 ###
### Epoch: 16 ###
### Epoch: 17 ###
### Epoch: 18 ###
### Epoch: 19 ###
### Epoch: 20 ###
### Epoch: 21 ###
### Epoch: 22 ###
### Epoch: 23 ###
### Epoch: 24 ###
### Epoch: 25 ###
### Epoch: 26 ###
### Epoch: 27 ###
### Epoch: 28 ###
### Epoch: 29 ###
### Epoch: 30 ###
### Epoch: 31 ###
### Epoch: 32 ###
### Epoch: 33 ###
### Epoch: 34 ###
### Epoch: 35 ###
### Epoch: 36 ###
### Epoch: 37 ###
### Epoch: 38 ###
### Epoch: 39 ###
### Epoch: 40 ###
### Epoch: 41 ###
### Epoch: 42 ###
### Epoch: 43 ###
### Epoch: 44 ###
### Epoch: 45 ###
### Epoch: 46 ###
### Epoch: 47 ###
### Epoch: 48 ###
### Epoch: 49 ###
### Epoch: 50 ###
### Epoch: 51 ###
### Epoch: 52 ###
### Epoch: 53 ###
### Epoch: 54 ###
### Epoch: 55 ###
### Epoch: 56 ###
#

In [38]:
test_dataset = SarcasmDataset("testDataset.csv", 'test', word2id)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)

print('Glove model accuracy: ')
submitted, truths = predict(glove_model, test_dataloader)
f1_sarcastic = f1_score(truths,submitted, average = "binary", pos_label = 1)
f1_sarcastic

Missed 1 out of 15 words -- 0.07%
Missed 0 out of 4 words -- 0.00%
Glove model accuracy: 


0.4

In [39]:
submitted

[0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0]

In [40]:
truths

[1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]