In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, random, sys, copy
import torch, torch.nn as nn, numpy as np
from tqdm.notebook import tqdm
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from nltk.tokenize import word_tokenize
import statistics
from sklearn.model_selection import train_test_split
import nltk
import math

# Load the Data

In [2]:
data = pd.read_csv('train.En.csv')
valid = pd.read_csv('task_A_En_test.csv')

In [3]:
data = data[['tweet', 'sarcastic']]
data.rename(columns={'tweet': 'text'}, inplace=True)
data

Unnamed: 0,text,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1
...,...,...
3463,The population spike in Chicago in 9 months is...,0
3464,You'd think in the second to last English clas...,0
3465,I’m finally surfacing after a holiday to Scotl...,0
3466,Couldn't be prouder today. Well done to every ...,0


In [4]:
train, test = train_test_split(data)

In [5]:
train

Unnamed: 0,text,sarcastic
337,@kidcuisine98 Based confidence and believing i...,1
1330,@sonofsama1 @BoqorofCeel U got more than me🥺,0
576,i love siriusxm. siriusxm is my friend,1
813,my favorite gay drama is the social network,1
2375,Happy New Year!,0
...,...,...
3009,"orange is a fruit, a flavour and a colour and ...",0
3330,there's so much going on in the world right no...,0
969,Trains are so unreliable 🙄,0
2051,I keep seeing people with knockoff skzoo plush...,0


In [6]:
test

Unnamed: 0,text,sarcastic
178,A wrong impression is once again my specialty,1
2069,That was the most mentally draining match I've...,0
2121,When your too good to people they will take &a...,0
1893,I’m waiting for this yt at work to just just h...,0
2357,u know those days where you just need to stare...,0
...,...,...
3185,some of you actually got to ride in the car sh...,0
2545,Every year I watch Soccer Aid and I always for...,0
2294,just got a strawberry coconut refresher from @...,0
2660,We've moved from spoopy season to soupy season.,0


In [7]:
valid

Unnamed: 0,text,sarcastic
0,"Size on the the Toulouse team, That pack is mo...",0
1,Pinball!,0
2,So the Scottish Government want people to get ...,1
3,villainous pro tip : change the device name on...,0
4,I would date any of these men 🥺,0
...,...,...
1395,I’ve just seen this and felt it deserved a Ret...,0
1396,Omg how an earth is that a pen !!! 🤡,0
1397,Bringing Kanye and drake to a tl near you,0
1398,"I love it when women are referred to as ""girl ...",1


# Evaluate

In [8]:
from sklearn.metrics import f1_score

## Testing Randomness

In [9]:
f1_score(np.random.randint(0, 2, test['sarcastic'].shape), test['sarcastic'])

0.35114503816793896

In [10]:
unsupExamples = []
for row in train.iterrows():
    unsupExamples.append(row[1]['text'])

In [11]:
rawVocab = []
vocabulary = {}
special_tokens = ['<s>', '</s>', '<unk>']

for row_idx, row in train.iterrows():
    example = row['text']
    if pd.isna(example) or example.strip() == '':
        continue # skip rows with empty strings
    words = nltk.word_tokenize(str(example).strip())
    for word_idx, word in enumerate(words):
        if word_idx == 0:
            rawVocab.append([special_tokens[0], word])
        else:
            rawVocab[-1].append(word)
        if word not in vocabulary:
            vocabulary[word] = len(vocabulary) + len(special_tokens)

# Add special tokens to the vocabulary
for i, token in enumerate(special_tokens):
    vocabulary[token] = i

print(vocabulary)



In [12]:
tokenized_examples = []
sos_id = vocabulary['<s>'] #start of sequence
eos_id = vocabulary['</s>'] #end of sequence
unk_id = vocabulary['<unk>']

for row_idx, row in train.iterrows():
    example = str(row['text']).strip()
    print(example)

    example_tokens = [token.lower() for token in word_tokenize(example)]

    token_ids = [sos_id]
    for token in example_tokens:
        if token not in vocabulary:
            token_ids.append(unk_id)
        else:
            token_ids.append(vocabulary[token])

    token_ids.append(eos_id)
    tokenized_examples.append(token_ids)

print(len(tokenized_examples[0]))
print((tokenized_examples[0]))

@kidcuisine98 Based confidence and believing in proper eye contact-pilled!
@sonofsama1 @BoqorofCeel U got more than me🥺
i love siriusxm. siriusxm is my friend
my favorite gay drama is the social network
Happy New Year!
hi, I’m doing my first fieldwork in assistive tech and I’m soooo!!! excited!!!
@TotalHansi @JoeBiden Why do so many educationally subnormal Trump supporters really think Alaska is not part of the US?
Can’t wait to be back at uni so I can order more shoes and clothes without my mum telling me off x
I hope the Knicks make a run to the finals. This atmosphere is so fun to watch.
(Shameless Dead Snow 2 misquote) The sequel I did 'nazi' coming! Can't wait! https://t.co/gngxw5kbDh
HAPPY BIRTHDAY MEL!!!!
bring back theme songs!!!
Can anyone direct me to who can help, please?I bought an umbrella from the shop,and only just realised it’s missing its sheath/cover which l obviously want.Who can I contact to having one sent to me?Thanks! #PokemonCenterLondon #pokemoncentrelondon @Po

# Trigram Dataset

In [13]:
class TrigramDataset(torch.utils.data.Dataset):

    def __init__(self, tokenized_data):

        self.examples = []
        for example in tokenized_data:              #Iterate over our dataset
            for i in range(0,len(example) - 2):     #Iterate over the tokens of the example
                if example[i] != '<unk>' and example[i+1] != '<unk>' and example[i+2] != '<unk>':
                    self.examples.append(((example[i], example[i+1]), example[i+2]))
                    

    def __getitem__(self, idx):

        return self.examples[idx]

    def __len__(self):

        return len(self.examples)

# Creating a Trigram Model

In [14]:
class TrigramLM(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_hidden_layers):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.hidden_layer_1 = nn.Linear(embedding_dim*2, hidden_dim)
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(hidden_dim, hidden_dim) for _ in range(num_hidden_layers - 1)]
        )
        self.output_layer = nn.Linear(hidden_dim, vocab_size)
        
        self.relu = nn.ReLU()

    def forward(self, input_1, input_2):

        embedding_1 = self.embedding(input_1)
        embedding_2 = self.embedding(input_2)
        embedding = torch.cat((embedding_1, embedding_2), dim=2)

        hidden = self.relu(self.hidden_layer_1(embedding))

        for layer in self.hidden_layers:
            hidden = self.relu(layer(hidden))

        output = self.output_layer(hidden)

        return output

In [15]:
def train_trigram(trigram_model, trigram_dataset):

    criteria = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(trigram_model.parameters())
    # optimizer = torch.optim.SGD(trigram_model.parameters(), lr=0.5)

    softmax = nn.Softmax(dim=2)

    epochs = 3
    batch_size = 32


    train_dataloader = torch.utils.data.DataLoader(trigram_dataset, batch_size=batch_size, shuffle=True)

    for i in range(epochs):
        print('### Epoch: ' + str(i+1) + ' ###')

        trigram_model.train()

        for step, data in enumerate(train_dataloader):

            x, y = data

            x = (x[0].unsqueeze(1), x[1].unsqueeze(1))

            optimizer.zero_grad()
            model_output = trigram_model(x[0], x[1])
            model_output_probabilities = softmax(model_output)

            loss = criteria(model_output_probabilities.squeeze(1), y)

            loss.backward()
            optimizer.step()

In [16]:
trigram_model = TrigramLM(len(vocabulary), 50, 50, 1)
trigram_dataset = TrigramDataset(tokenized_examples[:5000])


train_trigram(trigram_model, trigram_dataset)

### Epoch: 1 ###


KeyboardInterrupt: 

Unnamed: 0,text,sarcastic
0,"Size on the the Toulouse team, That pack is mo...",0
1,Pinball!,0
2,So the Scottish Government want people to get ...,1
3,villainous pro tip : change the device name on...,0
4,I would date any of these men 🥺,0
...,...,...
1395,I’ve just seen this and felt it deserved a Ret...,0
1396,Omg how an earth is that a pen !!! 🤡,0
1397,Bringing Kanye and drake to a tl near you,0
1398,"I love it when women are referred to as ""girl ...",1


In [24]:
def trigramEval(testDf, outputDf):
        f1_sarcastic = f1_score(testDf["sarcastic"],outputDf["sarcastic"], average = "binary", pos_label = 1)
        print(f1_sarcastic)

trigramEval(valid, valid)


1.0


# Eval

Running the trigram Evaluation Script is as easy as importing your file into a pandas DataFrame, then calling the trigramEval() function, with the Final Test DataFrame as the first input (in this case that is called "valid") and the trainedOutputDataFrame as your second input (in our case we don't yet have the model fully functioning so we are using the valid DF for both inputs)