In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, random, sys, copy
import torch, torch.nn as nn, numpy as np
from tqdm.notebook import tqdm
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from nltk.tokenize import word_tokenize
import statistics
from sklearn.model_selection import train_test_split
import nltk
import math

# Load the Data

In [3]:
data = pd.read_csv('train.En.csv')
valid = pd.read_csv('task_A_En_test.csv')

In [4]:
data = data[['tweet', 'sarcastic']]
data.rename(columns={'tweet': 'text'}, inplace=True)
data

Unnamed: 0,text,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1
...,...,...
3463,The population spike in Chicago in 9 months is...,0
3464,You'd think in the second to last English clas...,0
3465,I’m finally surfacing after a holiday to Scotl...,0
3466,Couldn't be prouder today. Well done to every ...,0


In [5]:
train, test = train_test_split(data)

In [6]:
train

Unnamed: 0,text,sarcastic
845,hot girls have tree shaped dents in their rear...,1
538,thats it im packing my things and moving to it...,1
2666,Is adulthood really just doing lots of things ...,0
2823,why u being weird TO ME,0
2839,Parents always mess up ur plans😒,0
...,...,...
3301,You don't realize how much you need the homies...,0
361,All the shade i have been hearing about Ben Pl...,1
1558,@MartinSLewis @GMB We share a birthday Martin!...,0
1119,@FeeyaCruz But you get to see me?!,0


In [7]:
test

Unnamed: 0,text,sarcastic
3312,when my mom was preggo with me she craved brus...,0
640,Can’t wait for those $2000 checks to go out im...,1
475,love waking up in a panic 🥴🥴,1
840,Being a business major is legal conversion the...,1
1456,So do men really not understand the expenses o...,0
...,...,...
2953,"I love asking my boyfriend, the biology studen...",0
866,"I might be rubbish at driving, and have a less...",1
1022,"Broadband back, starting to feel a bit more no...",0
1138,I want a bloomin’ onion,0


In [8]:
valid

Unnamed: 0,text,sarcastic
0,"Size on the the Toulouse team, That pack is mo...",0
1,Pinball!,0
2,So the Scottish Government want people to get ...,1
3,villainous pro tip : change the device name on...,0
4,I would date any of these men 🥺,0
...,...,...
1395,I’ve just seen this and felt it deserved a Ret...,0
1396,Omg how an earth is that a pen !!! 🤡,0
1397,Bringing Kanye and drake to a tl near you,0
1398,"I love it when women are referred to as ""girl ...",1


# Evaluate

In [9]:
from sklearn.metrics import f1_score

## Testing Randomness

In [10]:
f1_score(np.random.randint(0, 2, test['sarcastic'].shape), test['sarcastic'])

0.32339089481946626

In [11]:
unsupExamples = []
for row in train.iterrows():
    unsupExamples.append(row[1]['text'])

In [12]:
rawVocab = []
vocabulary = {}
special_tokens = ['<s>', '</s>', '<unk>']

for row_idx, row in train.iterrows():
    example = row['text']
    if pd.isna(example) or example.strip() == '':
        continue # skip rows with empty strings
    words = nltk.word_tokenize(str(example).strip())
    for word_idx, word in enumerate(words):
        if word_idx == 0:
            rawVocab.append([special_tokens[0], word])
        else:
            rawVocab[-1].append(word)
        if word not in vocabulary:
            vocabulary[word] = len(vocabulary) + len(special_tokens)

# Add special tokens to the vocabulary
for i, token in enumerate(special_tokens):
    vocabulary[token] = i

print(vocabulary)



In [13]:
tokenized_examples = []
sos_id = vocabulary['<s>'] #start of sequence
eos_id = vocabulary['</s>'] #end of sequence
unk_id = vocabulary['<unk>']

for row_idx, row in train.iterrows():
    example = str(row['text']).strip()
    print(example)

    example_tokens = [token.lower() for token in word_tokenize(example)]

    token_ids = [sos_id]
    for token in example_tokens:
        if token not in vocabulary:
            token_ids.append(unk_id)
        else:
            token_ids.append(vocabulary[token])

    token_ids.append(eos_id)
    tokenized_examples.append(token_ids)

print(len(tokenized_examples[0]))
print((tokenized_examples[0]))

hot girls have tree shaped dents in their rear bumpers ❤️
thats it im packing my things and moving to italy see u all in a month xo
Is adulthood really just doing lots of things then feeling absolutely exhausted for the rest of the day?
why u being weird TO ME
Parents always mess up ur plans😒
Efy is great
@dave26318632 I don’t get anywhere near £500 a month on UC though. I basically get told “you get what you get, and if you don’t like it you can fuck off and die.”

One thing’s for certain, I wouldn’t cross a puddle for the benefits I’m on, let alone the Channel.
@ArenaSwansea @creedfs Specialty starter or light bite featuring laverbread.
i love the word junction
everyone gets all hyped about ancient debris, but where's my modern debris?
When you're feeling all in your feelings, but a certain BEST FRIEND is making you see reason, and maybe you'll be okay. So damn rude. 😤 @LadyDeadlight
$WISH from email newsletter today - 0.5bn customers..? Amazing if literally 'shoppers' and not repeat

# Trigram Dataset

In [14]:
class TrigramDataset(torch.utils.data.Dataset):

    def __init__(self, tokenized_data):

        self.examples = []
        for example in tokenized_data:              #Iterate over our dataset
            for i in range(0,len(example) - 2):     #Iterate over the tokens of the example
                if example[i] != '<unk>' and example[i+1] != '<unk>' and example[i+2] != '<unk>':
                    self.examples.append(((example[i], example[i+1]), example[i+2]))
                    

    def __getitem__(self, idx):

        return self.examples[idx]

    def __len__(self):

        return len(self.examples)

# Creating a Trigram Model

In [15]:
class TrigramLM(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_hidden_layers):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.hidden_layer_1 = nn.Linear(embedding_dim*2, hidden_dim)
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(hidden_dim, hidden_dim) for _ in range(num_hidden_layers - 1)]
        )
        self.output_layer = nn.Linear(hidden_dim, vocab_size)
        
        self.relu = nn.ReLU()

    def forward(self, input_1, input_2):

        embedding_1 = self.embedding(input_1)
        embedding_2 = self.embedding(input_2)
        embedding = torch.cat((embedding_1, embedding_2), dim=2)

        hidden = self.relu(self.hidden_layer_1(embedding))

        for layer in self.hidden_layers:
            hidden = self.relu(layer(hidden))

        output = self.output_layer(hidden)

        return output

In [16]:

def train_trigram(trigram_model, trigram_dataset):

    criteria = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(trigram_model.parameters())
    # optimizer = torch.optim.SGD(trigram_model.parameters(), lr=0.5)

    softmax = nn.Softmax(dim=2)

    epochs = 3
    batch_size = 32


    train_dataloader = torch.utils.data.DataLoader(trigram_dataset, batch_size=batch_size, shuffle=True)

    for i in range(epochs):
        print('### Epoch: ' + str(i+1) + ' ###')

        trigram_model.train()

        for step, data in enumerate(train_dataloader):

            x, y = data

            x = (x[0].unsqueeze(1), x[1].unsqueeze(1))

            optimizer.zero_grad()
            model_output = trigram_model(x[0], x[1])
            model_output_probabilities = softmax(model_output)

            loss = criteria(model_output_probabilities.squeeze(1), y)

            loss.backward()
            optimizer.step()

In [17]:
trigram_model = TrigramLM(len(vocabulary), 50, 50, 1)
trigram_dataset = TrigramDataset(tokenized_examples[:5000])


train_trigram(trigram_model, trigram_dataset)

### Epoch: 1 ###
### Epoch: 2 ###
### Epoch: 3 ###


In [20]:
def trigramTraining(testDf, outputDf):
        f1_sarcastic = f1_score(test["sarcastic"],outputDf["sarcastic"], average = "binary", pos_label = 1)
        print(f1_sarcastic)