In [1]:
import numpy as np
import pandas as pd

# Load the Data

In [2]:
data = pd.read_csv('train.En.csv')
test = pd.read_csv('task_A_En_test.csv')

In [3]:
data = data[['tweet', 'sarcastic']]
data.rename(columns={'tweet': 'text'}, inplace=True)
data['text'] = data['text'].astype('string')
data['sarcastic'] = data['sarcastic'].astype('int')
data.dropna(inplace=True)
data

Unnamed: 0,text,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1
...,...,...
3463,The population spike in Chicago in 9 months is...,0
3464,You'd think in the second to last English clas...,0
3465,I’m finally surfacing after a holiday to Scotl...,0
3466,Couldn't be prouder today. Well done to every ...,0


# Tokenize Data

In [4]:
from sklearn.model_selection import train_test_split
from collections import Counter
import re

In [5]:
alpha = re.compile('[^a-z ]')

In [6]:
class Tokenizer:
  def __init__(self, sentences):
    words = [w for s in self.get_words(sentences) for w in s]
    self.wtoi = {w:i for i,(w,c) in enumerate(Counter(words).most_common(5000))}
    self.n_words = len(self.wtoi)
  def get_words(self, sentences):
    a = [alpha.sub('', s.lower()).split() for s in sentences]
    return [[' '.join(p) for p in zip(ws, ws[1:])] if len(ws) > 2 else ' '.join(ws) for ws in a]
  def tokenize(self, sentences):
    vec = np.zeros((len(sentences), self.n_words))
    for i, s in enumerate(self.get_words(sentences)):
      for w in s:
        if w in self.wtoi:
          vec[i][self.wtoi[w]] += 1
    return vec

tz = Tokenizer(data['text'])
x_train, x_valid, y_train, y_valid = train_test_split(tz.tokenize(data['text']), data['sarcastic'], test_size = 0.1)

# Setup PyTorch

In [7]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [8]:
device = (
  "cuda"
  if torch.cuda.is_available()
  else "mps"
  if torch.backends.mps.is_available()
  else "cpu"
)
print(f"Using {device} device")

Using cuda device


# Create a Dataset Object

In [9]:
class Tensor_Dataset(Dataset):
  def __init__(self, X, y, device):
    self.X = torch.tensor(np.array(X), dtype=torch.float, device=device)
    self.y = torch.tensor(np.array(y), dtype=torch.float, device=device)

  def __len__(self):
    return len(self.y)

  def __getitem__(self, i):
    return self.X[i], self.y[i]

train_ds = Tensor_Dataset(x_train, y_train, device=device)
valid_ds = Tensor_Dataset(x_valid, y_valid, device=device)

# Train Model

In [10]:
class NeuralNetwork(nn.Module):
  def __init__(self, device):
    super().__init__()
    self.model = nn.Sequential(
      nn.Linear(5000, 5000),
      nn.ReLU(),
      nn.Linear(5000, 5000),
      nn.ReLU(),
      nn.Linear(5000, 5000),
      nn.ReLU(),
      nn.Linear(5000, 5000),
      nn.ReLU(),
      nn.Linear(5000, 2048),
      nn.ReLU(),
      nn.Linear(2048, 1024),
      nn.ReLU(),
      nn.Linear(1024, 128),
      nn.ReLU(),
      nn.Linear(128, 1),
    )
    self.optimizer = torch.optim.AdamW(self.parameters())
    self.criteria = nn.BCEWithLogitsLoss()
    self.to(device=device)
    self.device = device
  def forward(self, X):
    logits = self.model(X)
    return logits
  def fit(self, X, epochs, batch_size, print_freq=100):
    dataloader = DataLoader(X, batch_size, shuffle=True)
    for epoch in range(0, epochs):
      self.train()
      avg_loss = 0
      for step, (x, y) in enumerate(dataloader):
        # Training
        self.optimizer.zero_grad()
        logits = self(x)
        loss = self.criteria(logits, y.unsqueeze(1))
        loss.backward()
        self.optimizer.step()
        # Logging
        avg_loss += loss.item()
        if step % print_freq == 1:
          print('epoch: {} batch: {} loss: {}'.format(epoch, step, avg_loss / print_freq))
          avg_loss = 0
  def predict(self, X):
    logits = self(torch.tensor(X, dtype=torch.float, device=self.device))
    return torch.clamp(logits, 0, 1).cpu().detach().numpy().flatten().round().astype(int)

model = NeuralNetwork(device=device)
print(model)

NeuralNetwork(
  (model): Sequential(
    (0): Linear(in_features=5000, out_features=5000, bias=True)
    (1): ReLU()
    (2): Linear(in_features=5000, out_features=5000, bias=True)
    (3): ReLU()
    (4): Linear(in_features=5000, out_features=5000, bias=True)
    (5): ReLU()
    (6): Linear(in_features=5000, out_features=5000, bias=True)
    (7): ReLU()
    (8): Linear(in_features=5000, out_features=2048, bias=True)
    (9): ReLU()
    (10): Linear(in_features=2048, out_features=1024, bias=True)
    (11): ReLU()
    (12): Linear(in_features=1024, out_features=128, bias=True)
    (13): ReLU()
    (14): Linear(in_features=128, out_features=1, bias=True)
  )
  (criteria): BCEWithLogitsLoss()
)


In [16]:
model.fit(train_ds, epochs=5, batch_size=128, print_freq=1000)

epoch: 0 batch: 1 loss: 5.231396667659283e-05
epoch: 1 batch: 1 loss: 7.647320628166198e-05
epoch: 2 batch: 1 loss: 8.596857078373432e-05
epoch: 3 batch: 1 loss: 1.3636870309710502e-05
epoch: 4 batch: 1 loss: 6.632392853498459e-05


In [17]:
np.average(model.predict(x_valid) == np.array(y_valid))

0.6685878962536023

# Evaluate

In [18]:
from sklearn.metrics import f1_score
# Testing Randomness
f1_score(np.random.randint(0, 2, test['sarcastic'].shape), test['sarcastic'])

0.2042648709315376

## Testing our model

In [19]:
f1_score(model.predict(x_valid), y_valid)

0.08

In [20]:
f1_score(model.predict(tz.tokenize(test['text'])), test['sarcastic'])

0.17989417989417988