In [5]:
import os
import cv2
import time
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

In [6]:
# tool for text
import spacy

# load information about words
!python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[K     |████████████████████████████████| 13.6 MB 378 kB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [7]:
df = pd.read_json("Sarcasm_Headlines_Dataset_v2.json",lines=True)
df

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...


# 1st step, clean the dataframe, preprocessing, dataset ready, using word tokenizer

In [8]:
class HeadDataset(Dataset):
    def __init__(self, df, word_dict, max_length):
        self.df = df
        self.word_dict = word_dict
        #self.sent_dict = {0: 0, 1: 1}
        self.max_len = max_length
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        review = row['cleaned'].split(' ')
        x = torch.zeros(self.max_len)
        
        # get review as a list of integers
        for idx in range(len(review)):
            
            # we want to front pad for RNN
            x[self.max_len - len(review) + idx] = self.word_dict[review[idx]]
            
        y = torch.tensor(int(row['is_sarcastic'])).float()
        
        # embedding likes long tensors
        return x.long(), y

In [9]:
import re
# disabling some fancy features of spacy for speed
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'parser'])

rows = []
for idx in tqdm(range(len(df))):
    row = df.iloc[idx].copy()
    # first we remove numeric characters and lowercase everything
    cleaned_headline = re.sub("[^A-Za-z']+", ' ', row['headline']).lower()
    # we let spaCy tokenize and lemmatize the text for us
    tokenized_headline = nlp(cleaned_headline)
    cleaned_tokenized = [token.lemma_ for token in tokenized_headline if ((not token.is_stop) or (' ' in token.text))]
    
    if len(cleaned_tokenized) > 1:
        row['cleaned'] = ' '.join(cleaned_tokenized)
    rows.append(row)
df_clean = pd.DataFrame(rows)
df_clean.dropna(inplace=True)
df_clean.to_csv('headline_cleaned.csv')

  0%|          | 0/28619 [00:00<?, ?it/s]

In [11]:
from collections import Counter
reviews = [review.split(' ') for review in list(df_clean['cleaned'])]
word_freq = dict(Counter([token for review in reviews for token in review]).most_common())
print(len(word_freq))
min_freq = 5
word_dict = {}

# sending all the unknowns to 0
i = 1
for word in word_freq:
    if word_freq[word] > min_freq:
        word_dict[word] = i
        i += 1
    else:
            word_dict[word] = 0


20159


In [12]:
dict_length = max(word_dict.values()) + 1

In [13]:
def pad_sam_len(df_clean):
    max_length = 0
    for idx in tqdm(range(len(df_clean))):
        row = df_clean.iloc[idx]
        length = len(row['cleaned'].split(' '))
        if length > max_length:
            max_length = length
    return max_length
max_length = pad_sam_len(df_clean)

  0%|          | 0/28539 [00:00<?, ?it/s]

In [14]:
# train/val split
np.random.seed(3)
msk=np.random.rand(len(df_clean))<0.8
train=df_clean[msk].reset_index(drop=True)
val=df_clean[~msk].reset_index(drop=True)
train_ds = HeadDataset(train, word_dict, max_length)
val_ds = HeadDataset(val, word_dict, max_length)

# load into dataloader
train_dl = DataLoader(train_ds, batch_size=1000, shuffle=True)
valid_dl = DataLoader(val_ds, batch_size=1000, shuffle=False)

# LSTM model construction, naive linear+relu

In [16]:
# LSTM model for sentiment analysis
# train the embedding during training
class LSTM(nn.Module):
    def __init__(self, dict_length, embedding_size, hidden_size):
        super(LSTM, self).__init__()
        
        self.word_emb = nn.Embedding(dict_length, embedding_size, padding_idx=0)
        
        self.lstm = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, batch_first=True)
        
        self.linear1 = nn.Linear(hidden_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, 1)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        # embed it
        x = self.word_emb(x)
        # pass through LSTM
        output, hidden = self.lstm(x)
        # take the final hidden state
        x = hidden[0]
        # pass it through two linear layers
        x = self.relu(self.linear1(x))
        x = self.linear2(x)
        return torch.squeeze(x)

# make sure everything is working here...
x, y = next(iter(train_dl))
lstm_model = LSTM(dict_length, 50, 50)
lstm_model(x).shape

torch.Size([1000])

# Start to train, metric = prec, recall, acc  Loss=BCElogit

In [17]:
def one_pass(model, dataloader, optimizer, lossFun, backwards=True, print_loss=False):
    
    if backwards == True:
        model.train()
    else:
        model.eval()
    
    total_loss = 0.0
    for x, y in tqdm(dataloader):
        
        y_pred = model(x)
        loss = lossFun(y_pred, y)
        total_loss += loss.item()
        
        if backwards == True:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    avg_loss = total_loss / len(dataloader)
    
    if print_loss == True:
        print(avg_loss)
    
    return avg_loss

def one_pass_metrics(model, dataloader):
    model.eval()
    acc = 0.0
    prec = 0.0
    recall = 0.0
        
    for x, y in tqdm(dataloader):
        y_pred = (torch.sigmoid(model(x)) > 0.5).long()
        total_correct = torch.sum(y == y_pred).item()
        total_true_pos = torch.sum(torch.logical_and(y == 1, y == y_pred)).item()        
        
        acc += total_correct / y.nelement()
        prec += total_true_pos / torch.sum(y_pred).item()
        recall += total_true_pos / torch.sum(y).item()
    
    acc = acc / len(dataloader)
    prec = prec / len(dataloader)
    recall = recall / len(dataloader)
    
    return acc, prec, recall

In [18]:
from torch.utils.data import Dataset, DataLoader,random_split, TensorDataset, RandomSampler, SequentialSampler
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
import torch.optim as optim
from sklearn.metrics import precision_recall_fscore_support, f1_score,confusion_matrix

In [21]:
lossFun = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr = 0.01)

num_epochs = 5

for epoch in tqdm(range(num_epochs)):
    print('Epoch: ', epoch)
    
    loss = one_pass(lstm_model, train_dl, optimizer, lossFun)
    print('Train Loss: ', loss)
    
    loss = one_pass(lstm_model, valid_dl, optimizer, lossFun, backwards=False)
    print('Validation Loss: ', loss)
    
    acc, prec, recall = one_pass_metrics(lstm_model, train_dl)
    print('Train Accuracy: ', acc, 'Train Precision: ', prec,'Train Recall: ', recall)
    
    acc, prec, recall = one_pass_metrics(lstm_model, valid_dl)
    print('Val Accuracy: ', acc, 'Val Precision: ', prec,'Val Recall: ', recall)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch:  0


  0%|          | 0/23 [00:00<?, ?it/s]

Train Loss:  0.5249267401902572


  0%|          | 0/6 [00:00<?, ?it/s]

Validation Loss:  0.5309344530105591


  0%|          | 0/23 [00:00<?, ?it/s]

Train Accuracy:  0.8044111365369946 Train Precision:  0.7754461783336721 Train Recall:  0.8328075534025942


  0%|          | 0/6 [00:00<?, ?it/s]

Val Accuracy:  0.730229044834308 Val Precision:  0.6927842500201572 Val Recall:  0.765178315811679
Epoch:  1


  0%|          | 0/23 [00:00<?, ?it/s]

Train Loss:  0.4003036631190259


  0%|          | 0/6 [00:00<?, ?it/s]

Validation Loss:  0.5045761466026306


  0%|          | 0/23 [00:00<?, ?it/s]

Train Accuracy:  0.8638263412153574 Train Precision:  0.8610727851120344 Train Recall:  0.8533170245244422


  0%|          | 0/6 [00:00<?, ?it/s]

Val Accuracy:  0.7531656920077974 Val Precision:  0.7358994243249207 Val Recall:  0.7399281567364264
Epoch:  2


  0%|          | 0/23 [00:00<?, ?it/s]

Train Loss:  0.3185231374657672


  0%|          | 0/6 [00:00<?, ?it/s]

Validation Loss:  0.5313525497913361


  0%|          | 0/23 [00:00<?, ?it/s]

Train Accuracy:  0.9038743961352654 Train Precision:  0.8946748128442458 Train Recall:  0.9061932106865203


  0%|          | 0/6 [00:00<?, ?it/s]

Val Accuracy:  0.7613976608187135 Val Precision:  0.7399019569190594 Val Recall:  0.7592026039021048
Epoch:  3


  0%|          | 0/23 [00:00<?, ?it/s]

Train Loss:  0.24506212900514188


  0%|          | 0/6 [00:00<?, ?it/s]

Validation Loss:  0.5960125426451365


  0%|          | 0/23 [00:00<?, ?it/s]

Train Accuracy:  0.9339562674802949 Train Precision:  0.9219125952694869 Train Recall:  0.9417330589132714


  0%|          | 0/6 [00:00<?, ?it/s]

Val Accuracy:  0.7621666666666668 Val Precision:  0.7373047931807627 Val Recall:  0.7670637113574288
Epoch:  4


  0%|          | 0/23 [00:00<?, ?it/s]

Train Loss:  0.17909498188806616


  0%|          | 0/6 [00:00<?, ?it/s]

Validation Loss:  0.7535238564014435


  0%|          | 0/23 [00:00<?, ?it/s]

Train Accuracy:  0.961539282990084 Train Precision:  0.9603416598252399 Train Recall:  0.9594038275018586


  0%|          | 0/6 [00:00<?, ?it/s]

Val Accuracy:  0.7634746588693958 Val Precision:  0.7459429593139157 Val Recall:  0.7539466353308549


# The best acc_val is 0.763 for now

# Step 2, try a sub_word tokenizaer, see if it performs better

In [22]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

special_tokens = ["[UNK]", "[PAD]"]
trainer = trainers.WordPieceTrainer(vocab_size=2000, special_tokens=special_tokens)

In [23]:
tokenizer.train_from_iterator(list(df['headline']), trainer=trainer)
encoding = tokenizer.encode("the time has come, the walrus said")
encoding.tokens

['the', 'time', 'has', 'come', ',', 'the', 'wal', '##ru', '##s', 'sa', '##id']

In [24]:
encoding.ids

[167, 464, 388, 1337, 13, 167, 929, 1822, 90, 1395, 184]

In [25]:
max_length = max([len(tokenizer.encode(x).ids) for x in df['headline']])
max_length

319

In [32]:
trained_tokenizer = train_tokenizer(text)

In [26]:
class HeadlineDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        headline_ids = tokenizer.encode(row['headline']).ids
        x = torch.zeros(self.max_len)
        
        # get headline as a list of integers
        for idx in range(len(headline_ids)):
            
            # we want to front pad for RNN
            x[self.max_len - len(headline_ids) + idx] = headline_ids[idx]
            
        y = torch.tensor(row['is_sarcastic']).float()
        
        # embedding likes long tensors
        return x.long(), y
    
ds = HeadlineDataset(df, tokenizer, max_length)

next(iter(ds))

(tensor([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,  

In [28]:
train_len = round(0.8 * len(df))
df_train = df[:train_len]
df_val = df[train_len:].reset_index(drop=True)

ds_train = HeadlineDataset(df_train, tokenizer, max_length)
ds_val = HeadlineDataset(df_val, tokenizer, max_length)

dl_train = DataLoader(ds_train, batch_size=1000, shuffle=True)
dl_val = DataLoader(ds_val, batch_size=1000, shuffle=True)
dict_length = tokenizer.get_vocab_size()

# LSTM model building, train

In [29]:
# LSTM-style Model
class LSTM(nn.Module):
    def __init__(self, dict_length, embedding_size, hidden_size):
        super(LSTM, self).__init__()
        
        self.word_emb = nn.Embedding(dict_length, embedding_size, padding_idx=0)
        
        self.lstm = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, batch_first=True)
        
        self.linear1 = nn.Linear(hidden_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, 1)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        # embed it
        x = self.word_emb(x)
        # pass through LSTM
        output, hidden = self.lstm(x)
        # take the final hidden state
        x = hidden[0]
        # pass it through two linear layers
        x = self.relu(self.linear1(x))
        x = self.linear2(x)
        return torch.squeeze(x)

# make sure everything is working here...
x, y = next(iter(dl_train))
lstm_model = LSTM(dict_length, 50, 50)
lstm_model(x).shape

torch.Size([1000])

In [30]:
lossFun = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr = 0.01)

num_epochs = 5

for epoch in tqdm(range(num_epochs)):
    print('Epoch: ', epoch)
    
    loss = one_pass(lstm_model, dl_train, optimizer, lossFun)
    print('Train Loss: ', loss)
    
    loss = one_pass(lstm_model, dl_val, optimizer, lossFun, backwards=False)
    print('Validation Loss: ', loss)
    
    acc, prec, recall = one_pass_metrics(lstm_model, dl_train)
    print('Train Accuracy: ', acc, 'Train Precision: ', prec,'Train Recall: ', recall)
    
    acc, prec, recall = one_pass_metrics(lstm_model, dl_val)
    print('Val Accuracy: ', acc, 'Val Precision: ', prec,'Val Recall: ', recall)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch:  0


  0%|          | 0/23 [00:00<?, ?it/s]

Train Loss:  0.6974437133125637


  0%|          | 0/6 [00:00<?, ?it/s]

Validation Loss:  0.6700439949830373


  0%|          | 0/23 [00:00<?, ?it/s]

Train Accuracy:  0.6278897255282975 Train Precision:  0.6585727834084771 Train Recall:  0.45759876261697074


  0%|          | 0/6 [00:00<?, ?it/s]

Val Accuracy:  0.6131058931860037 Val Precision:  0.6329680137070719 Val Recall:  0.4352263814199126
Epoch:  1


  0%|          | 0/23 [00:00<?, ?it/s]

Train Loss:  0.6150106632191202


  0%|          | 0/6 [00:00<?, ?it/s]

Validation Loss:  0.589085727930069


  0%|          | 0/23 [00:00<?, ?it/s]

Train Accuracy:  0.7133871751275199 Train Precision:  0.8540885919795914 Train Recall:  0.4816049464731283


  0%|          | 0/6 [00:00<?, ?it/s]

Val Accuracy:  0.7028324125230202 Val Precision:  0.8330345815837749 Val Recall:  0.46414149425794243
Epoch:  2


  0%|          | 0/23 [00:00<?, ?it/s]

Train Loss:  0.48949449347413104


  0%|          | 0/6 [00:00<?, ?it/s]

Validation Loss:  0.5368085404237112


  0%|          | 0/23 [00:00<?, ?it/s]

Train Accuracy:  0.7761488948263298 Train Precision:  0.7861760838839804 Train Recall:  0.7288739165828794


  0%|          | 0/6 [00:00<?, ?it/s]

Val Accuracy:  0.7349125230202579 Val Precision:  0.744260504402368 Val Recall:  0.6704885476228698
Epoch:  3


  0%|          | 0/23 [00:00<?, ?it/s]

Train Loss:  0.43828802005104395


  0%|          | 0/6 [00:00<?, ?it/s]

Validation Loss:  0.44355252385139465


  0%|          | 0/23 [00:00<?, ?it/s]

Train Accuracy:  0.8534185086227835 Train Precision:  0.8415728344749366 Train Recall:  0.8533892346553446


  0%|          | 0/6 [00:00<?, ?it/s]

Val Accuracy:  0.8033112338858195 Val Precision:  0.7924068681524297 Val Recall:  0.7919459780861979
Epoch:  4


  0%|          | 0/23 [00:00<?, ?it/s]

Train Loss:  0.33212971169015637


  0%|          | 0/6 [00:00<?, ?it/s]

Validation Loss:  0.416333610812823


  0%|          | 0/23 [00:00<?, ?it/s]

Train Accuracy:  0.8916917658489191 Train Precision:  0.8795391393984445 Train Recall:  0.8952401318798817


  0%|          | 0/6 [00:00<?, ?it/s]

Val Accuracy:  0.8223360957642726 Val Precision:  0.8137444539128732 Val Recall:  0.8095306966958855


# Step 3, try pretrained word embedding, word2vec glove

In [31]:
import gensim.downloader as api

info = api.info()  # show info about available models/datasets
w2v_model = api.load("glove-wiki-gigaword-100")

In [33]:
df_big = pd.read_csv('abcnews-date-text.csv').rename(columns={'headline_text': 'headline'})

# disabling some fancy features of spacy for speed
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'parser'])

In [34]:
rows = []
for idx in tqdm(range(len(df_big))):
    row = df_big.iloc[idx].copy()
    
    # first we remove numeric characters and lowercase everything
    cleaned_headline = re.sub("[^A-Za-z']+", ' ', row['headline']).lower()
    
    # we let spaCy tokenize and lemmatize the text for us
    tokenized_headline = nlp(cleaned_headline)
    cleaned_tokenized = [
        token.lemma_ for token in tokenized_headline if ((not token.is_stop) or (' ' in token.text))]
    
    if len(cleaned_tokenized) > 1:
        row['cleaned'] = ' '.join(cleaned_tokenized)
    rows.append(row)
df_big_clean = pd.DataFrame(rows)
df_big_clean.to_csv('cleaned_big_headline.csv', index=False)
df_big_clean.head()

  0%|          | 0/1244184 [00:00<?, ?it/s]

Unnamed: 0,publish_date,headline,cleaned
0,20030219,aba decides against community broadcasting lic...,aba decide community broadcasting licence
1,20030219,act fire witnesses must be aware of defamation,act fire witness aware defamation
2,20030219,a g calls for infrastructure protection summit,g call infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise,air nz staff aust strike pay rise
4,20030219,air nz strike to affect australian travellers,air nz strike affect australian traveller


In [37]:
print('Number of Rows before cleaning:', len(df_big_clean))
df_clean = df_big_clean.dropna()
print('Number of Rows after cleaning:', len(df_clean))

Number of Rows before cleaning: 1244184
Number of Rows after cleaning: 1240796


In [42]:
df_clean['cleaned'].isna().sum()

0

## Need to re-read df_clean, variable been used accidentally

In [59]:
df_clean=pd.read_csv('headline_cleaned.csv')

# W2V part

In [60]:
from gensim.models import Word2Vec

sents = list(df_clean['cleaned'])
sents = [sent.split(' ') for sent in sents]

# different options for how to perform word2vec training
# check out documentation for more options related to sampling frequent vs. infrequent words
w2v_model = Word2Vec(# only consider words that show up at least 5 times
                     min_count = 5, 
                     window = 3,
                     vector_size = 50)
w2v_model.build_vocab(sents, progress_per=10000)

In [61]:
w2v_model.train(sents, total_examples = w2v_model.corpus_count, epochs = 100, report_delay=1)

(16887498, 20521100)

## some basic tests 

In [62]:
w2v_emb = w2v_model.wv
w2v_emb.most_similar('president')

[('donald', 0.5253666639328003),
 ('adviser', 0.5106316208839417),
 ('gulf', 0.5073413848876953),
 ('daca', 0.4630649983882904),
 ('mount', 0.46194523572921753),
 ('cabinet', 0.45437824726104736),
 ('congress', 0.4464903771877289),
 ('trade', 0.43497225642204285),
 ('rescind', 0.4297899603843689),
 ('macron', 0.422260046005249)]

w2v_emb.get_index('president')

In [64]:
max_length = max([len(sent) for sent in sents])
max_length

104

In [65]:
df_clean.head()

Unnamed: 0.1,Unnamed: 0,is_sarcastic,headline,article_link,cleaned
0,0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...,thirtysomethe scientist unveil doomsday clock ...
1,1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...,dem rep totally nail congress fall short gende...
2,2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...,eat veggie deliciously different recipe
3,3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...,inclement weather prevent liar get work
4,4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...,mother come pretty close word ' streaming ' co...


In [66]:
# max length here will be maximum length of the sequence predicted
class HeadlineDataset(Dataset):
    def __init__(self, df, w2v_model, max_length):
        self.df = df
        self.w2v_model = w2v_model
        self.max_len = max_length
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        headline = row['cleaned'].split(' ')
        x = torch.zeros(self.max_len)
        
        # skip all tokens missing from dictionary
        cleaned_headline = []
        for token in headline:
            try:
                token_id = self.w2v_model.wv.get_index(token)
                # shifting the index by one so I can make 0 the padding index
                cleaned_headline.append(token_id+1)
            except:
                None

                # get headline as a list of integers
        for idx in range(len(cleaned_headline)):

            x[self.max_len - len(cleaned_headline) + idx] = cleaned_headline[idx]
            
        y = torch.tensor(row['is_sarcastic']).float()
        
        # embedding likes long tensors
        return x.long(), y
    
ds = HeadlineDataset(df_clean, w2v_model, max_length)

next(iter(ds))

(tensor([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,  254,  255, 5252, 2046,  579,  949]),
 tensor(1.))

In [67]:
train_len = round(0.8 * len(df_clean))
df_train = df_clean[:train_len]
df_val = df_clean[train_len:].reset_index(drop=True)

ds_train = HeadlineDataset(df_train, w2v_model, max_length)
ds_val = HeadlineDataset(df_val, w2v_model, max_length)

dl_train = DataLoader(ds_train, batch_size=1000, shuffle=True)
dl_val = DataLoader(ds_val, batch_size=1000, shuffle=True)

In [68]:
# LSTM-style Model
class LSTM(nn.Module):
    def __init__(self, w2v_model, embedding_size, hidden_size):
        super(LSTM, self).__init__()
        
        # add a row of zeros to gensim word embedding matrix so we can have a padding index
        weights = torch.tensor(w2v_model.wv.vectors)
        padding_row = torch.zeros((1,50))
        weights = torch.cat((padding_row,weights),axis=0)

        self.word_emb = nn.Embedding.from_pretrained(weights,
                                                     freeze=True, padding_idx=0)
        
        self.lstm = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, batch_first=True)
        
        self.linear1 = nn.Linear(hidden_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, 1)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        # embed it
        x = self.word_emb(x)
        # pass through LSTM
        output, hidden = self.lstm(x)
        # take the final hidden state
        x = hidden[0]
        # pass it through two linear layers
        x = self.relu(self.linear1(x))
        x = self.linear2(x)
        return torch.squeeze(x)

# make sure everything is working here...
x, y = next(iter(dl_train))
lstm_model = LSTM(w2v_model, 50, 50)
lstm_model(x).shape

torch.Size([1000])

In [69]:
lossFun = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr = 0.01)

num_epochs = 5

for epoch in tqdm(range(num_epochs)):
    print('Epoch: ', epoch)
    
    loss = one_pass(lstm_model, dl_train, optimizer, lossFun)
    print('Train Loss: ', loss)
    
    loss = one_pass(lstm_model, dl_val, optimizer, lossFun, backwards=False)
    print('Validation Loss: ', loss)
    
    acc, prec, recall = one_pass_metrics(lstm_model, dl_train)
    print('Train Accuracy: ', acc, 'Train Precision: ', prec,'Train Recall: ', recall)
    
    acc, prec, recall = one_pass_metrics(lstm_model, dl_val)
    print('Val Accuracy: ', acc, 'Val Precision: ', prec,'Val Recall: ', recall)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch:  0


  0%|          | 0/23 [00:00<?, ?it/s]

Train Loss:  0.6088835311972577


  0%|          | 0/6 [00:00<?, ?it/s]

Validation Loss:  0.5587714811166128


  0%|          | 0/23 [00:00<?, ?it/s]

Train Accuracy:  0.7136361638675247 Train Precision:  0.7084503739818707 Train Recall:  0.681232083493559


  0%|          | 0/6 [00:00<?, ?it/s]

Val Accuracy:  0.7117297551789076 Val Precision:  0.7031657375359383 Val Recall:  0.6844846843528227
Epoch:  1


  0%|          | 0/23 [00:00<?, ?it/s]

Train Loss:  0.5238144553225973


  0%|          | 0/6 [00:00<?, ?it/s]

Validation Loss:  0.49935297667980194


  0%|          | 0/23 [00:00<?, ?it/s]

Train Accuracy:  0.7714901899230892 Train Precision:  0.7431576176291562 Train Recall:  0.7974149264996717


  0%|          | 0/6 [00:00<?, ?it/s]

Val Accuracy:  0.7500583804143126 Val Precision:  0.7218585462412374 Val Recall:  0.7693823760873245
Epoch:  2


  0%|          | 0/23 [00:00<?, ?it/s]

Train Loss:  0.4656127121137536


  0%|          | 0/6 [00:00<?, ?it/s]

Validation Loss:  0.48145603636900586


  0%|          | 0/23 [00:00<?, ?it/s]

Train Accuracy:  0.8056036728927956 Train Precision:  0.7906347522665943 Train Recall:  0.8070549040238658


  0%|          | 0/6 [00:00<?, ?it/s]

Val Accuracy:  0.7644416195856875 Val Precision:  0.7493637033683916 Val Recall:  0.7545628714608621
Epoch:  3


  0%|          | 0/23 [00:00<?, ?it/s]

Train Loss:  0.41896376791207685


  0%|          | 0/6 [00:00<?, ?it/s]

Validation Loss:  0.4709816922744115


  0%|          | 0/23 [00:00<?, ?it/s]

Train Accuracy:  0.8368003976351176 Train Precision:  0.8314167311943301 Train Recall:  0.8264365892892426


  0%|          | 0/6 [00:00<?, ?it/s]

Val Accuracy:  0.7707956685499058 Val Precision:  0.7632286724218584 Val Recall:  0.750807478613328
Epoch:  4


  0%|          | 0/23 [00:00<?, ?it/s]

Train Loss:  0.37944700018219324


  0%|          | 0/6 [00:00<?, ?it/s]

Validation Loss:  0.4794476677974065


  0%|          | 0/23 [00:00<?, ?it/s]

Train Accuracy:  0.8521327368806572 Train Precision:  0.8262923360393609 Train Recall:  0.8743720774778541


  0%|          | 0/6 [00:00<?, ?it/s]

Val Accuracy:  0.7669915254237288 Val Precision:  0.7406090624368415 Val Recall:  0.784071314400872


# Best val_acc is about 0.77, thats it?


# Try Bert, but frozen params vision, otherwise never stop even if CUDA

In [3]:
from transformers import AutoModel
from transformers import AutoTokenizer

In [4]:
model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
from torch.utils.data import Dataset, DataLoader,random_split, TensorDataset, RandomSampler, SequentialSampler
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
import torch.optim as optim
from sklearn.metrics import precision_recall_fscore_support, f1_score,confusion_matrix

In [7]:
import os
import cv2
import time
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

In [8]:
df=pd.read_csv('headline_cleaned.csv')

In [9]:
tokenizer.tokenize(df.iloc[0]['headline'])

['thirty',
 '##some',
 '##thing',
 'scientists',
 'un',
 '##ve',
 '##il',
 'doom',
 '##sd',
 '##ay',
 'clock',
 'of',
 'hair',
 'loss']

In [10]:
max_length = tokenizer.model_max_length
max_length

512

In [11]:
tokenizer(df.iloc[0]['headline'], return_tensors="pt")

{'input_ids': tensor([[  101,  4228, 14045, 20744,  6529,  4895,  3726,  4014, 12677, 16150,
          4710,  5119,  1997,  2606,  3279,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [12]:
class HeadlineDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        headline = row['headline']
        y = torch.tensor(row['is_sarcastic']).float()
        
        # bert model will want all of these
        # max_length here is 512
        output_dict = tokenizer(headline, padding="max_length", return_tensors="pt")
        
        # had to squeeze it to get it to work
        return (torch.squeeze(output_dict['input_ids']),
                torch.squeeze(output_dict['token_type_ids']),
                torch.squeeze(output_dict['attention_mask']), y)
    
ds = HeadlineDataset(df, tokenizer)

input_ids, token_type_ids, attention_mask, y = next(iter(ds))

In [13]:
train_len = round(0.8 * len(df))
df_train = df[:train_len]
df_val = df[train_len:].reset_index(drop=True)

ds_train = HeadlineDataset(df_train, tokenizer)
ds_val = HeadlineDataset(df_val, tokenizer)

# had to turn the batch size down to keep my laptop from crashing
dl_train = DataLoader(ds_train, batch_size=10, shuffle=True)
dl_val = DataLoader(ds_val, batch_size=10, shuffle=True)

In [14]:
import torch
print(torch.__version__)

1.12.1


In [15]:
input_ids, token_type_ids, attention_mask, y = next(iter(dl_train))

# freeze the parameters so we don't compute backprop w.r.t BERT!
# will help computation time
for param in model.parameters():
    param.requires_grad = False

outputs = model(input_ids, token_type_ids, attention_mask)

In [18]:
class BERT_sarcasm(nn.Module):
    def __init__(self, model, hidden_size):
        super(BERT_sarcasm, self).__init__()
        
        # freeze the parameters
        for param in model.parameters():
            param.requires_grad = False #forzen must, otherwise kernal dead
        self.bert = model
        
        # size of embedding vector for BERT is 768
        self.linear1 = nn.Linear(768, hidden_size)
        self.linear2 = nn.Linear(hidden_size, 1)
        self.relu = nn.ReLU()
        
    def forward(self, input_ids, token_type_ids, attention_mask):
        # get the bert emebedding
        outputs = self.bert(input_ids, token_type_ids, attention_mask)
        x = outputs[0][:,0,:]
        
        x = self.relu(self.linear1(x))
        x = self.linear2(x)
        return torch.squeeze(x)

# make sure everything is working here...
input_ids, token_type_ids, attention_mask, y = next(iter(dl_train))
BERT_model = BERT_sarcasm(model, 50)
BERT_model(input_ids, token_type_ids, attention_mask).shape

torch.Size([10])

In [19]:
def one_pass(model, dataloader, optimizer, lossFun, backwards=True, print_loss=False):
    
    if backwards == True:
        model.train()
    else:
        model.eval()
    
    total_loss = 0.0
    for input_ids, token_type_ids, attention_mask, y in tqdm(dataloader):
        
        y_pred = model(input_ids, token_type_ids, attention_mask)
        loss = lossFun(y_pred, y)
        total_loss += loss.item()
        
        if backwards == True:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    avg_loss = total_loss / len(dataloader)
    
    if print_loss == True:
        print(avg_loss)
    
    return avg_loss

def one_pass_metrics(model, dataloader):
    model.eval()
    acc = 0.0
    prec = 0.0
    recall = 0.0
        
    for input_ids, token_type_ids, attention_mask, y in tqdm(dataloader):
        y_pred = (torch.sigmoid(model(input_ids, token_type_ids, attention_mask)) > 0.5).long()
        total_correct = torch.sum(y == y_pred).item()
        total_true_pos = torch.sum(torch.logical_and(y == 1, y == y_pred)).item()        
        
        acc += total_correct / y.nelement()
        prec += total_true_pos / torch.sum(y_pred).item()
        recall += total_true_pos / torch.sum(y).item()
    
    acc = acc / len(dataloader)
    prec = prec / len(dataloader)
    recall = recall / len(dataloader)
    
    return acc, prec, recall

In [21]:
lossFun = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(BERT_model.parameters(), lr = 0.01)

num_epochs = 5

for epoch in tqdm(range(num_epochs)):
    print('Epoch: ', epoch)
    
    loss = one_pass(BERT_model, dl_train, optimizer, lossFun)
    print('Train Loss: ', loss)
    
    loss = one_pass(BERT_model, dl_val, optimizer, lossFun, backwards=False)
    print('Validation Loss: ', loss)
    
    acc, prec, recall = one_pass_metrics(BERT_model, dl_train)
    print('Train Accuracy: ', acc, 'Train Precision: ', prec,'Train Recall: ', recall)
    
    acc, prec, recall = one_pass_metrics(BERT_model, dl_val)
    print('Val Accuracy: ', acc, 'Val Precision: ', prec,'Val Recall: ', recall)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch:  0


  0%|          | 0/2284 [00:00<?, ?it/s]

KeyboardInterrupt: 

## due to vpn issue, colab can't be used at this moment, thus this cell takes too long to finish.

In [22]:
df

Unnamed: 0.1,Unnamed: 0,is_sarcastic,headline,article_link,cleaned
0,0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...,thirtysomethe scientist unveil doomsday clock ...
1,1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...,dem rep totally nail congress fall short gende...
2,2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...,eat veggie deliciously different recipe
3,3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...,inclement weather prevent liar get work
4,4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...,mother come pretty close word ' streaming ' co...
...,...,...,...,...,...
28534,28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...,jews celebrate rosh hashasha
28535,28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...,internal affair investigator disappoint conspi...
28536,28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...,beautiful acceptance speech week come queer ko...
28537,28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...,mars probe destroy orbit spielberg gate space ...
