# Imports

In [161]:
import pandas as pd
import numpy as np

#to create partial pos tag fn
from functools import partial

#Spacy imports
import spacy
nlp = spacy.load("en_core_web_sm")

#NLTK imports
#Stopwords
from nltk.corpus import stopwords
#Tokenization
from nltk.tokenize import word_tokenize
#POS Tagging
from nltk.tag import pos_tag
#Stemming
from nltk.stem import PorterStemmer
#Lemmatization
from nltk.stem import WordNetLemmatizer

#Huggingface Tokenizers (BPE & WordPiece)
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

NLTK SSL workaround for Mac Users

In [5]:
import nltk
import ssl

# Workaround for certificate errors when downloading NLTK data:
# if this Python build exposes an unverified-context factory, install it as the
# default HTTPS context factory. WARNING: this turns off certificate verification
# for every HTTPS request made through the default context in this kernel.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

In [109]:
# One-time NLTK resource downloads. Uncomment and run on a fresh environment,
# then comment out again so the notebook does not hit the network on every run.
# NOTE(review): word_tokenize may also need the 'punkt' / 'punkt_tab' resource — confirm.
#nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('wordnet')
# nltk.download('universal_tagset')
# nltk.download('stopwords')

# Data Loading

In [170]:
# Load the cleaned headlines and report the frame's dimensions,
# followed by an empty line, before previewing the first rows.
cleaned_df = pd.read_csv('data/sarcasm_headlines_cleaned.csv')
print(cleaned_df.shape, end='\n\n')
cleaned_df.head()

(26709, 5)



Unnamed: 0,article_link,headline,is_sarcastic,headline_cleaned,num_changes
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sues over secret bl...,1
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catches up to our thorny ...,1
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting to fear sons web series closest t...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner just wants wife to listen not come up ...,1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wishes snape happy birthday in the ...,1


# Combined Preprocessor
This combines the spaCy, NLTK and subword-tokenizer (BPE & WordPiece) sections below into a single preprocessing function

In [181]:
#caching parsed_text: run every cleaned headline through the spaCy pipeline once, so that
#preprocess_data can reuse the parsed docs instead of re-parsing them on every call
parsed_text = cleaned_df['headline_cleaned'].apply(lambda x: nlp(x))

In [182]:
def preprocess_data(df:pd.DataFrame, stopword_removed:bool=False) -> pd.DataFrame:
    """
    This function encapsulates the preprocessing logic for the News Headline Sarcasm Detection dataset.
    The following steps are applied: stopword removal (optional), word tokenization, pos tagging, stemming (nltk only),
    lemmatization, BPE tokenization and WordPiece tokenization.

    Columns added to the returned copy (``df`` itself is not modified):
        tokenized_text_spacy, pos_tagged_text_spacy, lemmatized_text_spacy,
        tokenized_text_nltk, pos_tagged_text_nltk, stemmed_text_nltk, lemmatized_text_nltk,
        bpe_tokens, wordpiece_tokens

    Notes:
        * The spaCy columns are built from the notebook-level ``parsed_text`` cache (one parsed doc per
          row of ``df``), which must be computed before calling this function.
        * Stopword removal affects only the spaCy and NLTK columns; the BPE / WordPiece tokenizers are
          always trained on, and applied to, the full ``headline_cleaned`` text.
        * When stopword removal leaves a headline with no spaCy tokens, its three spaCy columns are NaN.
        * Side effect: the result is written to ``./data/preprocessed_stopwords_removed.csv`` or
          ``./data/preprocessed.csv`` depending on ``stopword_removed``.

    :param df: pd.DataFrame of cleaned data (must contain a ``headline_cleaned`` column)
    :param stopword_removed: Boolean, whether to remove stopwords
    :return: pd.DataFrame of preprocessed data
    :raises ValueError: if the cached ``parsed_text`` does not hold exactly one entry per row of ``df``
    """

    cleaned_df = df.copy()
    #NOTE: The function takes approximately 2 mins per execution because of this spacy processing pipeline
    #Comment out the below line in favor of the previous cell IF you intend on running the function multiple times
    # parsed_text = cleaned_df['headline_cleaned'].apply(lambda x: nlp(x))

    # Fail fast (before the tokenizers are trained) if the cache was built from a different frame.
    if len(parsed_text) != len(cleaned_df):
        raise ValueError(
            f"parsed_text has {len(parsed_text)} entries but df has {len(cleaned_df)} rows; "
            "re-run the parsed_text caching cell on the same data"
        )

    nltk_stopwords = set(stopwords.words('english'))
    universal_pos_tag = partial(pos_tag, tagset='universal')
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    # Universal tags whose tokens are kept for NLTK stemming / lemmatization.
    content_tags = ('ADJ', 'NOUN', 'ADV', 'VERB')

    bpe_tokenizer = Tokenizer(models.BPE())
    bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    bpe_trainer = trainers.BpeTrainer(vocab_size=1000, min_frequency=2, special_tokens=["<unk>", "<pad>", "<s>", "</s>"])

    wordpiece_tokenizer = Tokenizer(models.WordPiece())
    wordpiece_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    wordpiece_trainer = trainers.WordPieceTrainer(vocab_size=1000, min_frequency=2, continuing_subword_prefix='_', special_tokens=["<unk>", "<pad>", "<s>", "</s>"])

    # Train on the headlines of the frame being processed (not on a notebook-level frame).
    overall_corpus = ' '.join(cleaned_df['headline_cleaned'])

    bpe_tokenizer.train_from_iterator([overall_corpus], bpe_trainer)
    wordpiece_tokenizer.train_from_iterator([overall_corpus], wordpiece_trainer)

    # For every headline, transpose the per-token (text, pos, lemma) triples into up to three
    # parallel tuples; a headline with no tokens left yields an empty list.
    if stopword_removed:
        text_props = [list(zip(*[(token.text, token.pos_, token.lemma_) for token in doc if not token.is_stop])) for doc in parsed_text]
        # NOTE(review): membership is tested on the WordNet lemma rather than on the raw token.
        # Confirm this is intended — lemmatization may alter a stopword so it no longer matches the list.
        cleaned_df['tokenized_text_nltk'] = cleaned_df['headline_cleaned'].apply(word_tokenize)\
                                                                            .apply(lambda x: [word for word in x if lemmatizer.lemmatize(word) not in nltk_stopwords])
    else:
        text_props = [list(zip(*[(token.text, token.pos_, token.lemma_) for token in doc])) for doc in parsed_text]
        cleaned_df['tokenized_text_nltk'] = cleaned_df['headline_cleaned'].apply(word_tokenize)

    # Empty props (no tokens survived) become NaN instead of raising on props[0].
    tokenized_text = [list(props[0]) if props else np.nan for props in text_props]
    pos_tagged_text = [list(zip(props[0], props[1])) if props else np.nan for props in text_props]
    lemmatized_text = [list(props[2]) if props else np.nan for props in text_props]

    cleaned_df['tokenized_text_spacy'] = tokenized_text
    cleaned_df['pos_tagged_text_spacy'] = pos_tagged_text
    cleaned_df['lemmatized_text_spacy'] = lemmatized_text

    cleaned_df['pos_tagged_text_nltk'] = cleaned_df['tokenized_text_nltk'].apply(universal_pos_tag)
    # Only adjectives, nouns, adverbs and verbs are stemmed / lemmatized; other tokens are dropped.
    cleaned_df['stemmed_text_nltk'] = cleaned_df['pos_tagged_text_nltk'].apply(
        lambda x: [stemmer.stem(word) for word, tag in x if tag.startswith(content_tags)])
    cleaned_df['lemmatized_text_nltk'] = cleaned_df['pos_tagged_text_nltk'].apply(
        lambda x: [lemmatizer.lemmatize(word) for word, tag in x if tag.startswith(content_tags)])

    # Tokenize each headline with its own trained subword tokenizer
    cleaned_df['bpe_tokens'] = cleaned_df['headline_cleaned'].apply(lambda x: bpe_tokenizer.encode(x).tokens)
    cleaned_df['wordpiece_tokens'] = cleaned_df['headline_cleaned'].apply(lambda x: wordpiece_tokenizer.encode(x).tokens)

    #Save processed data
    if stopword_removed:
        cleaned_df.to_csv('./data/preprocessed_stopwords_removed.csv', index=False)
    else:
        cleaned_df.to_csv('./data/preprocessed.csv', index=False)

    return cleaned_df

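Articles (rows) 5322, 7005 and 1561 ('i was, but now I am', 'you are enough', 'what's in a name?') are left with no spaCy tokens once stopwords are removed, so their spaCy columns are null (NaN) in the stopword-removed output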

In [None]:
# Build both variants of the preprocessed data. Each call also writes its result
# to a CSV file under ./data/ (see preprocess_data) and relies on the parsed_text
# cache computed in the cell above the function definition.
preprocessed_df_sw_removed = preprocess_data(cleaned_df, stopword_removed=True)
preprocessed_df = preprocess_data(cleaned_df, stopword_removed=False)

# Spacy Preprocessing
By Vaishnav (Pythonic optimizations added by Vig)

In [82]:
# Run every cleaned headline through the spaCy pipeline; the result is a Series of parsed docs.
parsed_text = cleaned_df['headline_cleaned'].apply(lambda x: nlp(x))

In [122]:
# For each parsed headline, collect (text, POS tag, lemma) per token and transpose the
# triples into three parallel tuples: texts, tags and lemmas.
text_props = [
    list(zip(*((tok.text, tok.pos_, tok.lemma_) for tok in doc)))
    for doc in parsed_text
]

## Tokenization

In [123]:
# props[0] holds the token texts of one headline; store them as a list per row.
tokenized_text = [list(props[0]) for props in text_props]
# Adds the column to cleaned_df in place.
cleaned_df['tokenized_text_spacy'] = tokenized_text
cleaned_df.head()

Unnamed: 0,article_link,headline,is_sarcastic,headline_cleaned,num_changes,tokenized_text_spacy
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sues over secret bl...,1,"(former, versace, store, clerk, sues, over, se..."
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catches up to our thorny ...,1,"(the, roseanne, revival, catches, up, to, our,..."
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting to fear sons web series closest t...,1,"(mom, starting, to, fear, sons, web, series, c..."
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner just wants wife to listen not come up ...,1,"(boehner, just, wants, wife, to, listen, not, ..."
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wishes snape happy birthday in the ...,1,"(jk, rowling, wishes, snape, happy, birthday, ..."


## POS Tagging

In [124]:
# Pair each token text (props[0]) with its spaCy POS tag (props[1]) as (text, tag) tuples.
pos_tagged_text = [list(zip(props[0], props[1])) for props in text_props]
# Adds the column to cleaned_df in place.
cleaned_df['pos_tagged_text_spacy'] = pos_tagged_text

In [125]:
# Preview the first rows, now including the pos_tagged_text_spacy column.
cleaned_df.head()

Unnamed: 0,article_link,headline,is_sarcastic,headline_cleaned,num_changes,tokenized_text_spacy,pos_tagged_text_spacy
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sues over secret bl...,1,"(former, versace, store, clerk, sues, over, se...","[(former, ADJ), (versace, NOUN), (store, NOUN)..."
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catches up to our thorny ...,1,"(the, roseanne, revival, catches, up, to, our,...","[(the, DET), (roseanne, PROPN), (revival, PROP..."
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting to fear sons web series closest t...,1,"(mom, starting, to, fear, sons, web, series, c...","[(mom, NOUN), (starting, VERB), (to, PART), (f..."
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner just wants wife to listen not come up ...,1,"(boehner, just, wants, wife, to, listen, not, ...","[(boehner, NOUN), (just, ADV), (wants, VERB), ..."
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wishes snape happy birthday in the ...,1,"(jk, rowling, wishes, snape, happy, birthday, ...","[(jk, PROPN), (rowling, PROPN), (wishes, VERB)..."


## Lemmatization

In [126]:
# props[2] holds the spaCy lemma of every token in one headline.
lemmatized_text = [list(props[2]) for props in text_props]
# Adds the column to cleaned_df in place.
cleaned_df['lemmatized_text_spacy'] = lemmatized_text

In [88]:
# Preview the first rows, now including the lemmatized_text_spacy column.
cleaned_df.head()

Unnamed: 0,article_link,headline,is_sarcastic,headline_cleaned,num_changes,tokenized_text_spacy,pos_tagged_text_spacy,lemmatized_text_spacy
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sues over secret bl...,1,"[former, versace, store, clerk, sues, over, se...","[(former, ADJ), (versace, NOUN), (store, NOUN)...","[former, versace, store, clerk, sue, over, sec..."
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catches up to our thorny ...,1,"[the, roseanne, revival, catches, up, to, our,...","[(the, DET), (roseanne, PROPN), (revival, PROP...","[the, roseanne, revival, catch, up, to, our, t..."
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting to fear sons web series closest t...,1,"[mom, starting, to, fear, sons, web, series, c...","[(mom, NOUN), (starting, VERB), (to, PART), (f...","[mom, start, to, fear, son, web, series, close..."
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner just wants wife to listen not come up ...,1,"[boehner, just, wants, wife, to, listen, not, ...","[(boehner, NOUN), (just, ADV), (wants, VERB), ...","[boehner, just, want, wife, to, listen, not, c..."
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wishes snape happy birthday in the ...,1,"[jk, rowling, wishes, snape, happy, birthday, ...","[(jk, PROPN), (rowling, PROPN), (wishes, VERB)...","[jk, rowling, wish, snape, happy, birthday, in..."


# NLTK Preprocessing
By Vig

## Tokenization
Uses NLTK's `word_tokenize`, which is backed by `NLTKWordTokenizer`

In [89]:
# Work on a copy so the NLTK columns do not modify cleaned_df.
tokenized_df = cleaned_df.copy()

# Split every cleaned headline into a list of word tokens.
tokenized_df['tokenized_text_nltk'] = tokenized_df['headline_cleaned'].apply(word_tokenize)

In [90]:
# Preview the first rows, now including the tokenized_text_nltk column.
tokenized_df.head()

Unnamed: 0,article_link,headline,is_sarcastic,headline_cleaned,num_changes,tokenized_text_spacy,pos_tagged_text_spacy,lemmatized_text_spacy,tokenized_text_nltk
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sues over secret bl...,1,"[former, versace, store, clerk, sues, over, se...","[(former, ADJ), (versace, NOUN), (store, NOUN)...","[former, versace, store, clerk, sue, over, sec...","[former, versace, store, clerk, sues, over, se..."
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catches up to our thorny ...,1,"[the, roseanne, revival, catches, up, to, our,...","[(the, DET), (roseanne, PROPN), (revival, PROP...","[the, roseanne, revival, catch, up, to, our, t...","[the, roseanne, revival, catches, up, to, our,..."
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting to fear sons web series closest t...,1,"[mom, starting, to, fear, sons, web, series, c...","[(mom, NOUN), (starting, VERB), (to, PART), (f...","[mom, start, to, fear, son, web, series, close...","[mom, starting, to, fear, sons, web, series, c..."
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner just wants wife to listen not come up ...,1,"[boehner, just, wants, wife, to, listen, not, ...","[(boehner, NOUN), (just, ADV), (wants, VERB), ...","[boehner, just, want, wife, to, listen, not, c...","[boehner, just, wants, wife, to, listen, not, ..."
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wishes snape happy birthday in the ...,1,"[jk, rowling, wishes, snape, happy, birthday, ...","[(jk, PROPN), (rowling, PROPN), (wishes, VERB)...","[jk, rowling, wish, snape, happy, birthday, in...","[jk, rowling, wishes, snape, happy, birthday, ..."


## POS Tagging

Uses [Universal POS Tags](https://universaldependencies.org/u/pos/)

In [94]:
# Work on a copy so tokenized_df stays as it was after the tokenization step.
pos_tagged_df = tokenized_df.copy()
#create pos tagger with universal tagset
universal_pos_tag = partial(pos_tag, tagset='universal')

# Tag each headline's token list; every row becomes a list of (token, tag) pairs.
pos_tagged_df['pos_tagged_text_nltk'] = pos_tagged_df['tokenized_text_nltk'].apply(universal_pos_tag)

In [95]:
# Preview the first rows, now including the pos_tagged_text_nltk column.
pos_tagged_df.head()

Unnamed: 0,article_link,headline,is_sarcastic,headline_cleaned,num_changes,tokenized_text_spacy,pos_tagged_text_spacy,lemmatized_text_spacy,tokenized_text_nltk,pos_tagged_text_nltk
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sues over secret bl...,1,"[former, versace, store, clerk, sues, over, se...","[(former, ADJ), (versace, NOUN), (store, NOUN)...","[former, versace, store, clerk, sue, over, sec...","[former, versace, store, clerk, sues, over, se...","[(former, ADJ), (versace, NOUN), (store, NOUN)..."
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catches up to our thorny ...,1,"[the, roseanne, revival, catches, up, to, our,...","[(the, DET), (roseanne, PROPN), (revival, PROP...","[the, roseanne, revival, catch, up, to, our, t...","[the, roseanne, revival, catches, up, to, our,...","[(the, DET), (roseanne, NOUN), (revival, NOUN)..."
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting to fear sons web series closest t...,1,"[mom, starting, to, fear, sons, web, series, c...","[(mom, NOUN), (starting, VERB), (to, PART), (f...","[mom, start, to, fear, son, web, series, close...","[mom, starting, to, fear, sons, web, series, c...","[(mom, NOUN), (starting, VERB), (to, PRT), (fe..."
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner just wants wife to listen not come up ...,1,"[boehner, just, wants, wife, to, listen, not, ...","[(boehner, NOUN), (just, ADV), (wants, VERB), ...","[boehner, just, want, wife, to, listen, not, c...","[boehner, just, wants, wife, to, listen, not, ...","[(boehner, NOUN), (just, ADV), (wants, VERB), ..."
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wishes snape happy birthday in the ...,1,"[jk, rowling, wishes, snape, happy, birthday, ...","[(jk, PROPN), (rowling, PROPN), (wishes, VERB)...","[jk, rowling, wish, snape, happy, birthday, in...","[jk, rowling, wishes, snape, happy, birthday, ...","[(jk, NOUN), (rowling, VERB), (wishes, NOUN), ..."


## Stemming
Uses a PorterStemmer

In [96]:
stemmer = PorterStemmer()
stemmed_df = pos_tagged_df.copy()

# Stem only the tokens tagged as adjectives, nouns, adverbs or verbs;
# tokens carrying any other tag are dropped from the stemmed output.
stemmed_df['stemmed_text_nltk'] = stemmed_df['pos_tagged_text_nltk'].apply(
    lambda tagged: [
        stemmer.stem(token)
        for token, pos in tagged
        if pos.startswith(('ADJ', 'NOUN', 'ADV', 'VERB'))
    ]
)

In [97]:
# Preview the first rows, now including the stemmed_text_nltk column.
stemmed_df.head()

Unnamed: 0,article_link,headline,is_sarcastic,headline_cleaned,num_changes,tokenized_text_spacy,pos_tagged_text_spacy,lemmatized_text_spacy,tokenized_text_nltk,pos_tagged_text_nltk,stemmed_text_nltk
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sues over secret bl...,1,"[former, versace, store, clerk, sues, over, se...","[(former, ADJ), (versace, NOUN), (store, NOUN)...","[former, versace, store, clerk, sue, over, sec...","[former, versace, store, clerk, sues, over, se...","[(former, ADJ), (versace, NOUN), (store, NOUN)...","[former, versac, store, clerk, sue, secret, bl..."
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catches up to our thorny ...,1,"[the, roseanne, revival, catches, up, to, our,...","[(the, DET), (roseanne, PROPN), (revival, PROP...","[the, roseanne, revival, catch, up, to, our, t...","[the, roseanne, revival, catches, up, to, our,...","[(the, DET), (roseanne, NOUN), (revival, NOUN)...","[roseann, reviv, catch, thorni, polit, mood, b..."
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting to fear sons web series closest t...,1,"[mom, starting, to, fear, sons, web, series, c...","[(mom, NOUN), (starting, VERB), (to, PART), (f...","[mom, start, to, fear, son, web, series, close...","[mom, starting, to, fear, sons, web, series, c...","[(mom, NOUN), (starting, VERB), (to, PRT), (fe...","[mom, start, fear, son, web, seri, closest, th..."
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner just wants wife to listen not come up ...,1,"[boehner, just, wants, wife, to, listen, not, ...","[(boehner, NOUN), (just, ADV), (wants, VERB), ...","[boehner, just, want, wife, to, listen, not, c...","[boehner, just, wants, wife, to, listen, not, ...","[(boehner, NOUN), (just, ADV), (wants, VERB), ...","[boehner, just, want, wife, listen, not, come,..."
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wishes snape happy birthday in the ...,1,"[jk, rowling, wishes, snape, happy, birthday, ...","[(jk, PROPN), (rowling, PROPN), (wishes, VERB)...","[jk, rowling, wish, snape, happy, birthday, in...","[jk, rowling, wishes, snape, happy, birthday, ...","[(jk, NOUN), (rowling, VERB), (wishes, NOUN), ...","[jk, rowl, wish, snape, happi, birthday, most,..."


## Lemmatization

In [98]:
lemmatizer = WordNetLemmatizer()
lemmatized_df = stemmed_df.copy()

# Lemmatize only the tokens tagged as adjectives, nouns, adverbs or verbs;
# tokens carrying any other tag are dropped from the lemmatized output.
lemmatized_df['lemmatized_text_nltk'] = lemmatized_df['pos_tagged_text_nltk'].apply(
    lambda tagged: [
        lemmatizer.lemmatize(token)
        for token, pos in tagged
        if pos.startswith(('ADJ', 'NOUN', 'ADV', 'VERB'))
    ]
)

In [99]:
# Preview the first rows, now including the lemmatized_text_nltk column.
lemmatized_df.head()

Unnamed: 0,article_link,headline,is_sarcastic,headline_cleaned,num_changes,tokenized_text_spacy,pos_tagged_text_spacy,lemmatized_text_spacy,tokenized_text_nltk,pos_tagged_text_nltk,stemmed_text_nltk,lemmatized_text_nltk
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sues over secret bl...,1,"[former, versace, store, clerk, sues, over, se...","[(former, ADJ), (versace, NOUN), (store, NOUN)...","[former, versace, store, clerk, sue, over, sec...","[former, versace, store, clerk, sues, over, se...","[(former, ADJ), (versace, NOUN), (store, NOUN)...","[former, versac, store, clerk, sue, secret, bl...","[former, versace, store, clerk, sue, secret, b..."
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catches up to our thorny ...,1,"[the, roseanne, revival, catches, up, to, our,...","[(the, DET), (roseanne, PROPN), (revival, PROP...","[the, roseanne, revival, catch, up, to, our, t...","[the, roseanne, revival, catches, up, to, our,...","[(the, DET), (roseanne, NOUN), (revival, NOUN)...","[roseann, reviv, catch, thorni, polit, mood, b...","[roseanne, revival, catch, thorny, political, ..."
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting to fear sons web series closest t...,1,"[mom, starting, to, fear, sons, web, series, c...","[(mom, NOUN), (starting, VERB), (to, PART), (f...","[mom, start, to, fear, son, web, series, close...","[mom, starting, to, fear, sons, web, series, c...","[(mom, NOUN), (starting, VERB), (to, PRT), (fe...","[mom, start, fear, son, web, seri, closest, th...","[mom, starting, fear, son, web, series, closes..."
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner just wants wife to listen not come up ...,1,"[boehner, just, wants, wife, to, listen, not, ...","[(boehner, NOUN), (just, ADV), (wants, VERB), ...","[boehner, just, want, wife, to, listen, not, c...","[boehner, just, wants, wife, to, listen, not, ...","[(boehner, NOUN), (just, ADV), (wants, VERB), ...","[boehner, just, want, wife, listen, not, come,...","[boehner, just, want, wife, listen, not, come,..."
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wishes snape happy birthday in the ...,1,"[jk, rowling, wishes, snape, happy, birthday, ...","[(jk, PROPN), (rowling, PROPN), (wishes, VERB)...","[jk, rowling, wish, snape, happy, birthday, in...","[jk, rowling, wishes, snape, happy, birthday, ...","[(jk, NOUN), (rowling, VERB), (wishes, NOUN), ...","[jk, rowl, wish, snape, happi, birthday, most,...","[jk, rowling, wish, snape, happy, birthday, mo..."


In [101]:
# Start the final frame from the fully NLTK/spaCy-processed data; the tokenizer
# cells below add their columns to this copy.
preprocessed_df = lemmatized_df.copy()
preprocessed_df.head()

Unnamed: 0,article_link,headline,is_sarcastic,headline_cleaned,num_changes,tokenized_text_spacy,pos_tagged_text_spacy,lemmatized_text_spacy,tokenized_text_nltk,pos_tagged_text_nltk,stemmed_text_nltk,lemmatized_text_nltk
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sues over secret bl...,1,"[former, versace, store, clerk, sues, over, se...","[(former, ADJ), (versace, NOUN), (store, NOUN)...","[former, versace, store, clerk, sue, over, sec...","[former, versace, store, clerk, sues, over, se...","[(former, ADJ), (versace, NOUN), (store, NOUN)...","[former, versac, store, clerk, sue, secret, bl...","[former, versace, store, clerk, sue, secret, b..."
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catches up to our thorny ...,1,"[the, roseanne, revival, catches, up, to, our,...","[(the, DET), (roseanne, PROPN), (revival, PROP...","[the, roseanne, revival, catch, up, to, our, t...","[the, roseanne, revival, catches, up, to, our,...","[(the, DET), (roseanne, NOUN), (revival, NOUN)...","[roseann, reviv, catch, thorni, polit, mood, b...","[roseanne, revival, catch, thorny, political, ..."
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting to fear sons web series closest t...,1,"[mom, starting, to, fear, sons, web, series, c...","[(mom, NOUN), (starting, VERB), (to, PART), (f...","[mom, start, to, fear, son, web, series, close...","[mom, starting, to, fear, sons, web, series, c...","[(mom, NOUN), (starting, VERB), (to, PRT), (fe...","[mom, start, fear, son, web, seri, closest, th...","[mom, starting, fear, son, web, series, closes..."
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner just wants wife to listen not come up ...,1,"[boehner, just, wants, wife, to, listen, not, ...","[(boehner, NOUN), (just, ADV), (wants, VERB), ...","[boehner, just, want, wife, to, listen, not, c...","[boehner, just, wants, wife, to, listen, not, ...","[(boehner, NOUN), (just, ADV), (wants, VERB), ...","[boehner, just, want, wife, listen, not, come,...","[boehner, just, want, wife, listen, not, come,..."
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wishes snape happy birthday in the ...,1,"[jk, rowling, wishes, snape, happy, birthday, ...","[(jk, PROPN), (rowling, PROPN), (wishes, VERB)...","[jk, rowling, wish, snape, happy, birthday, in...","[jk, rowling, wishes, snape, happy, birthday, ...","[(jk, NOUN), (rowling, VERB), (wishes, NOUN), ...","[jk, rowl, wish, snape, happi, birthday, most,...","[jk, rowling, wish, snape, happy, birthday, mo..."


# Tokenizers (BPE, WordPiece)
By Vaishnav and Vig (BPE and WordPiece respectively)

## BPE Tokenization

In [105]:
# Join every cleaned headline into one training string
# (the WordPiece cell below trains on the same overall_corpus).
overall_corpus = ' '.join(preprocessed_df['headline_cleaned'])
# Initialize a BPE tokenizer
tokenizer = Tokenizer(models.BPE())

# Set pre-tokenization rules (whitespace-based)
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Train the tokenizer on the corpus (vocab_size=1000, min_frequency=2, four special tokens)
trainer = trainers.BpeTrainer(vocab_size=1000, min_frequency=2, special_tokens=["<unk>", "<pad>", "<s>", "</s>"])
tokenizer.train_from_iterator([overall_corpus], trainer)

# Tokenize each headline and keep the token strings
preprocessed_df['bpe_tokens'] = preprocessed_df['headline_cleaned'].apply(lambda x: tokenizer.encode(x).tokens)

preprocessed_df.head()






Unnamed: 0,article_link,headline,is_sarcastic,headline_cleaned,num_changes,tokenized_text_spacy,pos_tagged_text_spacy,lemmatized_text_spacy,tokenized_text_nltk,pos_tagged_text_nltk,stemmed_text_nltk,lemmatized_text_nltk,bpe_tokens
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sues over secret bl...,1,"[former, versace, store, clerk, sues, over, se...","[(former, ADJ), (versace, NOUN), (store, NOUN)...","[former, versace, store, clerk, sue, over, sec...","[former, versace, store, clerk, sues, over, se...","[(former, ADJ), (versace, NOUN), (store, NOUN)...","[former, versac, store, clerk, sue, secret, bl...","[former, versace, store, clerk, sue, secret, b...","[for, mer, vers, ace, st, ore, cl, er, k, su, ..."
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catches up to our thorny ...,1,"[the, roseanne, revival, catches, up, to, our,...","[(the, DET), (roseanne, PROPN), (revival, PROP...","[the, roseanne, revival, catch, up, to, our, t...","[the, roseanne, revival, catches, up, to, our,...","[(the, DET), (roseanne, NOUN), (revival, NOUN)...","[roseann, reviv, catch, thorni, polit, mood, b...","[roseanne, revival, catch, thorny, political, ...","[the, ro, se, an, ne, re, v, iv, al, cat, ch, ..."
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting to fear sons web series closest t...,1,"[mom, starting, to, fear, sons, web, series, c...","[(mom, NOUN), (starting, VERB), (to, PART), (f...","[mom, start, to, fear, son, web, series, close...","[mom, starting, to, fear, sons, web, series, c...","[(mom, NOUN), (starting, VERB), (to, PRT), (fe...","[mom, start, fear, son, web, seri, closest, th...","[mom, starting, fear, son, web, series, closes...","[mom, start, ing, to, fe, ar, s, ons, we, b, s..."
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner just wants wife to listen not come up ...,1,"[boehner, just, wants, wife, to, listen, not, ...","[(boehner, NOUN), (just, ADV), (wants, VERB), ...","[boehner, just, want, wife, to, listen, not, c...","[boehner, just, wants, wife, to, listen, not, ...","[(boehner, NOUN), (just, ADV), (wants, VERB), ...","[boehner, just, want, wife, listen, not, come,...","[boehner, just, want, wife, listen, not, come,...","[bo, e, h, ner, just, wants, wi, fe, to, list,..."
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wishes snape happy birthday in the ...,1,"[jk, rowling, wishes, snape, happy, birthday, ...","[(jk, PROPN), (rowling, PROPN), (wishes, VERB)...","[jk, rowling, wish, snape, happy, birthday, in...","[jk, rowling, wishes, snape, happy, birthday, ...","[(jk, NOUN), (rowling, VERB), (wishes, NOUN), ...","[jk, rowl, wish, snape, happi, birthday, most,...","[jk, rowling, wish, snape, happy, birthday, mo...","[j, k, r, ow, ling, w, ish, es, sn, ap, e, hap..."


In [107]:
# Initialize a WordPiece tokenizer
tokenizer = Tokenizer(models.WordPiece())

# Set pre-tokenization rules (whitespace-based)
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Train the tokenizer on the corpus
trainer = trainers.WordPieceTrainer(vocab_size=1000, min_frequency=2, continuing_subword_prefix='_', special_tokens=["<unk>", "<pad>", "<s>", "</s>"])
tokenizer.train_from_iterator([overall_corpus], trainer)

# Tokenize each headline
preprocessed_df['wordpiece_tokens'] = preprocessed_df['headline_cleaned'].apply(lambda x: tokenizer.encode(x).tokens)

preprocessed_df.head()






Unnamed: 0,article_link,headline,is_sarcastic,headline_cleaned,num_changes,tokenized_text_spacy,pos_tagged_text_spacy,lemmatized_text_spacy,tokenized_text_nltk,pos_tagged_text_nltk,stemmed_text_nltk,lemmatized_text_nltk,bpe_tokens,wordpiece_tokens
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sues over secret bl...,1,"[former, versace, store, clerk, sues, over, se...","[(former, ADJ), (versace, NOUN), (store, NOUN)...","[former, versace, store, clerk, sue, over, sec...","[former, versace, store, clerk, sues, over, se...","[(former, ADJ), (versace, NOUN), (store, NOUN)...","[former, versac, store, clerk, sue, secret, bl...","[former, versace, store, clerk, sue, secret, b...","[for, mer, vers, ace, st, ore, cl, er, k, su, ...","[for, _mer, v, _ers, _ace, st, _ore, cle, _r, ..."
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catches up to our thorny ...,1,"[the, roseanne, revival, catches, up, to, our,...","[(the, DET), (roseanne, PROPN), (revival, PROP...","[the, roseanne, revival, catch, up, to, our, t...","[the, roseanne, revival, catches, up, to, our,...","[(the, DET), (roseanne, NOUN), (revival, NOUN)...","[roseann, reviv, catch, thorni, polit, mood, b...","[roseanne, revival, catch, thorny, political, ...","[the, ro, se, an, ne, re, v, iv, al, cat, ch, ...","[the, r, _ose, _an, _n, _e, rev, _iv, _al, c, ..."
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting to fear sons web series closest t...,1,"[mom, starting, to, fear, sons, web, series, c...","[(mom, NOUN), (starting, VERB), (to, PART), (f...","[mom, start, to, fear, son, web, series, close...","[mom, starting, to, fear, sons, web, series, c...","[(mom, NOUN), (starting, VERB), (to, PRT), (fe...","[mom, start, fear, son, web, seri, closest, th...","[mom, starting, fear, son, web, series, closes...","[mom, start, ing, to, fe, ar, s, ons, we, b, s...","[mom, start, _ing, to, fe, _ar, so, _ns, we, _..."
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner just wants wife to listen not come up ...,1,"[boehner, just, wants, wife, to, listen, not, ...","[(boehner, NOUN), (just, ADV), (wants, VERB), ...","[boehner, just, want, wife, to, listen, not, c...","[boehner, just, wants, wife, to, listen, not, ...","[(boehner, NOUN), (just, ADV), (wants, VERB), ...","[boehner, just, want, wife, listen, not, come,...","[boehner, just, want, wife, listen, not, come,...","[bo, e, h, ner, just, wants, wi, fe, to, list,...","[bo, _e, _hn, _er, just, want, _s, w, _ife, to..."
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wishes snape happy birthday in the ...,1,"[jk, rowling, wishes, snape, happy, birthday, ...","[(jk, PROPN), (rowling, PROPN), (wishes, VERB)...","[jk, rowling, wish, snape, happy, birthday, in...","[jk, rowling, wishes, snape, happy, birthday, ...","[(jk, NOUN), (rowling, VERB), (wishes, NOUN), ...","[jk, rowl, wish, snape, happi, birthday, most,...","[jk, rowling, wish, snape, happy, birthday, mo...","[j, k, r, ow, ling, w, ish, es, sn, ap, e, hap...","[j, _k, r, _ow, _ling, w, _ish, _es, sn, _ap, ..."


# Save Processed Data

In [100]:
# Take an independent (deep) copy of the lemmatized frame under the name that
# the CSV export cell below writes out.
# NOTE(review): when the notebook is run top to bottom, this rebinds
# preprocessed_df, so the bpe_tokens / wordpiece_tokens columns added by the
# subword-tokenizer cells above are absent from the frame that gets saved
# (compare the two head() outputs). Confirm that dropping them is intended.
preprocessed_df = lemmatized_df.copy(deep=True)
preprocessed_df.head()

Unnamed: 0,article_link,headline,is_sarcastic,headline_cleaned,num_changes,tokenized_text_spacy,pos_tagged_text_spacy,lemmatized_text_spacy,tokenized_text_nltk,pos_tagged_text_nltk,stemmed_text_nltk,lemmatized_text_nltk
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sues over secret bl...,1,"[former, versace, store, clerk, sues, over, se...","[(former, ADJ), (versace, NOUN), (store, NOUN)...","[former, versace, store, clerk, sue, over, sec...","[former, versace, store, clerk, sues, over, se...","[(former, ADJ), (versace, NOUN), (store, NOUN)...","[former, versac, store, clerk, sue, secret, bl...","[former, versace, store, clerk, sue, secret, b..."
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catches up to our thorny ...,1,"[the, roseanne, revival, catches, up, to, our,...","[(the, DET), (roseanne, PROPN), (revival, PROP...","[the, roseanne, revival, catch, up, to, our, t...","[the, roseanne, revival, catches, up, to, our,...","[(the, DET), (roseanne, NOUN), (revival, NOUN)...","[roseann, reviv, catch, thorni, polit, mood, b...","[roseanne, revival, catch, thorny, political, ..."
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting to fear sons web series closest t...,1,"[mom, starting, to, fear, sons, web, series, c...","[(mom, NOUN), (starting, VERB), (to, PART), (f...","[mom, start, to, fear, son, web, series, close...","[mom, starting, to, fear, sons, web, series, c...","[(mom, NOUN), (starting, VERB), (to, PRT), (fe...","[mom, start, fear, son, web, seri, closest, th...","[mom, starting, fear, son, web, series, closes..."
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner just wants wife to listen not come up ...,1,"[boehner, just, wants, wife, to, listen, not, ...","[(boehner, NOUN), (just, ADV), (wants, VERB), ...","[boehner, just, want, wife, to, listen, not, c...","[boehner, just, wants, wife, to, listen, not, ...","[(boehner, NOUN), (just, ADV), (wants, VERB), ...","[boehner, just, want, wife, listen, not, come,...","[boehner, just, want, wife, listen, not, come,..."
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wishes snape happy birthday in the ...,1,"[jk, rowling, wishes, snape, happy, birthday, ...","[(jk, PROPN), (rowling, PROPN), (wishes, VERB)...","[jk, rowling, wish, snape, happy, birthday, in...","[jk, rowling, wishes, snape, happy, birthday, ...","[(jk, NOUN), (rowling, VERB), (wishes, NOUN), ...","[jk, rowl, wish, snape, happi, birthday, most,...","[jk, rowling, wish, snape, happy, birthday, mo..."


In [54]:
# Persist the preprocessed frame as CSV.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout without
# ./intermediate_files fails here with OSError.
from pathlib import Path

output_path = Path('./intermediate_files/preprocessed.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame index out of the file.
# NOTE(review): the token / POS-tag columns appear to hold Python lists; CSV
# stores them as their string repr, so readers have to parse them back --
# confirm downstream handling.
preprocessed_df.to_csv(output_path, index=False)