## Data Preprocessing

In [1]:
import pandas as pd

In [2]:
# Load the cleaned headlines CSV from the intermediate_files directory.
# A forward slash is used as the separator: it resolves on every OS and avoids
# the invalid "\s" escape sequence that the backslash-separated literal held.
df = pd.read_csv('intermediate_files/sarcasm_headlines_cleaned.csv')
df.head()

Unnamed: 0,article_link,headline,is_sarcastic,headline_cleaned,num_changes
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sues over secret bl...,1
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catches up to our thorny ...,1
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting to fear sons web series closest t...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner just wants wife to listen not come up ...,1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wishes snape happy birthday in the ...,1


In [10]:
import spacy

# Name of the spaCy pipeline package loaded into the shared `nlp` object that
# the lemmatization, POS-tagging and stop-word cells call.
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)


In [17]:
# Lemmatize each cleaned headline and store the space-joined lemmas.
# nlp.pipe() feeds the column through spaCy in batches instead of issuing one
# full pipeline call per row via Series.apply. Docs are yielded in input
# order, so the resulting list lines up positionally with df's rows.
df['lemmatized_headline'] = [
    ' '.join(token.lemma_ for token in doc)
    for doc in nlp.pipe(df['headline_cleaned'])
]
df.head()

Unnamed: 0,article_link,headline,is_sarcastic,headline_cleaned,num_changes,lemmatized_headline
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sues over secret bl...,1,former versace store clerk sue over secret bla...
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catches up to our thorny ...,1,the roseanne revival catch up to our thorny po...
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting to fear sons web series closest t...,1,mom start to fear son web series close thing s...
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner just wants wife to listen not come up ...,1,boehner just want wife to listen not come up w...
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wishes snape happy birthday in the ...,1,jk rowling wish snape happy birthday in the mo...


In [18]:
# Part of Speech Tagging
# Each headline becomes a list of (token text, coarse POS tag) tuples.
# nlp.pipe() batches the texts rather than calling nlp(x) once per row; docs
# come back in input order, so the Series is rebuilt on df's own index to keep
# the list-valued column aligned with its source rows.
df['pos_tags'] = pd.Series(
    [
        [(token.text, token.pos_) for token in doc]
        for doc in nlp.pipe(df['headline_cleaned'])
    ],
    index=df.index,
)
df.head()

Unnamed: 0,article_link,headline,is_sarcastic,headline_cleaned,num_changes,lemmatized_headline,pos_tags
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sues over secret bl...,1,former versace store clerk sue over secret bla...,"[(former, ADJ), (versace, NOUN), (store, NOUN)..."
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catches up to our thorny ...,1,the roseanne revival catch up to our thorny po...,"[(the, DET), (roseanne, PROPN), (revival, PROP..."
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting to fear sons web series closest t...,1,mom start to fear son web series close thing s...,"[(mom, NOUN), (starting, VERB), (to, PART), (f..."
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner just wants wife to listen not come up ...,1,boehner just want wife to listen not come up w...,"[(boehner, NOUN), (just, ADV), (wants, VERB), ..."
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wishes snape happy birthday in the ...,1,jk rowling wish snape happy birthday in the mo...,"[(jk, PROPN), (rowling, PROPN), (wishes, VERB)..."


In [None]:
# Stop word removal
# Only tokenization and the lexical `is_stop` flag are needed here, so each
# headline goes through the tokenizer alone (nlp.make_doc) instead of the
# complete pipeline that nlp(x) would run for every row.
# NOTE(review): assumes no pipeline component re-tokenizes the text; confirm
# if custom components are ever added to `nlp`.
df['stopwords_removed'] = df['headline_cleaned'].apply(
    lambda text: ' '.join(
        token.text for token in nlp.make_doc(text) if not token.is_stop
    )
)
df.head()

Unnamed: 0,article_link,headline,is_sarcastic,headline_cleaned,num_changes,lemmatized_headline,pos_tags,stopwords_removed
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sues over secret bl...,1,former versace store clerk sue over secret bla...,"[(former, ADJ), (versace, NOUN), (store, NOUN)...",versace store clerk sues secret black code min...
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catches up to our thorny ...,1,the roseanne revival catch up to our thorny po...,"[(the, DET), (roseanne, PROPN), (revival, PROP...",roseanne revival catches thorny political mood...
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting to fear sons web series closest t...,1,mom start to fear son web series close thing s...,"[(mom, NOUN), (starting, VERB), (to, PART), (f...",mom starting fear sons web series closest thin...
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner just wants wife to listen not come up ...,1,boehner just want wife to listen not come up w...,"[(boehner, NOUN), (just, ADV), (wants, VERB), ...",boehner wants wife listen come alternative deb...
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wishes snape happy birthday in the ...,1,jk rowling wish snape happy birthday in the mo...,"[(jk, PROPN), (rowling, PROPN), (wishes, VERB)...",jk rowling wishes snape happy birthday magical...


In [4]:
# BPE Tokenization
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

# The whole corpus is concatenated into a single training document.
overall_corpus = ' '.join(df['headline_cleaned'])

# Initialize a BPE tokenizer. unk_token is registered on the model and matches
# the "<unk>" special token reserved by the trainer below, so symbols missing
# from the learned vocabulary map to it at encode time instead of being
# silently skipped.
tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

# Set pre-tokenization rules (whitespace-based) so merges never cross word
# boundaries.
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Train the tokenizer on the corpus: at most 1000 vocabulary entries, and a
# pair must occur at least twice before it is merged.
trainer = trainers.BpeTrainer(
    vocab_size=1000,
    min_frequency=2,
    special_tokens=["<unk>", "<pad>", "<s>", "</s>"],
)
tokenizer.train_from_iterator([overall_corpus], trainer)

# Tokenize each headline into its list of BPE sub-word strings.
df['bpe_tokens'] = df['headline_cleaned'].apply(lambda x: tokenizer.encode(x).tokens)


In [5]:
# Preview the first five rows of the frame, including the bpe_tokens column.
df.head(n=5)

Unnamed: 0,article_link,headline,is_sarcastic,headline_cleaned,num_changes,lemmatized_headline,pos_tags,stopwords_removed,bpe_tokens
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sues over secret bl...,1,former versace store clerk sue over secret bla...,"[('former', 'ADJ'), ('versace', 'NOUN'), ('sto...",versace store clerk sues secret black code min...,"[for, mer, vers, ace, st, ore, cl, er, k, su, ..."
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catches up to our thorny ...,1,the roseanne revival catch up to our thorny po...,"[('the', 'DET'), ('roseanne', 'PROPN'), ('revi...",roseanne revival catches thorny political mood...,"[the, ro, se, an, ne, re, v, iv, al, cat, ch, ..."
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting to fear sons web series closest t...,1,mom start to fear son web series close thing s...,"[('mom', 'NOUN'), ('starting', 'VERB'), ('to',...",mom starting fear sons web series closest thin...,"[mom, start, ing, to, fe, ar, s, ons, we, b, s..."
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner just wants wife to listen not come up ...,1,boehner just want wife to listen not come up w...,"[('boehner', 'NOUN'), ('just', 'ADV'), ('wants...",boehner wants wife listen come alternative deb...,"[bo, e, h, ner, just, wants, wi, fe, to, list,..."
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wishes snape happy birthday in the ...,1,jk rowling wish snape happy birthday in the mo...,"[('jk', 'PROPN'), ('rowling', 'PROPN'), ('wish...",jk rowling wishes snape happy birthday magical...,"[j, k, r, ow, ling, w, ish, es, sn, ap, e, hap..."


In [6]:
# Write the enriched frame to disk; index=False leaves the row index out of
# the file.
# NOTE(review): CSV stores list-valued columns (pos_tags, bpe_tokens) as text,
# so a reader will presumably need to parse them back into lists; confirm
# against downstream consumers.
output_path = 'intermediate_files/spacy_preprocessing.csv'
df.to_csv(output_path, index=False)