For BERT and DISTILBERT, we only need to do basic preprocessing of the text data

In [18]:
import re
import numpy as np
import pandas as pd
import contractions
from sklearn.model_selection import train_test_split

In [11]:
df = pd.read_csv('data/dataset.csv')

In [12]:
HTML_TAG_PATTERN = re.compile(r'<[^>]+>', re.IGNORECASE)

def remove_html(data):
    data = HTML_TAG_PATTERN.sub('', data)
    return data


def remove_url(data):
    url_clean= re.compile(r"https://\S+|www\.\S+")
    data=url_clean.sub(r'',data)
    return data


def remove_emoji(data):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"  # additional symbols
                               u"\U000024C2-\U0001F251"  # additional symbols
                               u"\U0001F900-\U0001F9FF"  # supplementary symbols and pictographs
                               u"\U0001FA00-\U0001FA6F"  # chess symbols, extended pictographs
                               u"\U0001FA70-\U0001FAFF"  # more extended pictographs
                               u"\U00002600-\U000026FF"  # miscellaneous symbols
                               u"\U00002700-\U000027BF"  # dingbats
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U000E0020-\U000E007F"  # Tag characters for emoji
                               "]+", flags=re.UNICODE)
    data = emoji_pattern.sub(r'', data)
    return data


def lowercase(text):
  word_list = str(text).split(' ')
  return ' '.join([word.lower() for word in word_list])

In [13]:
df['cleaned_text'] = df['text'].apply(lambda z: remove_html(z))
df['cleaned_text'] = df['cleaned_text'].apply(lambda z: remove_url(z))
df['cleaned_text'] = df['cleaned_text'].apply(lambda z: remove_emoji(z))
df['cleaned_text'] = df['cleaned_text'].apply(lambda z: lowercase(z))

# remove contractions
df['cleaned_text'] = df['cleaned_text'].apply(contractions.fix)

In [14]:
df.head()

Unnamed: 0,text,humor,cleaned_text
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False,"joe biden rules out 2020 bid: 'guys, i am not ..."
1,Watch: darvish gave hitter whiplash with slow ...,False,watch: darvish gave hitter whiplash with slow ...
2,What do you call a turtle without its shell? d...,True,what do you call a turtle without its she will...
3,5 reasons the 2016 election feels so personal,False,5 reasons the 2016 election feels so personal
4,"Pasco police shot mexican migrant from behind,...",False,"pasco police shot mexican migrant from behind,..."


In [15]:
df['text'] = df['cleaned_text']
df.head()

Unnamed: 0,text,humor,cleaned_text
0,"joe biden rules out 2020 bid: 'guys, i am not ...",False,"joe biden rules out 2020 bid: 'guys, i am not ..."
1,watch: darvish gave hitter whiplash with slow ...,False,watch: darvish gave hitter whiplash with slow ...
2,what do you call a turtle without its she will...,True,what do you call a turtle without its she will...
3,5 reasons the 2016 election feels so personal,False,5 reasons the 2016 election feels so personal
4,"pasco police shot mexican migrant from behind,...",False,"pasco police shot mexican migrant from behind,..."


In [16]:
df = df.drop(columns=['cleaned_text'])

In [17]:
df.head()

Unnamed: 0,text,humor
0,"joe biden rules out 2020 bid: 'guys, i am not ...",False
1,watch: darvish gave hitter whiplash with slow ...,False
2,what do you call a turtle without its she will...,True
3,5 reasons the 2016 election feels so personal,False
4,"pasco police shot mexican migrant from behind,...",False


Train test split of data

In [19]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['humor'], test_size=0.2, random_state=42, stratify=df['humor'])

In [20]:
train_split = pd.concat([pd.DataFrame(X_train), pd.DataFrame(y_train)], axis=1)
train_split.reset_index(inplace=True)
train_split = train_split.rename(columns={0: 'text', 1: 'humor'})
train_split.drop(columns=['index'], inplace=True)
train_split.head()

Unnamed: 0,text,humor
0,watch this swimmer disappear into winter storm...,False
1,"they laughed at reagan, too: trump's ideas wil...",False
2,"hey, are you cold? go over to the corner, it i...",True
3,cannot get a standing desk? these are almost a...,False
4,want to hear a joke about my penis? never mind...,True


In [21]:
train_split['humor'].value_counts()

humor
False    80000
True     80000
Name: count, dtype: int64

In [22]:
test_split = pd.concat([pd.DataFrame(X_test), pd.DataFrame(y_test)], axis=1)
test_split.reset_index(inplace=True)
test_split = test_split.rename(columns={0: 'text', 1: 'humor'})
test_split.drop(columns=['index'], inplace=True)
test_split.head()

Unnamed: 0,text,humor
0,thought up a reddit joke today. when is a tria...,True
1,how much do pirates pay for corn? a buck an ear!,True
2,hillary clinton sent her book to every gop can...,False
3,italian unions lambast new museum boss for wor...,False
4,life below the ocean’s surface wholly depends ...,False


In [23]:
test_split['humor'].value_counts()

humor
True     20000
False    20000
Name: count, dtype: int64

Export the dataframes as csv files

In [24]:
train_split.to_csv("data/DL/DL_train.csv", index = False)
test_split.to_csv("data/DL/DL_test.csv", index = False)