## Cleaning the text data for machine learning models

1. lowercase text
2. removal of punctuation marks (only for ML)
3. removal of unicode characters
4. tokenization
5. stopword removal
6. lemmatization

In [1]:
import re
from collections import Counter
from pathlib import Path

# NOTE(review): the wildcard import is kept for compatibility; it is placed first so
# that the explicit imports below win on any name clash.
from numpy import *

import contractions
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud, STOPWORDS


# Download the NLTK resources used by this notebook: the stop-word list and the
# WordNet data for the lemmatizer ('punkt' is fetched as well).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Helper functions that remove HTML tags, URLs and emoji (Unicode symbols), and lowercase the text

In [2]:
# Matches anything of the form <...> with at least one character between the brackets.
HTML_TAG_PATTERN = re.compile(r'<[^>]+>', re.IGNORECASE)


def remove_html(data):
    """Return ``data`` with every ``<...>`` tag-like span deleted.

    Only the tags themselves are removed; the text between an opening and a
    closing tag is kept.
    """
    return HTML_TAG_PATTERN.sub('', data)


# Matches http:// and https:// links as well as bare www. addresses, in any letter case.
# Compiled once at definition time, in line with HTML_TAG_PATTERN.
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)


def remove_url(data):
    """Return ``data`` with URLs removed.

    A URL is any run of non-whitespace characters starting with ``http://``,
    ``https://`` or ``www.``. Matching is case-insensitive because this step
    runs before the text is lowercased. Surrounding whitespace is left as is.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        The text without the matched URLs.
    """
    return URL_PATTERN.sub('', data)


def remove_emoji(data):
    """Return ``data`` with every character in the listed Unicode ranges deleted.

    NOTE(review): the range U+24C2 to U+1F251 is very wide and also covers
    non-emoji characters (for example CJK ideographs) - confirm this is intended.
    """
    unicode_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",  # additional symbols
        u"\U000024C2-\U0001F251",  # additional symbols
        u"\U0001F900-\U0001F9FF",  # supplementary symbols and pictographs
        u"\U0001FA00-\U0001FA6F",  # chess symbols, extended pictographs
        u"\U0001FA70-\U0001FAFF",  # more extended pictographs
        u"\U00002600-\U000026FF",  # miscellaneous symbols
        u"\U00002700-\U000027BF",  # dingbats
        u"\U0001F700-\U0001F77F",  # alchemical symbols
        u"\U0001F780-\U0001F7FF",  # Geometric Shapes Extended
        u"\U0001F800-\U0001F8FF",  # Supplemental Arrows-C
        u"\U0001F900-\U0001F9FF",  # Supplemental Symbols and Pictographs
        u"\U0001FA70-\U0001FAFF",  # Symbols and Pictographs Extended-A
        u"\U000E0020-\U000E007F",  # Tag characters for emoji
    )
    # One character class built from all ranges; runs of matches are removed together.
    matcher = re.compile("[" + "".join(unicode_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', data)


def lowercase(text):
    """Convert ``text`` to ``str`` and return it in lower case.

    Whitespace is left untouched; non-string input is stringified first.
    """
    return str(text).lower()

In [3]:
# Load the raw dataset; the preview shows a 'text' column and a boolean 'humor' label.
df = pd.read_csv('data/dataset.csv')
df.head()

Unnamed: 0,text,humor
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False
1,Watch: darvish gave hitter whiplash with slow ...,False
2,What do you call a turtle without its shell? d...,True
3,5 reasons the 2016 election feels so personal,False
4,"Pasco police shot mexican migrant from behind,...",False


In [4]:
# Run the string-level cleaners over the 'text' column, one after another:
# HTML tags, then URLs, then emoji, and finally lowercasing.
for cleaner in (remove_html, remove_url, remove_emoji, lowercase):
    df['text'] = df['text'].apply(cleaner)

df.head()

Unnamed: 0,text,humor
0,"joe biden rules out 2020 bid: 'guys, i'm not r...",False
1,watch: darvish gave hitter whiplash with slow ...,False
2,what do you call a turtle without its shell? d...,True
3,5 reasons the 2016 election feels so personal,False
4,"pasco police shot mexican migrant from behind,...",False


Expand contractions

In [5]:
# Expand contractions (e.g. "i'm" becomes "i am" in the preview further down) while
# the apostrophes are still present, i.e. before punctuation is stripped.
# NOTE(review): the previews show "its shell?" turning into "its she will", so
# contractions.fix appears to treat the word "shell" as "she'll" - confirm and
# consider protecting such words.
df['text'] = df['text'].apply(contractions.fix)

Remove punctuation marks

In [6]:
def remove_punctuation(text):
    """Return ``text`` with every character that is neither a word character
    nor whitespace deleted.

    Characters are removed, not replaced by a space, and the underscore is kept
    because ``\\w`` includes it.
    """
    return re.sub(r'[^\w\s]', '', text)

In [7]:
# Strip punctuation from every row of the 'text' column, then preview the result.
df['text'] = df['text'].apply(remove_punctuation)
df.head()

Unnamed: 0,text,humor
0,joe biden rules out 2020 bid guys i am not run...,False
1,watch darvish gave hitter whiplash with slow p...,False
2,what do you call a turtle without its she will...,True
3,5 reasons the 2016 election feels so personal,False
4,pasco police shot mexican migrant from behind ...,False


Tokenization

In [8]:
# Split each cleaned string into tokens with NLTK's TweetTokenizer; as the preview
# shows, the 'text' column holds a list of tokens per row after this cell.
tokenizer = TweetTokenizer()
df['text'] = df['text'].apply(tokenizer.tokenize)
df.head()

Unnamed: 0,text,humor
0,"[joe, biden, rules, out, 2020, bid, guys, i, a...",False
1,"[watch, darvish, gave, hitter, whiplash, with,...",False
2,"[what, do, you, call, a, turtle, without, its,...",True
3,"[5, reasons, the, 2016, election, feels, so, p...",False
4,"[pasco, police, shot, mexican, migrant, from, ...",False


Stopword Removal

In [9]:
# Print NLTK's English stop-word list for inspection, to check that any tokens we
# intend to keep are not redundant with it.
stop_words = set(stopwords.words('english'))
print(stop_words)

{'them', 'but', 'any', 'that', 'only', 're', 'if', 'had', 'mustn', 'd', 'for', "didn't", 'its', "that'll", 'having', 'hadn', 'should', 'ma', 'myself', 'over', 'is', 'wasn', 'about', 'did', 'has', 'or', "it's", "don't", 'needn', 'with', 'mightn', 'those', 'once', 'so', 'above', 'ain', 'doing', 'are', 'our', 'itself', 'during', 'after', 'm', 'now', 'whom', "hasn't", 'into', 'ours', 'own', 'herself', "you're", 'such', "should've", "isn't", 'your', 'which', 'few', 'then', 'theirs', "shouldn't", 'an', 'same', "needn't", 'shan', 'how', 'who', 'do', 'my', 'yourself', 'between', 'too', 'being', 'his', 'were', 'against', 'by', 'some', 'me', "wasn't", 'was', 'no', 'this', 'most', 'off', "weren't", 'while', 'nor', "you'd", 'haven', "wouldn't", 'himself', 'not', 'it', 'to', 'am', 'weren', 'hers', 'more', 'aren', "aren't", 'themselves', 'won', 'out', 'what', 'in', 'until', 'below', 'be', 'of', 'ourselves', 'y', 'shouldn', 'their', 'will', 'have', 'at', 'can', 'on', 'yours', 'each', 'didn', 'other',

In [10]:
# Cache for NLTK's English stop-word list; filled on the first call that needs it so
# the corpus is not re-read and re-hashed for every row.
_ENGLISH_STOP_WORDS = None


def filter_tokens(tokens, stop_words=None):
    """Return the tokens that are not stop words, in their original order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single document.
    stop_words : collection of str, optional
        Words to drop. When omitted, NLTK's English stop-word list is used; it
        is loaded once and reused on later calls.

    Returns
    -------
    list of str
        The tokens that are not in ``stop_words``.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS
    return [token for token in tokens if token not in stop_words]

In [11]:
# Drop stop words from every row's token list.
df['text'] = df['text'].apply(filter_tokens)

In [12]:
# Preview the token lists after stop-word removal.
df.head()

Unnamed: 0,text,humor
0,"[joe, biden, rules, 2020, bid, guys, running]",False
1,"[watch, darvish, gave, hitter, whiplash, slow,...",False
2,"[call, turtle, without, dead]",True
3,"[5, reasons, 2016, election, feels, personal]",False
4,"[pasco, police, shot, mexican, migrant, behind...",False


Lemmatization

In [13]:
# Single shared lemmatizer instance, reused for every row.
lemmatizer = WordNetLemmatizer()


def lemmatize_tokens(filtered_tokens):
    """Lemmatize each token and return the results joined by single spaces.

    ``lemmatize`` is called with its default arguments. Note that the return
    value is one string, not a list of tokens.
    """
    lemmas = map(lemmatizer.lemmatize, filtered_tokens)
    return ' '.join(lemmas)

In [14]:
# Lemmatize every token list; lemmatize_tokens joins its result, so the 'text'
# column holds plain strings again after this cell.
df['text'] = df['text'].apply(lemmatize_tokens)

In [15]:
# Preview the fully cleaned, lemmatized text.
df.head()

Unnamed: 0,text,humor
0,joe biden rule 2020 bid guy running,False
1,watch darvish gave hitter whiplash slow pitch,False
2,call turtle without dead,True
3,5 reason 2016 election feel personal,False
4,pasco police shot mexican migrant behind new a...,False


# Export train and test dataframes

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['humor'],
    test_size=0.2,         # hold out 20% of the rows for testing
    random_state=42,       # fixed seed so the split is reproducible
    stratify=df['humor'],  # keep the class ratio the same in both splits
)

In [17]:
# Put the training texts and labels side by side and give the rows a fresh
# 0..n-1 index (the shuffled original index is discarded).
train_split = (
    pd.concat([pd.DataFrame(X_train), pd.DataFrame(y_train)], axis=1)
    .reset_index(drop=True)
    # Only has an effect if the columns came through as positional labels 0 and 1.
    .rename(columns={0: 'text', 1: 'humor'})
)
train_split.head()

Unnamed: 0,text,humor
0,watch swimmer disappear winter storm jonas,False
1,laughed reagan trump idea outlast political stage,False
2,hey cold go corner 90 degress,True
3,cannot get standing desk almost good,False
4,want hear joke penis never mind long,True


In [18]:
# Count how many training rows fall into each 'humor' class.
train_split['humor'].value_counts()

humor
False    80000
True     80000
Name: count, dtype: int64

In [19]:
# Put the test texts and labels side by side and give the rows a fresh
# 0..n-1 index (the shuffled original index is discarded).
test_split = (
    pd.concat([pd.DataFrame(X_test), pd.DataFrame(y_test)], axis=1)
    .reset_index(drop=True)
    # Only has an effect if the columns came through as positional labels 0 and 1.
    .rename(columns={0: 'text', 1: 'humor'})
)
test_split.head()

Unnamed: 0,text,humor
0,thought reddit joke today triangle rectangle f...,True
1,much pirate pay corn buck ear,True
2,hillary clinton sent book every gop candidatee...,False
3,italian union lambast new museum bos working hard,False
4,life ocean surface wholly depends live,False


In [20]:
# Count how many test rows fall into each 'humor' class.
test_split['humor'].value_counts()

humor
True     20000
False    20000
Name: count, dtype: int64

In [21]:
# Export dataframes as csv files.
# The output directory is created first because to_csv does not create missing
# parent directories and would fail on a fresh checkout.
output_dir = Path("data/ML")
output_dir.mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the files, so they contain only the
# 'text' and 'humor' columns.
train_split.to_csv(output_dir / "ML_train.csv", index=False)
test_split.to_csv(output_dir / "ML_test.csv", index=False)