In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re
import spacy
import nltk
from bs4 import BeautifulSoup
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer
from tqdm import tqdm
from wordsegment import load, segment
from autocorrect import Speller
load()
spell = Speller()

## Load the data.

In [2]:
train =pd.read_csv('data/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
df = train[['target','comment_text']]

## Preprocessing 

In [3]:
df.head()

Unnamed: 0,id,tweet
0,1635430535047811073,"""Please help stop college vaccine mandates by ..."
1,1635430534657593347,RT @SandraYozipovic: Alberta woman's COVID-19 ...
2,1635430531767865344,RT @amuse: CBS is now admitting Fauci funded c...
3,1635430530735824897,@RonFilipkowski I take no responsibility for C...
4,1635430529813151746,OMG. The 🤡🌎 we live in. 😆\n\nWell done @covid_...


In [4]:
df.shape

(98, 2)

In [5]:
wpt=nltk.WordPunctTokenizer()

In [6]:
stopwords=nltk.corpus.stopwords.words('english')

In [7]:
stemmer=PorterStemmer()

In [8]:
lemmer=WordNetLemmatizer()

In [9]:
nlp = spacy.load("en_core_web_sm") # Initialize Spacy for english language.

In [10]:
CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

In [11]:
def split_hash_tags(text):
    split_text = " ".join(segment(text))
    return split_text

In [12]:
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")  ## calling BeautifulSoup function to process text
    stripped_text = soup.get_text()  ## calling get_text to get the cleaned text
    return stripped_text   ## return the clean text

In [13]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [14]:
# # Expanding Contractions
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) \
                                   if contraction_mapping.get(match) \
                                    else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

2) Stop word and punctuation removal,

In [15]:
# # Removing Stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = wpt.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [16]:
# # Removing Special Characters
def remove_special_characters(text):
    text = re.sub('[^a-zA-Z0-9\s]', '', text)   ## Remove the characters other than a-z A-Z 0-9 and space. 
    return text

3) Stemming and lemmatization

In [17]:
# # Lemmatizing text
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [18]:
def correct_spelling(text):
    corrected_text = " ".join([spell(word) for word in text.split()])
    return corrected_text

In [19]:
def normalize_corpus(corpus, hashtag_splitting=True, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     spell_correct = True, stopword_removal=True):
    
    normalized_corpus = []
    
    for doc in tqdm(corpus):
        
        if html_stripping:
            doc = strip_html_tags(doc)
        
        if accented_char_removal:
            doc = remove_accented_chars(doc)
            
        if contraction_expansion:
            doc = expand_contractions(doc)
            
        if text_lower_case:
            doc = doc.lower()
            
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # insert spaces between special characters to isolate them    
        special_char_pattern = re.compile(r'([{.(-)!}])')
        doc = special_char_pattern.sub(" \\1 ", doc)
        
        if text_lemmatization:
            doc = lemmatize_text(doc)
            
        if special_char_removal:
            doc = remove_special_characters(doc)  
        
        if hashtag_splitting:
            doc = split_hash_tags(doc)
        
        if spell_correct:
            doc = correct_spelling(doc)
            
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [20]:
df['tweet'].head()

0    "Please help stop college vaccine mandates by ...
1    RT @SandraYozipovic: Alberta woman's COVID-19 ...
2    RT @amuse: CBS is now admitting Fauci funded c...
3    @RonFilipkowski I take no responsibility for C...
4    OMG. The 🤡🌎 we live in. 😆\n\nWell done @covid_...
Name: tweet, dtype: object

In [21]:
norm_text = normalize_corpus(df['tweet'])
#df['comment_text_normalized'] = norm_text
#df.to_csv('df_train.csv', index=False)
df['tweet_normalized'] = norm_text
df.to_csv('tweets.csv', index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 98/98 [00:20<00:00,  4.87it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tweet_normalized'] = norm_text
