This notebook contains functions to pre-process raw text inputs.

In [None]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

## 1. Noise Entity Removal
This section covers noise entity removal. First, we convert all words to lower case. We then remove HTML tags, non-word characters, digits and extra spaces. Finally, we remove stopwords from the documents.

In [None]:
# Non-greedy match of everything from a '<' to the next '>' (the '.' does not
# cross newlines); noise_entity_removal uses it to strip HTML tags.
HTML_PATTERN = re.compile('<.*?>')
# NLTK's English stopwords, held in a set so membership tests are O(1).
STOPWORDS_LIST = set(stopwords.words('english'))

def noise_entity_removal(target_input):
    """Strip noise from a raw document and return the cleaned text.

    The steps run in this order: lower-casing, removal of HTML tags,
    replacement of non-word characters and digits with spaces, collapsing
    of whitespace runs, tokenization, and stopword removal.

    Args:
        target_input: The raw document as a string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Lower-case first, then drop anything that looks like an HTML tag.
    cleaned = HTML_PATTERN.sub('', target_input.lower())

    # Non-word characters (#, *, % ...) and then digits each become a space.
    # Note that '\W' leaves underscores in place.
    for noise_pattern in (r'\W', r'\d'):
        cleaned = re.sub(noise_pattern, ' ', cleaned)

    # Squash every run of whitespace down to one space.
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # Keep only non-empty tokens that are not stopwords.
    kept_tokens = []
    for token in nltk.word_tokenize(cleaned):
        if token and token not in STOPWORDS_LIST:
            kept_tokens.append(token)

    return " ".join(kept_tokens)

## 2. Text Normalization
In this section, we normalize our documents by either stemming or lemmatizing. Since lemmatization is better at retaining sentiment meaning, we make lemmatization the default.

In [None]:
# Normalizer instances created once at import time and reused by
# text_normalization for every document.
LEMMATIZER = WordNetLemmatizer()
STEMMER = PorterStemmer()

def text_normalization(target_input, method='lemmatize'):
    """Normalize every token of a document by lemmatizing or stemming.

    Args:
        target_input: The document text; it is tokenized with
            ``nltk.word_tokenize``.
        method: ``'lemmatize'`` (default) lemmatizes each token as a verb
            (``pos='v'``); ``'stem'`` applies the Porter stemmer.

    Returns:
        The normalized tokens joined by single spaces.

    Raises:
        ValueError: If ``method`` is neither ``'lemmatize'`` nor ``'stem'``.
    """
    # Validate up front: previously an unsupported method fell through both
    # branches and crashed with UnboundLocalError at the return statement.
    if method not in ('lemmatize', 'stem'):
        raise ValueError(
            "method must be 'lemmatize' or 'stem', got {!r}".format(method)
        )

    target_input_tokens = nltk.word_tokenize(target_input)

    if method == 'lemmatize':
        normalized_tokens = [
            LEMMATIZER.lemmatize(word, pos='v') for word in target_input_tokens
        ]
    else:
        normalized_tokens = [STEMMER.stem(word) for word in target_input_tokens]

    return " ".join(normalized_tokens)

## 3. Standardise Labels (if applicable)
Two-class labels — 0: positive; 1: negative

Three-class labels — 0: positive; 1: neutral; 2: negative

In [None]:
'''
If sentiments are labelled with words, we need to convert these into integer classes
'''

def label_to_integer(sentiment_label):
    """Map a textual sentiment label to its integer class.

    Args:
        sentiment_label: The label to convert.

    Returns:
        0 for ``'positive'``, 1 for ``'negative'``, and ``None`` for any
        other value.
    """
    # Checked in order with plain equality, so any input type is accepted.
    known_labels = (('positive', 0), ('negative', 1))
    for label_text, class_id in known_labels:
        if sentiment_label == label_text:
            return class_id
    return None

'''
Assume sentiment score ranges from 0 to 5.
score <= 2: negative
score = 3: neutral
score >= 4: positive
'''
def process_score(sentiment_score):
    """Convert a numeric sentiment score into a three-way integer class.

    Args:
        sentiment_score: The numeric score to classify.

    Returns:
        0 (positive) when the score is at least 4, 1 (neutral) when it is
        exactly 3, and 2 (negative) for every other value.
    """
    if sentiment_score >= 4:
        return 0
    # Anything that is not exactly 3 at this point is treated as negative.
    return 1 if sentiment_score == 3 else 2