This notebook contains functions to pre-process raw text inputs: noise entity removal, text normalization, and conversion of labels.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# 1. Noise Entity Removal
This section covers noise entity removal. First, we convert all words to lower case. We then remove HTML tags, non-word characters, digits, and extra spaces. Finally, we remove stopwords from the documents.

In [2]:
# Matches one HTML-style tag: '<', the shortest possible run of characters, then '>'.
HTML_PATTERN = re.compile('<.*?>')

# NLTK data needed by this section.
nltk.download('stopwords')
# nltk.word_tokenize (used in noise_entity_removal) needs the Punkt tokenizer
# data; without it a fresh environment raises LookupError. Recent NLTK releases
# look the models up under 'punkt_tab', older ones under 'punkt', so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')

# Stored as a set so that the per-token membership test is cheap.
STOPWORDS_LIST = set(stopwords.words('english'))

def noise_entity_removal(target_input):
    """Strip noise from a raw document and drop English stopwords.

    The text is lower-cased, HTML tags are blanked out, every run of
    non-word characters and digits is collapsed to a single space, and the
    tokens that are not in STOPWORDS_LIST are joined back with spaces.

    Parameters
    ----------
    target_input : str, None or float NaN
        Raw document. None / NaN (what pandas yields for an empty text
        cell) is treated as an empty document.

    Returns
    -------
    str
        Space-separated tokens with noise and stopwords removed; "" when
        the input is missing.
    """
    # Missing values would otherwise fail on .lower() with AttributeError.
    # NaN is the only float that compares unequal to itself.
    if target_input is None or (isinstance(target_input, float) and target_input != target_input):
        return ""

    # convert to lower case
    text = target_input.lower()

    # remove html tags
    text = HTML_PATTERN.sub(' ', text)

    # Replace each run of non-word characters (#, *, %, whitespace, ...) and
    # digits by one space. This is the single-pass equivalent of substituting
    # \W, then \d, then collapsing \s+. Underscores are word characters and
    # are therefore kept.
    text = re.sub(r'[\W\d]+', ' ', text)

    # remove stopwords (and any empty token)
    tokens = nltk.word_tokenize(text)
    kept_tokens = [token for token in tokens if token and token not in STOPWORDS_LIST]

    # join the list of tokens back to string
    return " ".join(kept_tokens)

## 2. Text Normalization
In this section, we normalize our documents by either stemming or lemmatizing. Since lemmatization is better able to retain the sentiment meaning of words, we make lemmatization the default.

In [3]:
# NLTK data needed by this section; without it a fresh environment raises
# LookupError the first time the lemmatizer or the POS tagger is used.
nltk.download('wordnet')  # corpus behind WordNetLemmatizer
# Model behind nltk.pos_tag. Recent NLTK releases look it up under the
# '_eng' id, older ones under the plain id, so fetch both.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

LEMMATIZER = WordNetLemmatizer()
STEMMER = PorterStemmer()

# POS Tags to be kept (Noun, Verb, Adjective, Adverb) (n,v,a,r)
KEPT_POSTAGS = ['JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS', 'VBZ', 'VBP', 'VBN', 'VBG','VBD', 'VB', 'RBS', 'RB', 'RBR']
# Tags that mylemmatize lemmatizes as nouns (pos='n').
NOUN_POSTAGS = ['NN', 'NNS', 'NNP', 'NNPS']
# Tags that mylemmatize lemmatizes as verbs (pos='v').
VERB_POSTAGS = ['VBZ', 'VBP', 'VBN', 'VBG','VBD', 'VB']

def mylemmatize(word, pos):
    """Lemmatize a single token according to its POS tag.

    Parameters
    ----------
    word : str
        Token to lemmatize.
    pos : str
        POS tag of the token, compared against VERB_POSTAGS and NOUN_POSTAGS.

    Returns
    -------
    str
        The verb lemma for tags in VERB_POSTAGS, the noun lemma for tags in
        NOUN_POSTAGS, and the token itself, untouched, for every other tag.
    """
    # Translate the tag into the part-of-speech letter passed to the lemmatizer.
    if pos in VERB_POSTAGS:
        lemma_pos = 'v'
    elif pos in NOUN_POSTAGS:
        lemma_pos = 'n'
    else:
        # Neither a verb nor a noun tag: leave the token as it is.
        return word

    return LEMMATIZER.lemmatize(word, pos=lemma_pos)

def text_normalization(target_input, method = 'lemmatize'):
    """Normalize a document by lemmatizing or stemming each of its tokens.

    Parameters
    ----------
    target_input : str
        Document to normalize.
    method : {'lemmatize', 'stem'}, default 'lemmatize'
        'lemmatize' POS-tags the tokens and passes each (token, tag) pair to
        mylemmatize; 'stem' applies STEMMER.stem to every token.

    Returns
    -------
    str
        The normalized tokens joined by single spaces.

    Raises
    ------
    ValueError
        If ``method`` is neither 'lemmatize' nor 'stem'.
    """
    # Reject unknown methods up front. Previously they fell through both
    # branches and failed with UnboundLocalError on the return statement.
    if method not in ('lemmatize', 'stem'):
        raise ValueError(
            f"method must be 'lemmatize' or 'stem', got {method!r}"
        )

    tokens = nltk.word_tokenize(target_input)

    if method == 'lemmatize':
        # pos_tag yields (token, tag) pairs.
        normalized_tokens = [mylemmatize(word, tag) for word, tag in nltk.pos_tag(tokens)]
    else:
        normalized_tokens = [STEMMER.stem(word) for word in tokens]

    return " ".join(normalized_tokens)

## 3. Standardise Labels
Since the dataset labels for sentiments are in the form of words (i.e. positive and negative), we will convert these labels to integers instead. <br>
Positive: 0 <br>
Negative: 1

In [4]:
def label_to_integer(sentiment_label):
    """Convert a textual sentiment label into its integer code.

    Parameters
    ----------
    sentiment_label : str
        'positive' or 'negative' (exact, case-sensitive match).

    Returns
    -------
    int or None
        0 for 'positive', 1 for 'negative', None for anything else.
    """
    if sentiment_label == 'positive':
        return 0
    if sentiment_label == 'negative':
        return 1
    # Unrecognised label.
    return None

# Read in and Process Dataset

In [5]:
# Load the raw reviews; the cells below read the 'Text' and 'Sentiment' columns.
df = pd.read_csv('raw/reviews.csv')
# Bare last expression so that the notebook renders the frame.
df

Unnamed: 0,Sentiment,Time,Text
0,positive,18/6/21,This is a very healthy dog food. Good for thei...
1,positive,7/7/21,I've been very pleased with the Natural Balanc...
2,positive,18/6/21,"Before I was educated about feline nutrition, ..."
3,positive,7/7/21,"My holistic vet recommended this, along with a..."
4,positive,1/7/21,I bought this coffee because its much cheaper ...
...,...,...,...
5439,negative,26/2/21,"This is an okay gift box, only if you like med..."
5440,negative,18/12/19,It looks llike I just walked into a raw deal. ...
5441,negative,19/1/20,Thank god that i tasted the metal before i swa...
5442,negative,13/9/20,This product was very good when I began buying...


In [6]:
# Step 1: clean the raw review text into a new 'processed_text' column.
df['processed_text'] = df['Text'].apply(noise_entity_removal)
# Step 2: normalize the cleaned text in place (default method: lemmatization).
df['processed_text'] = df['processed_text'].apply(text_normalization)
# Step 3: overwrite the textual labels with their integer codes.
# NOTE: label_to_integer only recognises the strings 'positive' / 'negative',
# so re-running this cell on the already converted column yields None everywhere.
df['Sentiment'] = df['Sentiment'].apply(label_to_integer)

In [7]:
# Write the processed frame to disk; index=False leaves the DataFrame index out of the file.
# NOTE(review): the 'curated/reviews/' directory is presumably expected to exist already — confirm.
df.to_csv('curated/reviews/yiting_cleaned_reviews.csv', index = False)