## Data Cleaning of Text Reviews 


### Techniques
1. Tranform all reviews to lower case 
2. Remove special characters and punctuations 
3. Remove numbers 
4. Remove stop words 
5. Remove words with repeating characters 
6. Apply Part of Speech (POS) tagging and lemmatization to get root words 

### Output 
Dataframe with cleaned text reviews for further analysis 


In [1]:
import pickle
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import re

In [2]:
data = pd.read_pickle("reviews.pkl")
data.head(10)

Unnamed: 0,overall,reviewText,department
0,5.0,No adverse comment.,Beverages
1,5.0,Gift for college student.,Beverages
2,5.0,"If you like strong tea, this is for you. It mi...",Beverages
3,5.0,Love the tea. The flavor is way better than th...,Beverages
4,5.0,I have searched everywhere until I browsed Ama...,Beverages
5,4.0,Tea made with Lipton Yellow Label teabags is m...,Beverages
6,5.0,"I love this tea! Okay, I'm not a high falutin...",Beverages
7,5.0,Discovered this tea at a local Med. Rest....a ...,Beverages
8,4.0,Well I bought this tea after being in Malaysia...,Beverages
9,5.0,We really like this tea. It is definitely dif...,Beverages


In [3]:
# Join al the reviews as a body of text 

text_old = "".join(r for r in data['reviewText'])

In [4]:
# Finding special characters that are non alphanumeric and not spaces

sp = set([char for char in text_old if not char.isalnum() and char != ' '])
print("Special characters include:",sp)

# Findind words wih repeating characters 
rep = re.findall(r'(\w)(\1{2,})', text_old)
rep = set([y for x, y in rep])
print("Repeating characters include:",rep)

Special characters include: {'+', '.', '\x03', '}', '&', '/', '<', '\x11', ')', '#', '\x08', "'", '{', '|', ';', '?', '\x10', '`', ']', ',', '-', ':', '=', '%', '$', '\n', '"', '[', '^', '@', '~', '!', '_', '\x7f', '\x1b', '*', '(', '\\', '>'}
Repeating characters include: {'RRRRRR', 'OOOOOOOOOOOOOOOOOOO', 'eeeeeeeee', 'GGG', 'hhhhhhhhh', 'nnnn', 'sssssssss', 'ssssssssssss', '1111', 'OOOOOOO', 'kkkk', 'AAAAAAAA', 'QQ', 'OO', 'dddddd', 'ooooooooooooooooooo', 'AA', 'AAAAAAAAAAAAAAAAAAA', 'eeeee', 'llll', 'uuuuu', 'SSSSSS', 'hhhhhhh', 'VVV', 'UUU', 'fff', 'IIIIIIII', 'aaaaa', 'nnn', 'RRRRRRRRRRRRRRRRRRR', 'hhhhhh', 'wwwwwwwwwwwwwwww', 'nnnnn', 'rrrrr', 'rrrrrrrrrrr', 'vvvvvvvvv', '____________', 'GG', 'sssssss', '0000000000', 'WW', 'nnnnnnnn', 'lllllllll', 'EEEEEEEEEEE', 'hhhhhhhhhhhhhhhhhhhhhhhh', 'ssss', '___________', 'OOO', 'yyyy', '_______________________________________________________________________________________________________', 'HHHH', 'zzzzzzzz', 'ZZZ', 'zzz', 'ooooooo', 'oo

In [3]:
#first cleaning round 

def clean_1(review):
    words = str(review).lower()  # lowercase all words
    words = re.sub(r'[^A-Za-z ]+', ' ', words) # replace anything that is not alphabet with space 
    words = re.sub(r'(.)\1+', r'\1\1', words) # turn"soooo goooood" into "so good"
    return(words)

data['clean1'] = data['reviewText'].apply(lambda x: clean_1(x))

In [4]:
data.head(20)

Unnamed: 0,overall,reviewText,department,clean1
0,5.0,No adverse comment.,Beverages,no adverse comment
1,5.0,Gift for college student.,Beverages,gift for college student
2,5.0,"If you like strong tea, this is for you. It mi...",Beverages,if you like strong tea this is for you it mi...
3,5.0,Love the tea. The flavor is way better than th...,Beverages,love the tea the flavor is way better than th...
4,5.0,I have searched everywhere until I browsed Ama...,Beverages,i have searched everywhere until i browsed ama...
5,4.0,Tea made with Lipton Yellow Label teabags is m...,Beverages,tea made with lipton yellow label teabags is m...
6,5.0,"I love this tea! Okay, I'm not a high falutin...",Beverages,i love this tea okay i m not a high falutin ...
7,5.0,Discovered this tea at a local Med. Rest....a ...,Beverages,discovered this tea at a local med rest a gre...
8,4.0,Well I bought this tea after being in Malaysia...,Beverages,well i bought this tea after being in malaysia...
9,5.0,We really like this tea. It is definitely dif...,Beverages,we really like this tea it is definitely diff...


In [7]:
# check after cleaning 

clean = "".join(r for r in data['clean1'])

sp = set([char for char in clean if not char.isalnum() and char != ' '])
print("Special characters include:",sp)

# Findind words wih repeating characters 
rep = re.findall(r'(\w)(\1{2,})', clean)
rep = set([y for x, y in rep])
print("Repeating characters include:",rep)

Special characters include: set()
Repeating characters include: {'yy', 'll', 'mmm', 'ss', 'ff', 'oo', 'aa', 'ww', 'aaa', 'ii', 'tt', 'ee', 'mm', 'dd'}


In [8]:
print("- Old Review -")
print(data['reviewText'][7102])
print("\n- New Review -")
print(data['clean1'][7102])

- Old Review -
GREAT PRODUCT, FAST DELIVERY, THANK YOU WILL ORDER AGAIN SOON .

- New Review -
great product  fast delivery  thank you will order again soon  


In [5]:
# Add tokenized column

data['tokens'] = data['clean1'].apply(lambda x: word_tokenize(x))

In [51]:
# Reviews before lower case, special characters, and numbers removal 
w_lists = [word_tokenize(r) for r in data['reviewText']]
words = [w for review in w_lists for w in review]
prev = len(set(words))
print("Number of unique words before lower case, special characters, and numbers removal: ",prev)

# Reviews after lower case, special characters, and numbers removal 
words_clean = [w for review in data['tokens'] for w in review]
post = len(set(words_clean))
print("Number of unique words after lower case, special characters, and numbers removal: ", post)

Number of unique words before lower case, special characters, and numbers removal:  222703
Number of unique words after lower case, special characters, and numbers removal:  98785


In [52]:
# Number of unique words before lower case, special characters, and numbers removal:  222,703
# Number of unique words after lower case, special characters, and numbers removal:  98,785
print("{:.2f}".format(((post/prev)*100)), '% of word were removed after lower case, special characters, and numbers removal')

44.36 % of word were removed after lower case, special characters, and numbers removal


## Lemmatize Words and Remove Stopwords

Stopwords are commonly used words in any language. For example, some stopwords in English include "the", "her", "can". While these words are important for sentence structures, they do not carry a lot of meaning in our analysis. Therefore, they will be removed from the reviews. 

Different forms of a word often convey the same meaning. For example, "boxes" and "box" essentially represent the same object. The same word may also appear in different tenses (i.e. "look", "looked", and "looking"). These different forms of the word are known as **inflections**. In general, inflections should be analyzed as a single term since they represent the same meaning. 

In the following section, each word in the review will be tagged for its respectice Part of Speech (POS), i.e. noun, verb, adjective, or adjective. Each word in the reviews will be tokenized. Then the nltk pos_tag library will be used to identify POS of each word. 

After that, the process of lemmatization is applied using the nltk wordnet library. Lemmatization takes into consideration the morphological analysis of the words. So lemmatization considers the grammar of the word and tries to find the root word instead of just getting to the root word by brute force methods.


For more information on lemmatization, please visit the below resource:

https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html


In [6]:
#List of English stopwords to remove, excluding negative words since they will give us weights in bigrams 

remove_from_stop = ["not", "don't", "no"]
add_to_stop = ["one", "get", "go", "even", "amazon", "could", "would", "use", "make"]
stopwords = set(stopwords.words('english')).union(set(add_to_stop)) - set(remove_from_stop)
stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'amazon',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'could',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 'down',
 'during',
 'each',
 'even',
 'few',
 'for',
 'from',
 'further',
 'get',
 'go',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'make',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'nor',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'one',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'our

In [None]:
#Second round of cleaning: initialize lemmatizer and remove stop words 

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    "Map POS tag to first character lemmatize() accepts"
    tag = nltk.pos_tag([word])[0][1][0]
    if tag == 'J':
        return wordnet.ADJ
    elif tag == 'V':
        return wordnet.VERB
    elif tag == 'N':
        return wordnet.NOUN
    elif tag == 'R':
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemma_sw_removal(review):
    words = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in review]
    words = [w for w in words if w not in stopwords]
    words = " ".join(w for w in words)
    return(words)

data['clean2'] = pd.DataFrame(data['tokens'].apply(lambda x: lemma_sw_removal(x)))


In [None]:
data[50:60]

In [None]:
#find the empty cells after cleaning

print("There are:", len(data[data['clean2'] == ''].index), "empty reviews after pre-processing") 

In [None]:
#replace empty cells with NaN values 
data['clean2'].replace('', np.nan, inplace=True)

#drop rows with NaN values 
data.dropna(subset=['clean2'], inplace=True)

In [None]:
data.to_pickle("clean_df.pkl")