In [1]:
import pandas as pd
import numpy as np
from textblob import TextBlob, Word
import string
import nltk
from nltk.corpus import stopwords
from nltk import PorterStemmer
import re
from nltk.stem import WordNetLemmatizer
from autocorrect import Speller

In [2]:
# Fetch the NLTK resources used by this notebook, in a fixed order.
for _nltk_package in ('stopwords', 'wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_package)

# Stopwords are very common words that add little meaning on their own, e.g. "is", "he", "that".
STOPWORDS = stopwords.words("english")

[nltk_data] Downloading package stopwords to C:\Users\Tu
[nltk_data]     Lam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Tu
[nltk_data]     Lam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Tu
[nltk_data]     Lam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Tu Lam\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
def deEmojify(inputString):
    """Return ``inputString`` with every non-ASCII character dropped.

    Emojis disappear as a consequence, but so does any other character that
    ASCII cannot represent (accented letters, typographic quotes, ...).
    """
    # errors='ignore' silently discards whatever cannot be encoded as ASCII.
    ascii_bytes = inputString.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

In [4]:
def lemmatize_with_postag(sentence):
    """Lemmatize every word of ``sentence`` according to its part-of-speech tag.

    The sentence is tagged through ``TextBlob(sentence).tags``; the first letter
    of each tag selects the part-of-speech code handed to ``lemmatize`` and any
    tag whose first letter is not listed below falls back to 'n'.

    Returns the lemmas joined by single spaces.
    """
    # First letter of the tag -> part-of-speech code passed to Word.lemmatize
    pos_by_initial = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}

    lemmas = []
    for word, pos_tag in TextBlob(sentence).tags:
        pos_code = pos_by_initial.get(pos_tag[0], 'n')
        lemmas.append(word.lemmatize(pos_code))
    return " ".join(lemmas)

In [5]:
# Contraction conversion and spelling check
# English spell checker instance; more_clean_text calls it once per token.
spell = Speller(lang='en')
# Maps a contraction to its expanded form; consumed by expand_contractions.
# Keys are matched case-insensitively there, so capitalised keys ("I'm") also
# cover their lowercase spelling.
contractions_dict = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I had",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that had",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there had",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they had",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

def expand_contractions(text, contractions_dict):
    """Replace every contraction found in ``text`` by its expanded form.

    Parameters
    ----------
    text : str
        Text to process.
    contractions_dict : dict
        Maps a contraction (e.g. "can't") to its expansion (e.g. "cannot").
        Keys are matched case-insensitively.

    Returns
    -------
    str
        ``text`` with contractions expanded and every remaining apostrophe
        removed.
    """
    # An empty mapping would compile to the pattern "()", which matches the
    # empty string at every position; there is nothing to expand in that case.
    if not contractions_dict:
        return re.sub("'", "", text)

    # Matching ignores case, so the lookup has to as well: without this a
    # lowercase "i'm" matches the key "I'm" but resolves to None and is dropped.
    lowercase_lookup = {key.lower(): value for key, value in contractions_dict.items()}

    # Regex alternation takes the first alternative that matches, so longer keys
    # must come first or "can't've" is consumed as "can't" plus a stray "ve".
    ordered_keys = sorted(contractions_dict, key=len, reverse=True)
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
        flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        # Prefer the exact spelling, then the case-insensitive one; as a last
        # resort keep the matched text rather than deleting it.
        expanded_contraction = contractions_dict.get(match)
        if not expanded_contraction:
            expanded_contraction = lowercase_lookup.get(match.lower(), match)
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    # Drop apostrophes that were not part of a known contraction (possessives, quotes).
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [6]:
# Remove digits, expand contractions and spell-correct each token
def more_clean_text(text):
    """Strip digits from ``text``, expand contractions and spell-correct every token.

    Non-string input yields an empty string. The result is the corrected
    tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    without_digits = ''.join(ch for ch in text if not ch.isdigit())
    expanded = expand_contractions(without_digits, contractions_dict)
    corrected_tokens = [spell(token) for token in nltk.word_tokenize(expanded)]
    return ' '.join(corrected_tokens)

In [7]:
# Size of the stopword list before any customisation
len(STOPWORDS)

179

In [8]:
# Keep 'not' out of the stopword list: negation matters for sentiment analysis.
# Guarded so that re-running this cell does not raise ValueError once 'not' is gone.
if 'not' in STOPWORDS:
    STOPWORDS.remove('not')
len(STOPWORDS)

178

In [10]:
def clean_text(text):
    """Normalise one raw review into a lemmatized, stopword-free string.

    Pipeline:
      1. map typographic apostrophes to "'" and drop non-ASCII characters,
      2. expand contractions (while apostrophes are still present),
      3. remove punctuation and lowercase,
      4. remove digits and spell-correct tokens (``more_clean_text``),
      5. drop stopwords,
      6. lemmatize with part-of-speech tags.

    Parameters
    ----------
    text : object
        Raw review text. Anything that is not a ``str`` (for example NaN for a
        missing review) yields an empty string.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    # deEmojify discards non-ASCII characters, which would turn "don’t" into
    # "dont" before it can be recognised as a contraction.
    text_cleaned = text
    for curly_quote in ('\u2018', '\u2019'):
        text_cleaned = text_cleaned.replace(curly_quote, "'")
    text_cleaned = deEmojify(text_cleaned)  # remove emojis / non-ASCII

    # Contractions must be expanded BEFORE punctuation is stripped: the
    # apostrophe is punctuation, and without it "haven't" can no longer match.
    # Done before lowercasing so capitalised keys such as "I'm" match exactly.
    text_cleaned = expand_contractions(text_cleaned, contractions_dict)

    text_cleaned = "".join(ch for ch in text_cleaned if ch not in string.punctuation)  # remove punctuation
    text_cleaned = text_cleaned.lower()  # converting to lowercase
    text_cleaned = more_clean_text(text_cleaned)  # remove digits, spelling check

    # split() without an argument also collapses repeated whitespace, so no
    # empty tokens reach the lemmatizer.
    stop_words = set(STOPWORDS)
    tokens = [token for token in text_cleaned.split() if token not in stop_words]

    # Pass a plain sentence: str(tokens) would hand the tagger a Python list
    # repr full of brackets, quotes and commas.
    return lemmatize_with_postag(" ".join(tokens))

In [11]:
# Load the current Goodreads reviews dataframe
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a row index saved by an
# earlier export — confirm whether it should be read with index_col=0 or dropped.
# NOTE(review): 'asin' values carry leading zeros; consider pinning dtype={'asin': str} — TODO confirm.
df = pd.read_csv('final_goodreads.csv')
df

Unnamed: 0,book_id,overall,reviewTime,asin,reviewText
0,22551730,4,"Dec 14, 2016",0307408868,Another hard to put down nonfiction book from ...
1,18176747,5,"Dec 21, 2016",0062273205,I haven't read many (any?) books that are writ...
2,137554,0,"Mar 20, 2014",006073731X,Sacca and Nate recommend
3,40955,5,"Dec 21, 2016",0071424911,A truly inspirational book by a truly inspirat...
4,9850443,3,"Aug 05, 2012",0062041266,"A fun, dark, slightly comical western about tw..."
...,...,...,...,...,...
906871,4405141,3,"Aug 19, 2014",0061698954,While i liked it and appreciated all the infor...
906872,4405141,5,"Apr 15, 2013",0061698954,If you know anyone suffering from an eating di...
906873,4405141,5,"Jul 28, 2015",0061698954,Fabulous insight to what people struggling wit...
906874,4405141,5,"Mar 30, 2009",0061698954,This is an excellent resource -best book I hav...


In [12]:
# Create a placeholder column (all NaN) for the cleaned text;
# it is overwritten further down once clean_text has been applied to every review.
df['cleaned_text'] = np.nan
df

Unnamed: 0,book_id,overall,reviewTime,asin,reviewText,cleaned_text
0,22551730,4,"Dec 14, 2016",0307408868,Another hard to put down nonfiction book from ...,
1,18176747,5,"Dec 21, 2016",0062273205,I haven't read many (any?) books that are writ...,
2,137554,0,"Mar 20, 2014",006073731X,Sacca and Nate recommend,
3,40955,5,"Dec 21, 2016",0071424911,A truly inspirational book by a truly inspirat...,
4,9850443,3,"Aug 05, 2012",0062041266,"A fun, dark, slightly comical western about tw...",
...,...,...,...,...,...,...
906871,4405141,3,"Aug 19, 2014",0061698954,While i liked it and appreciated all the infor...,
906872,4405141,5,"Apr 15, 2013",0061698954,If you know anyone suffering from an eating di...,
906873,4405141,5,"Jul 28, 2015",0061698954,Fabulous insight to what people struggling wit...,
906874,4405141,5,"Mar 30, 2009",0061698954,This is an excellent resource -best book I hav...,


In [13]:
# Track progress: tqdm.pandas() registers the progress_apply methods on pandas objects,
# so the long-running cleaning step below can show a progress bar.
from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [None]:
# Clean text
# Apply clean_text straight to the reviewText column: same result as the row-wise
# DataFrame apply, without building a Series for each of the ~900k rows.
df['cleaned_text'] = df['reviewText'].progress_apply(clean_text)

 24%|████████████████▌                                                   | 220641/906876 [12:47:24<20:22:49,  9.35it/s]

In [None]:
# Inspect the dataframe after the cleaned_text column has been filled in
df

In [None]:
# Export the processed dataframe to CSV; index=False keeps the row index out of the file
df.to_csv('goodreads_processed_text_new.csv', index=False)