In [1]:
import pandas as pd
import numpy as np
from textblob import TextBlob, Word
from matplotlib.font_manager import FontProperties
import seaborn as sns
from matplotlib import rcParams
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.ticker as ticker
import datetime
import warnings 
#sns.set_style("darkgrid",{"axes.axisbelow" : False })
warnings.simplefilter('ignore')
import string
import nltk
from nltk.corpus import stopwords
from nltk import PorterStemmer
import string   
import re
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
STOPWORDS = stopwords.words("english") #stopwords are the most common unnecessary words. eg is, he, that, etc.

[nltk_data] Downloading package stopwords to C:\Users\Tu
[nltk_data]     Lam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Tu
[nltk_data]     Lam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Tu
[nltk_data]     Lam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Tu Lam\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
def deEmojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii') # A function to remove emojis from the reviews

In [4]:
def lemmatize_with_postag(sentence):
    sent = TextBlob(sentence)
    tag_dict = {"J": 'a', 
                "N": 'n', 
                "V": 'v', 
                "R": 'r'}
    words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]    
    lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
    return " ".join(lemmatized_list)

In [5]:
def clean_text(text):
    #ps=PorterStemmer()
    #wordnet_lemmatizer = WordNetLemmatizer()
    if not isinstance(text, str):
        return ''
    text=deEmojify(text) # remove emojis
    text_cleaned="".join([x for x in text if x not in string.punctuation]) # remove punctuation
    
    text_cleaned=re.sub(' +', ' ', text_cleaned) # remove extra white spaces
    text_cleaned=text_cleaned.lower() # converting to lowercase
    tokens=text_cleaned.split(" ")
    tokens=[token for token in tokens if token not in STOPWORDS] # Taking only those words which are not stopwords
    
    #Converting to lemma
    #text_cleaned=" ".join([wordnet_lemmatizer.lemmatize(token) for token in tokens])
    #text_cleaned=" ".join([ps.stem(token) for token in tokens])
    text_cleaned = lemmatize_with_postag(str(tokens))
    for r in (("\' ", ''), ('\'', ''), ('[',''),  (']','')):
        text_cleaned = text_cleaned.replace(*r)
    #print(text_cleaned)                            
    return text_cleaned

In [6]:
# Load current dataframe
df = pd.read_csv('final_goodreads.csv')
df

Unnamed: 0,book_id,overall,reviewTime,asin,reviewText
0,22551730,4,"Dec 14, 2016",0307408868,Another hard to put down nonfiction book from ...
1,18176747,5,"Dec 21, 2016",0062273205,I haven't read many (any?) books that are writ...
2,137554,0,"Mar 20, 2014",006073731X,Sacca and Nate recommend
3,40955,5,"Dec 21, 2016",0071424911,A truly inspirational book by a truly inspirat...
4,9850443,3,"Aug 05, 2012",0062041266,"A fun, dark, slightly comical western about tw..."
...,...,...,...,...,...
906871,4405141,3,"Aug 19, 2014",0061698954,While i liked it and appreciated all the infor...
906872,4405141,5,"Apr 15, 2013",0061698954,If you know anyone suffering from an eating di...
906873,4405141,5,"Jul 28, 2015",0061698954,Fabulous insight to what people struggling wit...
906874,4405141,5,"Mar 30, 2009",0061698954,This is an excellent resource -best book I hav...


In [7]:
# Create new column for cleaned text
df['cleaned_text'] = np.nan
df

Unnamed: 0,book_id,overall,reviewTime,asin,reviewText,cleaned_text
0,22551730,4,"Dec 14, 2016",0307408868,Another hard to put down nonfiction book from ...,
1,18176747,5,"Dec 21, 2016",0062273205,I haven't read many (any?) books that are writ...,
2,137554,0,"Mar 20, 2014",006073731X,Sacca and Nate recommend,
3,40955,5,"Dec 21, 2016",0071424911,A truly inspirational book by a truly inspirat...,
4,9850443,3,"Aug 05, 2012",0062041266,"A fun, dark, slightly comical western about tw...",
...,...,...,...,...,...,...
906871,4405141,3,"Aug 19, 2014",0061698954,While i liked it and appreciated all the infor...,
906872,4405141,5,"Apr 15, 2013",0061698954,If you know anyone suffering from an eating di...,
906873,4405141,5,"Jul 28, 2015",0061698954,Fabulous insight to what people struggling wit...,
906874,4405141,5,"Mar 30, 2009",0061698954,This is an excellent resource -best book I hav...,


In [10]:
# Track progress
from tqdm import tqdm
tqdm.pandas()

In [11]:
# Clean text
df['cleaned_text'] = df.progress_apply(lambda row: clean_text(row.reviewText), axis=1)

100%|████████████████████████████████████████████████████████████████████████| 906876/906876 [5:20:15<00:00, 47.20it/s]


In [12]:
# Check dataframe
df

Unnamed: 0,book_id,overall,reviewTime,asin,reviewText,cleaned_text
0,22551730,4,"Dec 14, 2016",0307408868,Another hard to put down nonfiction book from ...,another hard put nonfiction book erik larson ...
1,18176747,5,"Dec 21, 2016",0062273205,I haven't read many (any?) books that are writ...,havent books ceos ceos ceo aspire ceo really ...
2,137554,0,"Mar 20, 2014",006073731X,Sacca and Nate recommend,sacca nate recommend
3,40955,5,"Dec 21, 2016",0071424911,A truly inspirational book by a truly inspirat...,truly inspirational book truly inspirational ...
4,9850443,3,"Aug 05, 2012",0062041266,"A fun, dark, slightly comical western about tw...",fun dark slightly comical western two killers...
...,...,...,...,...,...,...
906871,4405141,3,"Aug 19, 2014",0061698954,While i liked it and appreciated all the infor...,liked appreciated information pertains suppor...
906872,4405141,5,"Apr 15, 2013",0061698954,If you know anyone suffering from an eating di...,know anyone suffering eating book tons inform...
906873,4405141,5,"Jul 28, 2015",0061698954,Fabulous insight to what people struggling wit...,fabulous insight struggling illness feel expe...
906874,4405141,5,"Mar 30, 2009",0061698954,This is an excellent resource -best book I hav...,excellent resource best book subject


In [13]:
# Export dataframe to csv
df.to_csv('goodreads_processed_text.csv', index=False)