In [1]:
import pandas as pd
import numpy as np
from textblob import TextBlob, Word
import string
import nltk
from nltk.corpus import stopwords
from nltk import PorterStemmer
import re
from nltk.stem import WordNetLemmatizer
from autocorrect import Speller
import multiprocessing as mp

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
STOPWORDS = stopwords.words("english") #stopwords are the most common unnecessary words. eg is, he, that, etc.

[nltk_data] Downloading package stopwords to C:\Users\Tu
[nltk_data]     Lam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Tu
[nltk_data]     Lam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Tu
[nltk_data]     Lam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Tu Lam\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
STOPWORDS.append('could')
STOPWORDS.append('also')
STOPWORDS.append('would')
STOPWORDS.append('really')

In [4]:
def deEmojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii') # A function to remove emojis from the reviews

In [5]:
def lemmatize_with_postag(sentence):
    sent = TextBlob(sentence)
    tag_dict = {"J": 'a', 
                "N": 'n', 
                "V": 'v', 
                "R": 'r'}
    words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]    
    lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
    return " ".join(lemmatized_list)

In [6]:
# Contraction convert, spelling check
spell = Speller(lang='en')
contractions_dict = {     
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I had",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "iit will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that had",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there had",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they had",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

def expand_contractions(text, contractions_dict):
    contractions_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contractions_dict.get(match) \
            if contractions_dict.get(match) \
            else contractions_dict.get(match.lower())
        expanded_contraction = expanded_contraction
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [7]:
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

In [8]:
def clean_text(text):
    if not isinstance(text, str):
        return ''
    text=deEmojify(text) # remove emojis
    text_cleaned = re.sub(' +', ' ', text) # remove extra white spaces
    text_cleaned = remove_tags(text_cleaned) # remove tags
    text_cleaned = text_cleaned.lower() # converting to lowercase
    text_cleaned = ''.join(c for c in text_cleaned if not c.isdigit())# remove numbers
    text_cleaned = expand_contractions(text_cleaned, contractions_dict) # contraction & spelling check
    text_cleaned="".join([x for x in text_cleaned if x not in string.punctuation]) # remove punctuation

    text_cleaned = nltk.word_tokenize(text_cleaned)
    text_cleaned = [x for x in text_cleaned if len(x) < 20]
    text_cleaned = [spell(w) for w in (text_cleaned)]   # correct spelling
    
    # Taking only those words which are not stopwords
    text_cleaned=" ".join([token for token in text_cleaned if token not in STOPWORDS])

    #Converting to lemma
    text_cleaned = lemmatize_with_postag(str(text_cleaned))
    
    return text_cleaned

In [9]:
# Track progress
from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [10]:
# Load current dataframe
metadata = pd.read_csv('book_metadata.csv')
metadata

Unnamed: 0,book_id,publication_year,publication_month,publication_day,publisher,description,num_pages,format,genres,asin
0,780911,2002.0,9.0,16.0,Houghton Mifflin Harcourt,"""Michel Faber leads us back to 1870s London, w...",838.0,Hardcover,"{'fiction': 2428, 'history, historical fiction...",015100692X
1,926667,1964.0,3.0,18.0,Mariner Books,A modern verse play about the search for meani...,190.0,Paperback,"{'fiction': 30, 'poetry': 29}",0156182890
2,18498572,2014.0,5.0,6.0,Broadside Books,"The dramatic, first-hand account of the histor...",384.0,Hardcover,"{'history, historical fiction, biography': 30,...",0062310194
3,268464,2007.0,5.0,29.0,William Morrow,Celebrity journalist Amelia Stone is the quint...,288.0,Hardcover,"{'fiction': 15, 'history, historical fiction, ...",0061198722
4,598199,2001.0,2.0,22.0,"Oxford University Press, USA","Throughout African-American history, religion ...",184.0,Paperback,"{'history, historical fiction, biography': 10,...",0195145852
...,...,...,...,...,...,...,...,...,...,...
37234,5582304,2009.0,6.0,1.0,Da Capo Press,Drawing on intimate recollections from friends...,440.0,Hardcover,"{'history, historical fiction, biography': 22,...",0306815869
37235,3106983,2008.0,3.0,6.0,Vintage,The Story of a Childhood and The Story of a Re...,343.0,Paperback,"{'comics, graphic': 7696, 'non-fiction': 1811,...",009952399X
37236,11873,2004.0,7.0,1.0,Vintage Classics,WINNER OF THE PULITZER PRIZE\nIn 1831 Nat Turn...,480.0,Paperback,"{'fiction': 688, 'history, historical fiction,...",0099285568
37237,823091,1988.0,1.0,1.0,Harcourt Brace College Publishers,,,Hardcover,{'non-fiction': 3},0153117362


In [13]:
# Create new dataframe for cleaned text descriptions
df = metadata[['book_id', 'asin', 'description']]
df['cleaned_description'] = np.nan
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,book_id,asin,description,cleaned_description
0,780911,015100692X,"""Michel Faber leads us back to 1870s London, w...",
1,926667,0156182890,A modern verse play about the search for meani...,
2,18498572,0062310194,"The dramatic, first-hand account of the histor...",
3,268464,0061198722,Celebrity journalist Amelia Stone is the quint...,
4,598199,0195145852,"Throughout African-American history, religion ...",
...,...,...,...,...
37234,5582304,0306815869,Drawing on intimate recollections from friends...,
37235,3106983,009952399X,The Story of a Childhood and The Story of a Re...,
37236,11873,0099285568,WINNER OF THE PULITZER PRIZE\nIn 1831 Nat Turn...,
37237,823091,0153117362,,


In [14]:
# Clean text
df['cleaned_description'] = df.progress_apply(lambda row: clean_text(row.description), axis=1)

100%|█████████████████████████████████████████████████████████████████████████| 37239/37239 [10:18:35<00:00,  1.00it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
df.to_csv('goodreads_description.csv', index=False)