# Data Preprocessing

**Note:** Please make sure you have the prerequisites installed for each of these codes

In [1]:
import warnings


In [2]:
warnings.filterwarnings("ignore")

In [3]:
import nltk
from nltk import FreqDist
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tanma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
import pandas as pd
# pd.set_option("display.max_colwidth", 200)
import numpy as np
import json
import re
import gzip
import spacy

import gensim
from gensim import corpora
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords

import nltk

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

from nltk.stem import WordNetLemmatizer

from tqdm import tqdm

In [5]:
review_df = pd.read_csv('./data/reviews.csv')
review_df.drop(columns=['Unnamed: 0'], inplace= True)

In [6]:
review_df['review_text'].fillna('', inplace = True)

In [7]:
book_df = pd.read_csv("./data/top_1000_books.csv")
book_df.drop(columns=['Unnamed: 0'], inplace= True)

In [8]:
review_df.head()

Unnamed: 0,book_id,user_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,5805,6baf45d03466a5858403d892286ff222,8d680b828c9b260423f5740e67b03291,4,Could use less misogyny and stockholm syndrome.,Thu Apr 21 11:27:06 -0700 2016,Thu Apr 21 11:27:36 -0700 2016,Thu Apr 21 11:27:36 -0700 2016,,0,0
1,5805,76413aa32ae6debd29ff7c2c437f17ca,80a09e6153378edbe685b640b1b957f0,4,I love the artwork. I love the ideas that V re...,Tue Nov 18 09:14:16 -0800 2014,Sun Dec 21 16:14:29 -0800 2014,Sun Dec 21 00:00:00 -0800 2014,Tue Nov 18 00:00:00 -0800 2014,0,0
2,5805,c001efe2d798df0bbac0ba51dbed1f9b,65715d42615cf1ca8c7151bd25a22544,3,interesting graphic novel. Interesting created...,Sun Sep 07 12:35:48 -0700 2008,Sun Sep 07 12:39:38 -0700 2008,Sun Sep 07 00:00:00 -0700 2008,,0,0
3,5805,2d43e0a1f7e9c0946fdf7f71fddbf7a8,d55a8c7950191477cf62cc76677bbf19,2,"Honestly, I was really disappointed by this. O...",Sun Jun 10 11:07:52 -0700 2012,Sat Jan 30 11:14:07 -0800 2016,Sat Jan 01 00:00:00 -0800 2011,,0,0
4,5805,d6804aa6e3a96b8e1104b8b9ac3fe882,77ad3c07806bb88c5a1bfa46b9357b20,3,"I enjoyed this book---and the idea of a ""Robin...",Sat Jan 09 18:46:54 -0800 2010,Sun Jan 17 18:24:13 -0800 2010,Sat Jan 16 00:00:00 -0800 2010,,0,0


In [9]:
engaged_df = review_df[(review_df['n_votes'] > 0)|(review_df['n_comments'] > 0)]

In [10]:
engaged_df.head()

Unnamed: 0,book_id,user_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
6,5805,4cbecbc15af3db041a8e0f594c642bb5,58f2301bd2d4bbfc1b51e4e5fb161cfe,5,"Remember, remember, the fifth of November. Thi...",Wed Jun 13 17:55:53 -0700 2012,Mon Jan 30 05:58:01 -0800 2017,Thu Jun 14 00:00:00 -0700 2012,,7,0
8,5805,49cc59f1c479d698507627b401d47ecf,761a17f52538341a085b629a316204a1,4,Tinha apontado este livro como um dos que tinh...,Sun Jan 05 03:27:10 -0800 2014,Tue Oct 28 15:58:20 -0700 2014,Tue Oct 28 15:58:20 -0700 2014,,1,0
10,5805,5f03864c758bfceb6d7d5e93eeb20044,ac54c3ce0c9f660c03881b0668f79c60,5,Review coming soon! www.youtube.com/ReadTomes,Tue Feb 12 11:38:36 -0800 2013,Tue Feb 12 12:00:47 -0800 2013,Tue Feb 12 12:00:47 -0800 2013,Tue Feb 12 00:00:00 -0800 2013,1,0
12,5805,c309dff1695ed8558b29ea8dcd7479b8,0da0bcc469c2acd15350f9a8f0a74e2b,5,"What better way to celebrate Guy Fawkes Day, t...",Tue May 20 09:52:36 -0700 2014,Thu Mar 16 09:49:11 -0700 2017,Wed Nov 05 14:22:10 -0800 2014,Wed Nov 05 00:00:00 -0800 2014,2,0
13,5805,0f6b8c04f811e05c8978bd6b66ce7685,7d670b6c8cac0c086e21ae2f1af6eccb,4,To note - I am writing this review a quarter c...,Tue Jul 03 16:43:26 -0700 2012,Fri Feb 27 14:46:00 -0800 2015,Tue Jan 01 00:00:00 -0800 1991,,10,0


In [11]:
lm  = WordNetLemmatizer()

In [12]:
stop_words = set(stopwords.words('english'))

In [13]:
words = set(nltk.corpus.words.words())

In [14]:
# nlp = spacy.load('en', disable=['parser', 'ner'])
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [15]:
def preprocess(x):
    try:
        x = re.sub("n\'t", " not", x)
        x = re.sub("\'d", " would", x)
        x = re.sub("[^a-zA-Z0-9#]", " ", x)
        x = re.sub("www", " ", x)
        x = re.sub("http", " ", x)
        x = re.sub("https", " ", x)
        x = re.sub("com", " ", x)
        x = re.sub("url", " ", x)
        return x
    except:
        print(x)
        return ""
    

In [16]:
def lemmas(x, lm, tags=['NOUN', 'ADJ']):
    x = word_tokenize(x)
    
    x = [lm.lemmatize(w).lower() for w in x]
    
#     print(x)
    x = nlp(" ".join(x))
#     x = [w for w in x if w in words]
    x = [token.lemma_ for token in x if token.pos_ in tags or token.lemma_ in words]
    x = [w for w in x if not w.lower() in stop_words]
    x = [w for w in x if len(w) > 2]
    
    return x

In [17]:
engaged_df['clean_reviews'] = engaged_df['review_text'].apply(lambda x: preprocess(x))

In [18]:
engaged_df['clean_reviews'] = engaged_df['clean_reviews'].apply(lambda x: lemmas(x, lm))

In [19]:
engaged_df.head()

Unnamed: 0,book_id,user_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments,clean_reviews
6,5805,4cbecbc15af3db041a8e0f594c642bb5,58f2301bd2d4bbfc1b51e4e5fb161cfe,5,"Remember, remember, the fifth of November. Thi...",Wed Jun 13 17:55:53 -0700 2012,Mon Jan 30 05:58:01 -0800 2017,Thu Jun 14 00:00:00 -0700 2012,,7,0,"[remember, remember, fifth, part, quote, origi..."
8,5805,49cc59f1c479d698507627b401d47ecf,761a17f52538341a085b629a316204a1,4,Tinha apontado este livro como um dos que tinh...,Sun Jan 05 03:27:10 -0800 2014,Tue Oct 28 15:58:20 -0700 2014,Tue Oct 28 15:58:20 -0700 2014,,1,0,"[tinha, visto, portman, hora, livro, tinha, ge..."
10,5805,5f03864c758bfceb6d7d5e93eeb20044,ac54c3ce0c9f660c03881b0668f79c60,5,Review coming soon! www.youtube.com/ReadTomes,Tue Feb 12 11:38:36 -0800 2013,Tue Feb 12 12:00:47 -0800 2013,Tue Feb 12 12:00:47 -0800 2013,Tue Feb 12 00:00:00 -0800 2013,1,0,"[review, ing, soon, readtome]"
12,5805,c309dff1695ed8558b29ea8dcd7479b8,0da0bcc469c2acd15350f9a8f0a74e2b,5,"What better way to celebrate Guy Fawkes Day, t...",Tue May 20 09:52:36 -0700 2014,Thu Mar 16 09:49:11 -0700 2017,Wed Nov 05 14:22:10 -0800 2014,Wed Nov 05 00:00:00 -0800 2014,2,0,"[well, way, celebrate, guy, day, read, modern,..."
13,5805,0f6b8c04f811e05c8978bd6b66ce7685,7d670b6c8cac0c086e21ae2f1af6eccb,4,To note - I am writing this review a quarter c...,Tue Jul 03 16:43:26 -0700 2012,Fri Feb 27 14:46:00 -0800 2015,Tue Jan 01 00:00:00 -0800 1991,,10,0,"[note, write, review, quarter, century, read, ..."


In [20]:
engaged_df.to_csv('./data/engaged_df.csv', index = False)

In [21]:
dictionary = corpora.Dictionary(engaged_df['clean_reviews'])

In [22]:
doc_term_matrix = tqdm([dictionary.doc2bow(rev) for rev in engaged_df['clean_reviews']])

  0%|                                                                                        | 0/39787 [00:00<?, ?it/s]

### preprocessing the book df for the description data

In [24]:
book_df.columns

Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'],
      dtype='object')

In [25]:
book_df['processed_description'] = book_df['description'].apply(lambda x: preprocess(x))
book_df['processed_description'] = book_df['processed_description'].apply(lambda x: lemmas(x, lm))

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


In [26]:
book_df.head()

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series,processed_description
0,1401207928,3400,['631559'],US,eng,"[{'count': '42534', 'name': 'to-read'}, {'coun...",,False,4.25,B0064W65UM,...,,2005.0,https://www.goodreads.com/book/show/5805.V_for...,https://images.gr-assets.com/books/1343668985m...,5805,212464,392838,V for Vendetta,V for Vendetta,"[remember, remember, fifth, frightening, power..."
1,394541553,4402,['186595'],US,eng,"[{'count': '92915', 'name': 'to-read'}, {'coun...",,False,4.35,,...,,1991.0,https://www.goodreads.com/book/show/15196.Maus_I,https://images.gr-assets.com/books/1327884972m...,15196,189288,1947012,Maus I: A Survivor's Tale: My Father Bleeds Hi...,Maus I: A Survivor's Tale: My Father Bleeds Hi...,"[story, jewish, survivor, son, cartoonist, try..."
2,1401207529,1790,"['482441', '300099', '194163', '636672']",US,eng,"[{'count': '7924', 'name': 'to-read'}, {'count...",,False,4.22,B0064W65SO,...,,2005.0,https://www.goodreads.com/book/show/59980.Batman,https://images.gr-assets.com/books/1327940389m...,59980,154013,2501570,Batman: Year One,Batman: Year One,"[lieutenant, take, new, post, crime, ride, cor..."
3,1421501686,2399,['205782'],US,eng,"[{'count': '9727', 'name': 'to-read'}, {'count...",,False,4.42,,...,,2005.0,https://www.goodreads.com/book/show/13615.Deat...,https://images.gr-assets.com/books/1419952134m...,13615,142755,1782155,"Death Note, Vol. 1: Boredom (Death Note, #1)","Death Note, Vol. 1: Boredom (Death Note, #1)","[light, ace, student, great, prospect, bore, m..."
4,1607066017,8700,['736247'],US,eng,"[{'count': '78762', 'name': 'to-read'}, {'coun...",,False,4.24,B015XEABR4,...,,2012.0,https://www.goodreads.com/book/show/15704307-s...,https://images.gr-assets.com/books/1486028947m...,15704307,142640,19113524,"Saga, Vol. 1 (Saga, #1)","Saga, Vol. 1 (Saga, #1)","[two, soldier, opposite, side, never, end, gal..."


In [28]:
book_df.to_csv('./data/book_p.csv', index = False)