## Amazon Reviews Data Preprocessing  

### Imports

In [147]:
import numpy as np
import pandas as pd
import nltk, re
from bs4 import BeautifulSoup as bs
import requests
import string

### The Dataset

In [148]:
# Load the reviews CSV and rename its two columns to 'num' and 'reviews'
review_data = pd.read_csv('review_data.csv').set_axis(['num', 'reviews'], axis=1)

In [149]:
# Discard the 'num' column, then drop every row that still holds a null value
review_data.drop(columns='num', inplace=True)
review_data.dropna(inplace=True)

In [150]:
# Show the (rows, columns) shape of the dataframe
review_data.shape

(477, 1)

In [151]:
# Display the dataframe
review_data

Unnamed: 0,reviews
0,Let me start by saying I’ve got a Master’s de...
1,These are awesome! I should’ve bought the smal...
2,I wanted to mount a Sonos speaker in my bathro...
3,I love these floating bookshelves. They give a...
4,"This is the coolest thing I’ve ever seen , and..."
...,...
474,"This is the coolest thing I’ve ever seen , and..."
475,I know that this is labeled a bookshelf but I ...
476,Really wanted to give 5 stars Because i am in ...
477,Been needing more book storage and I wanted a ...


### Data Preprocessing
#### Remove Punctuation

In [152]:
# defining the function to remove punctuation
def remove_punctuation(text):
    """Return ``text`` with all ASCII punctuation characters removed.

    Only the characters listed in ``string.punctuation`` are stripped, so
    typographic marks such as the curly apostrophe (’) pass through unchanged.
    """
    # Translation table that maps every punctuation character to "delete".
    strip_table = str.maketrans("", "", string.punctuation)
    return text.translate(strip_table)

# Store the punctuation-free version of each review in a new column
review_data['clean_data'] = review_data['reviews'].apply(remove_punctuation)
review_data.head()

Unnamed: 0,reviews,clean_data
0,Let me start by saying I’ve got a Master’s de...,Let me start by saying I’ve got a Master’s de...
1,These are awesome! I should’ve bought the smal...,These are awesome I should’ve bought the small...
2,I wanted to mount a Sonos speaker in my bathro...,I wanted to mount a Sonos speaker in my bathro...
3,I love these floating bookshelves. They give a...,I love these floating bookshelves They give a ...
4,"This is the coolest thing I’ve ever seen , and...",This is the coolest thing I’ve ever seen and ...


#### Convert text into lowercase

In [153]:
# Lower-case the punctuation-free text
review_data['review_lower'] = review_data['clean_data'].str.lower()
review_data.head()

Unnamed: 0,reviews,clean_data,review_lower
0,Let me start by saying I’ve got a Master’s de...,Let me start by saying I’ve got a Master’s de...,let me start by saying i’ve got a master’s de...
1,These are awesome! I should’ve bought the smal...,These are awesome I should’ve bought the small...,these are awesome i should’ve bought the small...
2,I wanted to mount a Sonos speaker in my bathro...,I wanted to mount a Sonos speaker in my bathro...,i wanted to mount a sonos speaker in my bathro...
3,I love these floating bookshelves. They give a...,I love these floating bookshelves They give a ...,i love these floating bookshelves they give a ...
4,"This is the coolest thing I’ve ever seen , and...",This is the coolest thing I’ve ever seen and ...,this is the coolest thing i’ve ever seen and ...


#### Tokenization

In [154]:
# Split each lower-cased review into word tokens with NLTK
review_data['tokenied'] = review_data['review_lower'].apply(nltk.word_tokenize)
review_data.head()

Unnamed: 0,reviews,clean_data,review_lower,tokenied
0,Let me start by saying I’ve got a Master’s de...,Let me start by saying I’ve got a Master’s de...,let me start by saying i’ve got a master’s de...,"[let, me, start, by, saying, i, ’, ve, got, a,..."
1,These are awesome! I should’ve bought the smal...,These are awesome I should’ve bought the small...,these are awesome i should’ve bought the small...,"[these, are, awesome, i, should, ’, ve, bought..."
2,I wanted to mount a Sonos speaker in my bathro...,I wanted to mount a Sonos speaker in my bathro...,i wanted to mount a sonos speaker in my bathro...,"[i, wanted, to, mount, a, sonos, speaker, in, ..."
3,I love these floating bookshelves. They give a...,I love these floating bookshelves They give a ...,i love these floating bookshelves they give a ...,"[i, love, these, floating, bookshelves, they, ..."
4,"This is the coolest thing I’ve ever seen , and...",This is the coolest thing I’ve ever seen and ...,this is the coolest thing i’ve ever seen and ...,"[this, is, the, coolest, thing, i, ’, ve, ever..."


#### Remove Stopwords

In [155]:
# English stopword list shipped with NLTK; preview its first ten entries
stopwords = nltk.corpus.stopwords.words('english')
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [156]:
# defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not stopwords.

    ``text`` is an iterable of tokens. Each token is checked against the
    module-level ``stopwords`` collection, and the surviving tokens are
    returned as a list in their original order.
    """
    def _is_content_word(token):
        return token not in stopwords

    return list(filter(_is_content_word, text))

# Filter the stopwords out of every tokenized review
review_data['no_stopwords'] = review_data['tokenied'].apply(remove_stopwords)
review_data.head()

Unnamed: 0,reviews,clean_data,review_lower,tokenied,no_stopwords
0,Let me start by saying I’ve got a Master’s de...,Let me start by saying I’ve got a Master’s de...,let me start by saying i’ve got a master’s de...,"[let, me, start, by, saying, i, ’, ve, got, a,...","[let, start, saying, ’, got, master, ’, degree..."
1,These are awesome! I should’ve bought the smal...,These are awesome I should’ve bought the small...,these are awesome i should’ve bought the small...,"[these, are, awesome, i, should, ’, ve, bought...","[awesome, ’, bought, small, ’, many, large, bo..."
2,I wanted to mount a Sonos speaker in my bathro...,I wanted to mount a Sonos speaker in my bathro...,i wanted to mount a sonos speaker in my bathro...,"[i, wanted, to, mount, a, sonos, speaker, in, ...","[wanted, mount, sonos, speaker, bathroom, dril..."
3,I love these floating bookshelves. They give a...,I love these floating bookshelves They give a ...,i love these floating bookshelves they give a ...,"[i, love, these, floating, bookshelves, they, ...","[love, floating, bookshelves, give, magical, f..."
4,"This is the coolest thing I’ve ever seen , and...",This is the coolest thing I’ve ever seen and ...,this is the coolest thing i’ve ever seen and ...,"[this, is, the, coolest, thing, i, ’, ve, ever...","[coolest, thing, ’, ever, seen, bought, many, ..."


In [157]:
# Removing '’' symbol from tokens
def removing(row):
    """Return a new list with every "’" token removed from ``row``.

    Args:
        row: Iterable of tokens (one review's token list).

    Returns:
        A list of the tokens that are not exactly the curly apostrophe
        "’", in their original order.
    """
    # Filter in a single pass. The result depends only on ``row``, so the
    # function also works on an empty token list and needs no dataframe.
    return [token for token in row if token != "’"]
# Run removing() over each row's stopword-free token list and store the result
review_data['review_preprocessed'] = review_data['no_stopwords'].apply(removing)
review_data.head()

Unnamed: 0,reviews,clean_data,review_lower,tokenied,no_stopwords,review_preprocessed
0,Let me start by saying I’ve got a Master’s de...,Let me start by saying I’ve got a Master’s de...,let me start by saying i’ve got a master’s de...,"[let, me, start, by, saying, i, ’, ve, got, a,...","[let, start, saying, ’, got, master, ’, degree...","[let, start, saying, got, master, degree, engl..."
1,These are awesome! I should’ve bought the smal...,These are awesome I should’ve bought the small...,these are awesome i should’ve bought the small...,"[these, are, awesome, i, should, ’, ve, bought...","[awesome, ’, bought, small, ’, many, large, bo...","[awesome, bought, small, many, large, books, u..."
2,I wanted to mount a Sonos speaker in my bathro...,I wanted to mount a Sonos speaker in my bathro...,i wanted to mount a sonos speaker in my bathro...,"[i, wanted, to, mount, a, sonos, speaker, in, ...","[wanted, mount, sonos, speaker, bathroom, dril...","[wanted, mount, sonos, speaker, bathroom, dril..."
3,I love these floating bookshelves. They give a...,I love these floating bookshelves They give a ...,i love these floating bookshelves they give a ...,"[i, love, these, floating, bookshelves, they, ...","[love, floating, bookshelves, give, magical, f...","[love, floating, bookshelves, give, magical, f..."
4,"This is the coolest thing I’ve ever seen , and...",This is the coolest thing I’ve ever seen and ...,this is the coolest thing i’ve ever seen and ...,"[this, is, the, coolest, thing, i, ’, ve, ever...","[coolest, thing, ’, ever, seen, bought, many, ...","[coolest, thing, ever, seen, bought, many, sma..."


In [163]:
# Keep only the preprocessed token lists (a single column, i.e. a Series)
df = review_data['review_preprocessed']

In [172]:
def onlylatter(location):
    """Return the alphabetic words found in ``location`` as one string.

    Args:
        location: Any object; it is converted with ``str()``, so a list of
            tokens is accepted as well as a plain string.

    Returns:
        The runs of A-Z/a-z letters from the string form of ``location``,
        joined by single spaces. Every other character acts as a separator.
        An input without letters yields the empty string.
    """
    letters_only = re.sub(r"[^a-zA-Z]", " ", str(location))
    # Split on whitespace before joining: joining the string directly would
    # iterate over its characters and put a space between every letter.
    return " ".join(letters_only.split())
# Run the letters-only cleanup over every preprocessed review
df = df.apply(onlylatter)