# Data Processing
<br>


### Import libraries

In [1]:
# nltk
from nltk.corpus import stopwords
from nltk import RegexpTokenizer, PorterStemmer, WordNetLemmatizer, FreqDist

# utilities
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# preferences
pd.set_option('display.max_columns', None)
%matplotlib inline


### Read in dataset and display column and datatype info

In [2]:
# read amazon tsv dataset into pandas df 
cols = ['marketplace',       #<---2 letter country code of review marketplace
        'customer_id',       #<---random identifier to aggregate reviews by single author
        'review_id',         #<---unique ID of review
        'product_id',        #<---unique ID of product to which review pertains
        'product_parent',    #<---random identifier to aggregate reviews for same product
        'product_title',     #<---product title
        'product_category',  #<---product category to group dataset into coherent parts 
        'star_rating',       #<---1-5 star rating of product
        'helpful_votes',     #<---number of helpful votes review received
        'total_votes',       #<---total number of votes review received
        'vine',              #<---review part of Vine program
        'verified_purchase', #<---review of verified purchase
        'review_headline',   #<---review title
        'review_body',       #<---review text
        'review_date']       #<---review date

filename = 'amazon_reviews_us_Luggage_v1_00'
df = pd.read_csv(f'data/{filename}.tsv',
                 sep='\t',
                 usecols = cols)

In [3]:
# display column and datatype info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348474 entries, 0 to 348473
Data columns (total 15 columns):
marketplace          348474 non-null object
customer_id          348474 non-null int64
review_id            348474 non-null object
product_id           348474 non-null object
product_parent       348474 non-null int64
product_title        348474 non-null object
product_category     348474 non-null object
star_rating          348473 non-null float64
helpful_votes        348473 non-null float64
total_votes          348473 non-null float64
vine                 348473 non-null object
verified_purchase    348473 non-null object
review_headline      348468 non-null object
review_body          348452 non-null object
review_date          348472 non-null object
dtypes: float64(3), int64(2), object(10)
memory usage: 39.9+ MB



### Preview dataframe

In [4]:
df.head(3)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,40884699,R9CO86UUJCAW5,B00VGTN02Y,786681372,Teenage Mutant Ninja Turtle Boys' Teenage Muta...,Luggage,3.0,0.0,0.0,N,Y,my review of this product was in error. It ...,my review of this product was in error. It was...,2015-08-31
1,US,23208852,R3PR8X6QGVJ8B1,B005KIWL0E,618251799,"Kenneth Cole Reaction Out of Bounds 20"" 4 Whe...",Luggage,5.0,0.0,0.0,N,Y,Five Stars,Perfect size.,2015-08-31
2,US,17100246,R39BO2819ABUPF,B007UNSHJ6,810480328,American Tourister Luggage AT Pop 3 Piece Spin...,Luggage,4.0,0.0,0.0,N,Y,so good.,"So far, so good.",2015-08-31


### Drop junk data, set review date to datetime object, and define target label as helpful/unhelpful ratio

In [5]:
# drop reviews with null data, fewer than 10 votes, or duplicate text
df.dropna(inplace=True)
df = df.loc[df.helpful_votes > 10]
df.drop_duplicates(subset=['review_body'], inplace=True)

df.reset_index(inplace=True, drop=True)

In [6]:
# set review date to datetime object
df.review_date = pd.to_datetime(df.review_date)

In [7]:
# define median ratio of helpful votes to total votes as binary target (above/below)
df['HELP'] = df.helpful_votes / df.total_votes
df['TARGET'] = np.where(df.HELP > df.HELP.quantile(q=.5), 1, 0)


### Tokenize text, drop stopwords, and normalize vocabulary for TF-IDF

In [8]:
# tokenize text, define stopwords list and remove from tokens
df['TOKEN'] = df.review_body.apply(RegexpTokenizer(r'[a-zA-Z0-9]+').tokenize)

stopwords_list = stopwords.words('english')
stopwords_list += list(string.punctuation)

df['STOP'] = df.TOKEN.apply(lambda x: [word.lower() for word in x if word.lower() not in stopwords_list])

In [9]:
# lemmitize & stemmitize stopped tokens to normalize vocabulary
df['LEM'] = df.STOP.apply(lambda x: [WordNetLemmatizer().lemmatize(word) for word in x])
df['STEM'] = df.STOP.apply(lambda x: [PorterStemmer().stem(word) for word in x])


### Extract textual and con-textual features

In [10]:
# textual features
df['CHAR'] = df.review_body.apply(lambda x: len(x))                #<---number of characters
df['NUM'] = df.STOP.apply(lambda x: len(x))                        #<---number of tokens
df['WORD'] = df.review_body.apply(lambda x: len(x.split(' ')))     #<---number of words
df['LEN'] = df.CHAR / df.WORD                                      #<---average sentence length
df['SENT'] = df.review_body.apply(lambda x: len(x.split('. ')))    #<---number of sentences
df['AVG'] = df.WORD / df.SENT                                      #<---sentence length
df['INTERRO'] = df.review_body.apply(lambda x: len(x.split('? '))) #<---number of questionss
df['PER'] = df.INTERRO / df.SENT                                   #<---percent questions
df['EXCLAM'] = df.review_body.apply(lambda x: len(x.split('! ')))  #<---number of exclamations
df['COUNT'] = df.review_body.str.count('!')                        #<---number of exclamation points
df['CAPS'] = df.review_body.apply(
    lambda x: len([char for char in x if char.isupper()==True]) 
              / len(x))

In [11]:
# con-textual features
df['MED'] = df.product_id.apply(
    lambda x: df.groupby('product_id').star_rating.median()[x])    #<---product's median star rating
df['FAV'] = df.star_rating - df.MED                                #<---reviewer's rating vs product's median
df['POP'] = df.product_id.apply(
    lambda x: df.groupby('product_id').size()[x])                  #<---number of product's reviews


### Review dataframe

In [12]:
display(df.head(3))
df.info()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,HELP,TARGET,TOKEN,STOP,LEM,STEM,CHAR,NUM,WORD,LEN,SENT,AVG,INTERRO,PER,EXCLAM,COUNT,CAPS,MED,FAV,POP
0,US,20761040,R11IBSD5E6HPSD,B002B3FWXY,677901073,Travelon Anti-Theft Classic Messenger Bag,Luggage,1.0,29.0,31.0,N,Y,This bag was on my shoulder and it just fell t...,The strap broke!!! It was supposed to be anti...,2015-08-31,0.935484,0,"[The, strap, broke, It, was, supposed, to, be,...","[strap, broke, supposed, anti, theft, strap, b...","[strap, broke, supposed, anti, theft, strap, b...","[strap, broke, suppos, anti, theft, strap, bro...",318,25,63,5.047619,6,10.5,1,0.166667,2,3,0.031447,4.0,-3.0,15
1,US,23857312,R3NPROA23JJRFF,B00V6FKB5M,909535974,MOIERG Vintage Trolley Luggage 2tone TSA,Luggage,5.0,11.0,15.0,N,Y,This product is absolutely BEAUTIFUL. I ordere...,This product is absolutely BEAUTIFUL. I order...,2015-08-31,0.733333,0,"[This, product, is, absolutely, BEAUTIFUL, I, ...","[product, absolutely, beautiful, ordered, larg...","[product, absolutely, beautiful, ordered, larg...","[product, absolut, beauti, order, larg, seem, ...",437,39,80,5.4625,9,8.888889,1,0.111111,1,2,0.048055,5.0,0.0,1
2,US,12318409,R2KVWAYBPWK1OV,B011KEPZG8,919734058,Iblue Canvas Leather Weekend Shoulder Duffels ...,Luggage,5.0,20.0,22.0,N,N,My boyfriend wouldn't be without this for travel!,This review is for the Iblue Oversized Leather...,2015-08-31,0.909091,0,"[This, review, is, for, the, Iblue, Oversized,...","[review, iblue, oversized, leather, canvas, ca...","[review, iblue, oversized, leather, canvas, ca...","[review, iblu, overs, leather, canva, casual, ...",1951,193,351,5.558405,15,23.4,1,0.066667,6,9,0.033316,5.0,0.0,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12132 entries, 0 to 12131
Data columns (total 35 columns):
marketplace          12132 non-null object
customer_id          12132 non-null int64
review_id            12132 non-null object
product_id           12132 non-null object
product_parent       12132 non-null int64
product_title        12132 non-null object
product_category     12132 non-null object
star_rating          12132 non-null float64
helpful_votes        12132 non-null float64
total_votes          12132 non-null float64
vine                 12132 non-null object
verified_purchase    12132 non-null object
review_headline      12132 non-null object
review_body          12132 non-null object
review_date          12132 non-null datetime64[ns]
HELP                 12132 non-null float64
TARGET               12132 non-null int64
TOKEN                12132 non-null object
STOP                 12132 non-null object
LEM                  12132 non-null object
STEM                 1


### Write dataframe to csv

In [13]:
df.to_csv(f'data/{filename}_processed.csv', index=0)