In [35]:
import re
import itertools
from collections import Counter
import pandas as pd

import nltk
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt')
# nltk.download()
from nltk.corpus import stopwords
from nltk.tag import PerceptronTagger
from nltk.data import find
from nltk.tokenize import RegexpTokenizer

from keras.preprocessing.text import one_hot
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer

import matplotlib.pyplot as plt
from tqdm import tqdm

## Constants

In [36]:
# Column names present in the raw review records.
userID = 'reviewerID'
itemID = 'asin'
reviewText = 'reviewText'
summary = 'summary'
helpful = 'helpful'
overall = 'overall'
rating = 'overall'  # same column as `overall`, under a second name

# Column names derived later from the `helpful` pair.
helpful_rating = 'helpful_rating'
total_helpful = 'total_helpful'

# Location of the compressed input file.
dataPath = 'data/ToysGames/'
dataName = 'reviews_Toys_and_Games_5.json.gz'

## Load Data

In [37]:
import gzip

def parse(path):
  """Yield one evaluated record per line of a gzip-compressed file.

  Args:
    path: Path of the gzip file to read; each line must be a Python
      expression.

  Yields:
    The object produced by evaluating each line, in file order.
  """
  # NOTE(review): eval() executes whatever expression a line contains. Only
  # use this on trusted files; a safe parser should be considered otherwise.
  # The context manager closes the file once iteration finishes (or the
  # generator is discarded); previously the handle was never closed.
  with gzip.open(path, 'rb') as g:
    for l in g:
      yield eval(l)

def getDF(path):
  """Load every record yielded by parse(path) into a DataFrame.

  Records are keyed by their position in the file (0, 1, 2, ...), and that
  position becomes the row index of the returned frame.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Load the reviews, keep only the columns used in this notebook, and store the
# review text, summary and helpful pair as strings.
raw_columns = [userID, itemID, helpful, reviewText, overall, summary]
df = getDF(dataPath + dataName)[raw_columns]
for text_column in (reviewText, summary, helpful):
    df[text_column] = df[text_column].astype('str')

In [38]:
# df[["reviewerID", "asin", "helpful", "reviewText", "overall", "summary"]].to_csv("data/ToysGames/AmazonToyGamesRawData.csv", sep='\t', encoding='utf-8', index=False)

In [39]:
# df = pd.read_csv(dataPath+dataName, sep='\t', encoding='utf-8')
# df.head()

# Sort by helpfulness

In [40]:
from ast import literal_eval

# `helpful` is stored as the string form of a two-element list; parse it back
# and split the two elements into their own columns.
helpful_pairs = df[helpful].apply(literal_eval)
df[helpful] = helpful_pairs
df[helpful_rating] = helpful_pairs.apply(lambda pair: pair[0])
df[total_helpful] = helpful_pairs.apply(lambda pair: pair[1])

In [41]:
# Preview the reviews with the largest helpful_rating (first element of the
# `helpful` pair), highest first; head() limits the display to five rows.
df.sort_values(by=[helpful_rating], ascending=False).head()

Unnamed: 0,reviewerID,asin,helpful,reviewText,overall,summary,helpful_rating,total_helpful
46315,A1OUQCTNVKPVR9,B0010VS078,"[1589, 1637]",I loaned my iPod to my kid and he broke it. T...,4.0,It's a great portable music solution,1589,1637
103098,A4LD7XC56J3ZV,B004Z7H07K,"[1431, 1502]",Hi! I am Erin T. and I run a website called th...,5.0,My Son Won't Put it Down,1431,1502
131030,A1SC7Z2646QCP9,B0089RPUHO,"[1413, 1449]",If you want a child-friendly tablet-style devi...,5.0,Hands down the best choice for a child-friendl...,1413,1449
80422,A3DZFEICHK5LF2,B003JQT4Y0,"[1378, 1393]","Short version:The good: The pen is amazing, a ...",3.0,Great product but a lot more parent involvement.,1378,1393
103019,A2DG63DN704LOI,B004Z7H07K,"[1291, 1359]",I really want to like the LeapPad - my kids do...,3.0,"Kids like it, but educational value is not as ...",1291,1359


In [42]:
# List the rows whose review text is null.
# NOTE(review): reviewText was cast with astype('str') when the data was
# loaded, so missing values may already have become plain strings and would
# not be reported here — confirm against the raw data.
df[df[reviewText].isnull()]

Unnamed: 0,reviewerID,asin,helpful,reviewText,overall,summary,helpful_rating,total_helpful


In [43]:
# df_groupby_asin = df.groupby(itemID).agg({reviewText:','.join, summary:','.join})

In [44]:
# df_groupby_asin.head()

In [45]:
# Tokenize Review
df["review_word_tokenized"] = df[reviewText].apply(lambda x: nltk.word_tokenize(x))

# Remove Stopwords
# Get english stopwords
en_stopwords = set(stopwords.words('english'))
df["review_word_tokenized"] = df["review_word_tokenized"].apply(lambda text: 
                                                                [w for w in text if w not in en_stopwords])

# Remove Punctuation
import string
table = str.maketrans('', '', string.punctuation)
df["review_word_tokenized"] = df["review_word_tokenized"].apply(lambda text: 
                                                                [w.translate(table) for w in text])

# Remove tokens that are not alphabetic
df["review_word_tokenized"] = df["review_word_tokenized"].apply(lambda text: 
                                                                [w for w in text if w.isalpha()])

# Lowercase
df["review_word_tokenized"] = df["review_word_tokenized"].apply(lambda text: 
                                                                [w.lower() for w in text])

# Lemmatizing
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
df["review_word_tokenized"] = df["review_word_tokenized"].apply(lambda text: 
                                                                [lemmatizer.lemmatize(w) for w in text])

## Vectorize

In [None]:
# Raw review strings, in row order; reused later to build the index vectors.
text = df[reviewText].tolist()

# Build the word index from the raw reviews.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)

In [None]:
# One row per vocabulary entry: the word and the integer the tokenizer gave it.
word_index_pairs = list(tokenizer.word_index.items())
df_word_index = pd.DataFrame(word_index_pairs, columns=['word', 'index'])

In [None]:
# Show the last rows of the word/index table.
df_word_index.tail()

In [None]:
# Tokenize every review and append a single-space token to each, so that the
# end of one review is marked in the flattened token stream.
token_list = [nltk.word_tokenize(review) + [' '] for review in df[reviewText]]

In [None]:
# Inspect the token list of the first review.
token_list[0]

# Filter ADJ/NN

In [None]:
#function to filter for ADJ/NN bigrams
def rightTypes(ngram):
    """Return True if the n-gram matches an adjective/noun POS pattern.

    Accepted tag patterns, by n-gram length:
      * 2 tokens:  adjective, noun
      * 3 tokens:  adjective, preposition, noun
      * 4+ tokens: adjective, preposition, adjective, noun (first four tokens)

    Args:
        ngram: A sequence (tuple or list) of token strings.

    Returns:
        bool: False when the n-gram contains the token '-pron-' or 't',
        contains a whitespace-only token, has fewer than two tokens, or its
        tags do not match the pattern for its length; True otherwise.
    """
    if '-pron-' in ngram or 't' in ngram:
        return False
    if any(word.isspace() for word in ngram):
        return False
    # No pattern exists for fewer than two tokens. Returning here also avoids
    # the IndexError the tag lookups used to raise on such input.
    if len(ngram) < 2:
        return False

    adjective_tags = ('JJ', 'JJR', 'JJS')
    preposition_tags = ('IN', 'TO')
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')

    if len(ngram) == 2:
        pattern = (adjective_tags, noun_tags)
    elif len(ngram) == 3:
        pattern = (adjective_tags, preposition_tags, noun_tags)
    else:
        pattern = (adjective_tags, preposition_tags, adjective_tags, noun_tags)

    tags = nltk.pos_tag(ngram)
    # zip() stops at the end of the pattern, so only the leading tokens that
    # the pattern describes are checked.
    return all(tag in allowed for (_, tag), allowed in zip(tags, pattern))

# Bigram

In [None]:
# Association measures (used later for PMI scoring) and a collocation finder
# built over the flattened token stream of all reviews.
bigrams = nltk.collocations.BigramAssocMeasures()
tokens = itertools.chain.from_iterable(token_list)
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(tokens)

# Raw bigram counts, most frequent first.
bigram_freq = bigramFinder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['ngram', 'freq'])
bigramFreqTable = bigramFreqTable.sort_values(by='freq', ascending=False)
# bigramFreqTable = bigramFreqTable[bigramFreqTable.ngram.map(lambda x: rightTypes(x))]

In [None]:
# Show the 50 most frequent bigrams (the table is sorted by 'freq', descending).
bigramFreqTable.head(50)

In [None]:
# trigrams = nltk.collocations.TrigramAssocMeasures()
# tokens = itertools.chain.from_iterable(token_list)
# trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(tokens)
# trigram_freq = trigramFinder.ngram_fd.items()

# trigramFreqTable = pd.DataFrame(list(trigram_freq), 
#                                 columns=['ngram','freq']).sort_values(by='freq', ascending=False)
# trigramFreqTable = trigramFreqTable[trigramFreqTable.ngram.map(lambda x: rightTypes(x))]

In [None]:
# trigramFreqTable.head(50)

In [None]:
# Scratch check: build a trigram finder over a three-word input and display
# the trigram counts it records.
finder = nltk.collocations.TrigramCollocationFinder.from_words(["what","the","hell"])
finder.ngram_fd.items()

In [None]:
# Discard bigrams seen fewer than 100 times, then score the rest by PMI.
bigramFinder.apply_freq_filter(100)
scored_bigrams = list(bigramFinder.score_ngrams(bigrams.pmi))
bigramPMITable = pd.DataFrame(scored_bigrams, columns=['bigram', 'PMI'])
bigramPMITable = bigramPMITable.sort_values(by='PMI', ascending=False)

# Keep only the bigrams accepted by the part-of-speech filter.
bigramPMITable = bigramPMITable[bigramPMITable.bigram.map(rightTypes)]
bigramPMITable

In [None]:
# bigramPMITable = bigramPMITable[bigramPMITable.bigram.map(lambda x: rightTypes(x))]
# bigramPMITable

In [None]:
# Attach the raw frequency to every bigram that survived the PMI/POS filter
# (right join keeps exactly the rows of bigramPMITable), highest PMI first.
bigram_freq_pmi = pd.merge(
    bigramFreqTable,
    bigramPMITable,
    how='right',
    left_on='ngram',
    right_on='bigram',
)
bigram_freq_pmi = bigram_freq_pmi.sort_values("PMI", ascending=False)
bigram_freq_pmi.head(50)

In [None]:
# Total raw frequency of the first 50 rows (the table is sorted by PMI, descending).
bigram_freq_pmi.head(50)['freq'].sum()

In [None]:
# Sum of the PMI scores of the first 50 rows (the table is sorted by PMI, descending).
bigram_freq_pmi.head(50)['PMI'].sum()

In [None]:
#df_keyphrases = pd.concat([bigramFreqTable[['bigram']].head(50), bigramPMITable[['bigram']].head(50)])
# Join the tokens of each of the first 50 bigrams into a space-separated
# phrase and keep the phrases both as a one-column frame and as a list.
top_bigrams = bigram_freq_pmi['bigram'].head(50)
df_keyphrases = (
    top_bigrams
    .apply(lambda x: ' '.join(x))
    .rename('Phrases')
    .to_frame()
    .reset_index(drop=True)
)
keyphrases = df_keyphrases['Phrases'].tolist()

In [None]:
# For every review, record the positions (within `keyphrases`) of the phrases
# that occur in its text as substrings. enumerate() supplies each position
# directly instead of re-scanning the list with keyphrases.index() per match.
df["keyphrases_indices"] = df[reviewText].apply(
    lambda x: [i for i, key in enumerate(keyphrases) if key in x])
# Number of matched keyphrases per review.
df['keyphrases_indices_length'] = df['keyphrases_indices'].str.len()
df

In [None]:
# Total number of keyphrase matches, then the average per review.
total_keyphrase_matches = df['keyphrases_indices_length'].sum()
print(total_keyphrase_matches)
print(total_keyphrase_matches / len(df))

In [None]:
# Histogram of the number of matched keyphrases per review.
df.hist(column='keyphrases_indices_length')

In [None]:
# This cell used to call tokenizer.fit_on_texts(x) once per review. That call
# returns None, so the column was filled with None, and because each review was
# passed as a single string it re-fitted the shared tokenizer on individual
# characters, changing the vocabulary after the word-index table had already
# been built from it. The column is kept (still all None) for compatibility
# with the saved data, but the tokenizer is no longer modified here.
df["tokenizer"] = None

In [None]:
# df[reviewText].apply(lambda x: [1. if key in x else 0 for key in keyphrases])

In [None]:
# df['keyVector'] = df[reviewText].apply(lambda x: [1. if key in x else 0 for key in keyphrases])

In [None]:
# df['keyIndices'] = df['keyVector'].apply(lambda vector: [i for i, x in enumerate(vector) if x == 1.])

In [None]:
# df['keyIndices'].str.len().sum()

In [None]:
# Replace each distinct user / item identifier by a consecutive integer label.
user_codes = range(0, df[userID].nunique())
item_codes = range(0, df[itemID].nunique())
df['UserIndex'] = df[userID].astype('category').cat.rename_categories(user_codes)
df['ItemIndex'] = df[itemID].astype('category').cat.rename_categories(item_codes)

# 1 when the rating is above 3, 0 otherwise.
df['Binary'] = (df[rating] > 3) * 1

In [None]:
# Lookup tables pairing each integer label with the original identifier
# (one row per review, so pairs repeat).
df_user_name = df.loc[:, ['UserIndex', userID]]
df_item_name = df.loc[:, ['ItemIndex', itemID]]

In [None]:
# df = df.drop([itemID, userID, reviewText], axis=1)

## DataFrames

In [None]:
# Preview the main review table with all derived columns.
df.head()

In [None]:
# Preview the word/index table.
df_word_index.head()

In [None]:
# Preview the selected keyphrases.
df_keyphrases.head()

In [None]:
# Preview the UserIndex / reviewer-identifier pairs.
df_user_name.head()

In [None]:
# Preview the ItemIndex / item-identifier pairs.
df_item_name.head()

## Statistics

In [None]:
# Headline counts: distinct users, distinct items, and reviews flagged positive.
n_users = df_user_name['UserIndex'].nunique()
n_items = df_item_name['ItemIndex'].nunique()
n_positive = sum(df['Binary'].tolist())

for message, value in (
    ("Number of User: {0}", n_users),
    ("Number of Item: {0}", n_items),
    ("Number of Positive Review: {0}", n_positive),
):
    print(message.format(value))

In [None]:
# Convert every raw review in `text` into a sequence of integers using the
# tokenizer, and store one sequence per row.
df['vector'] = tokenizer.texts_to_sequences(text)

In [None]:
# Length of each review's index sequence.
review_lengths = [len(sequence) for sequence in df.vector]

In [None]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
plt.hist(review_lengths, normed=True, bins=30)
plt.ylabel('Probability');

## Save

In [None]:
# Write every table produced above to a CSV file in the data directory.
outputs = [
    (df, 'Data.csv'),
    (df_word_index, 'WordIndex.csv'),
    (df_keyphrases, 'KeyPhrases.csv'),
    (df_user_name, 'UserIndex.csv'),
    (df_item_name, 'ItemIndex.csv'),
]
for frame, file_name in outputs:
    frame.to_csv(dataPath + file_name)