In [1]:
import pandas as pd
import numpy as np
import gzip

import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Column names of the raw review records
ITEM_ID = 'asin'
USER_ID = 'reviewerID'
RATING = 'overall'
REVIEW_TEXT = 'reviewText'
TIMESTAMP = 'unixReviewTime'
# Not referenced in the cells visible here -- presumably a keyphrase-vector column name; TODO confirm
KEYPHRASE = 'keyVector'
# Name of the 0/1 column that filter_dataset adds (1 when the rating exceeds its threshold)
BINARY_RATING = 'Binary'
# Not referenced in the cells visible here -- presumably how many keyphrases to keep; TODO confirm
NUM_KEYPHRASE = 100
# Location of the input file; the three parts are concatenated into one path when loading
DATA_PATH = '../NCE_Projected_LRec/data/amazon/raw/'
DATA_NAME = 'reviews_CDs_and_Vinyl_5'
DATA_EXTENSION = '.json.gz'

# Load Dataset

In [3]:
def parse(path):
  """Yield one record per line of a gzip-compressed file.

  Args:
    path: Path of the gzip file to read. It is opened in binary mode.

  Yields:
    The object obtained by evaluating each line as a Python expression.
  """
  # The context manager closes the gzip handle once the generator is
  # exhausted, closed early, or garbage-collected (the handle used to leak).
  with gzip.open(path, 'rb') as g:
    for l in g:
      # SECURITY: eval() executes whatever expression the line contains, so
      # only run this on dump files from a trusted source. If every line is a
      # plain literal, ast.literal_eval would be a safer drop-in; it is not
      # swapped in here to keep the parsing behavior unchanged.
      yield eval(l)

def getDF(path):
  """Load every record produced by parse(path) into a DataFrame.

  Records are numbered 0, 1, 2, ... in reading order and that number
  becomes the row label of the resulting frame.
  """
  numbered_records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(numbered_records, orient='index')

In [4]:
# Load the reviews, keep only the columns used below, and cast the
# text and id columns to str in a single step.
df = (
    getDF(DATA_PATH + DATA_NAME + DATA_EXTENSION)
    [[USER_ID, ITEM_ID, RATING, TIMESTAMP, REVIEW_TEXT]]
    .astype({REVIEW_TEXT: 'str', USER_ID: 'str', ITEM_ID: 'str'})
)

In [5]:
def _sample_ids(ids, ratio):
    """Return int(ratio*len(ids)) distinct entries of `ids`, drawn with NumPy's global RNG."""
    if len(ids) == 0:
        return ids
    # Never ask for more ids than exist: sampling without replacement cannot oversample.
    sample_size = min(len(ids), int(ratio*len(ids)))
    # replace=False: the default (with replacement) draws duplicate indices, so
    # fewer distinct ids than requested would survive the isin() filter below.
    return np.take(ids, indices=np.random.choice(len(ids), sample_size, replace=False))


def filter_dataset(df, threshold=3, popularity=True, filter_by_review_count=True,
                   user_review_threshold=10, item_review_threshold=10,
                   num_user=None, num_item=None, user_ratio=0.25, item_ratio=0.2):
    """Keep the positively rated reviews of a selected subset of users and items.

    A review is positive when its RATING is strictly greater than `threshold`.
    Users and items are then selected in one of three ways:

    * popularity=True, filter_by_review_count=True: keep users/items with at
      least `user_review_threshold` / `item_review_threshold` positive reviews.
    * popularity=True, filter_by_review_count=False: keep the `num_user` users
      and `num_item` items with the most positive reviews.
    * popularity=False: keep a random `user_ratio` / `item_ratio` fraction of
      the users/items (fixed seed 8292, distinct ids).

    Only the parameters of the chosen mode are read; the others may be None.

    Args:
        df: Review frame with at least the USER_ID, ITEM_ID and RATING columns.

    Returns:
        The positive reviews whose user AND item were both selected.

    Side effects:
        Adds the BINARY_RATING column to the `df` passed in, seeds NumPy's
        global RNG in sampling mode, and prints summary counts.
    """
    # Binarize rating. This is written onto the caller's frame (not a copy):
    # a later cell selects rows from `df` and its output shows this column.
    df[BINARY_RATING] = (df[RATING] > threshold)*1

    # Filter dataset only based on positive ratings
    positive_df = df[df[BINARY_RATING] == 1]

    print("The total number of users is {}".format(positive_df[USER_ID].nunique()))
    print("The total number of items is {} \n".format(positive_df[ITEM_ID].nunique()))

    # Positive-review counts per id, computed once each. value_counts() sorts
    # by descending count, which the top-N branch below relies on.
    item_counts = positive_df[ITEM_ID].value_counts()
    user_counts = positive_df[USER_ID].value_counts()

    if popularity:
        print("Filter dataset by popularity. \n")

        if filter_by_review_count:
            print("Filter dataset by review count. \n")
            filtered_item_id = item_counts[item_counts >= item_review_threshold].index.values
            filtered_user_id = user_counts[user_counts >= user_review_threshold].index.values

        else:
            print("Filter dataset by user and item number. \n")
            filtered_item_id = item_counts.index.values[:num_item]
            filtered_user_id = user_counts.index.values[:num_user]

    else:
        print("Filter dataset by sampling. \n")
        np.random.seed(8292)

        # Items are drawn before users so the RNG is consumed in a fixed order.
        filtered_item_id = _sample_ids(item_counts.index.values, item_ratio)
        filtered_user_id = _sample_ids(user_counts.index.values, user_ratio)

    selected = positive_df[USER_ID].isin(filtered_user_id) & positive_df[ITEM_ID].isin(filtered_item_id)
    result_df = positive_df.loc[selected]

    # These can be lower than the numbers requested: a selected user with no
    # review of a selected item drops out of the intersection (and vice versa).
    print("Number of User: {}".format(result_df[USER_ID].nunique()))
    print("Number of Item: {}".format(result_df[ITEM_ID].nunique()))

    return result_df

# Option 1: Filter by popularity, keeping the top-N most-reviewed users and items

In [41]:
# Top-N selection: only num_user / num_item are read in this mode, so the
# thresholds and ratios are passed as None.
filtered_df = filter_dataset(
    df,
    threshold=3,
    popularity=True,
    filter_by_review_count=False,
    user_review_threshold=None,
    item_review_threshold=None,
    num_user=40000,
    num_item=4400,
    user_ratio=None,
    item_ratio=None,
)

The total number of users is 75037
The total number of items is 64356 

Filter dataset by popularity. 

Filter dataset by user and item number. 

Number of User: 35745
Number of Item: 4400


# Option 2: Filter by popularity, keeping users and items with at least a minimum number of positive ratings

In [42]:
# filtered_df = filter_dataset(df, threshold=3, popularity=True, filter_by_review_count=True, 
#                              user_review_threshold=23, item_review_threshold=40, 
#                              num_user=None, num_item=None, user_ratio=None, item_ratio=None)

# Option 3: Randomly sample users and items from the dataset

In [43]:
# filtered_df = filter_dataset(df, threshold=3, popularity=False, filter_by_review_count=False, 
#                              user_review_threshold=None, item_review_threshold=None, 
#                              num_user=None, num_item=None, user_ratio=0.25, item_ratio=0.7)

# Analyze the filtered dataset

In [44]:
filtered_df.head()

Unnamed: 0,reviewerID,asin,overall,unixReviewTime,reviewText,Binary
228,A2K3LZPH3ND94V,780018664,4.0,1167868800,"This was basically a superior movie, still dar...",1
230,A1GSR7RGCG1QYZ,780018664,5.0,1249344000,"From the elongated opening scene of ""M"", you k...",1
232,A2GANR9I6XHTU9,780018664,4.0,1169337600,Filmed in 1931 this is the first serial killer...,1
233,ANCOMAI0I7LVG,780018664,5.0,1213574400,My feelings for this iconic piece of filmmakin...,1
234,A1GGOC9PVDXW7Z,780018664,5.0,1268524800,The title M comes from a chalk mark (for 'murd...,1


In [45]:
# Number of reviews per user in the filtered data (value_counts computed once)
user_review_counts = filtered_df[USER_ID].value_counts()
values, counts = user_review_counts.keys().tolist(), user_review_counts.tolist()
user_df = pd.DataFrame.from_dict({USER_ID: values, "count": counts})

# Number of reviews per item in the filtered data
item_review_counts = filtered_df[ITEM_ID].value_counts()
values, counts = item_review_counts.keys().tolist(), item_review_counts.tolist()
item_df = pd.DataFrame.from_dict({ITEM_ID: values, "count": counts})

In [46]:
item_df.tail()

Unnamed: 0,asin,count
4395,B007PKSN6Q,20
4396,B002K0WBM8,19
4397,B001RJ1XV8,16
4398,B000002O42,11
4399,B000006P8O,5


In [47]:
item_df["count"].mean()

60.93545454545455

In [48]:
user_df[user_df["count"] >= 10]

Unnamed: 0,count,reviewerID
0,590,A3KJ6JAZPH382D
1,498,A1GN8UJIZLCA59
2,472,A1J5KCZC8CMW9I
3,430,A2582KMXLK2P06
4,429,A3HU0B9XUEVHIM
5,405,A26NLSTT75FMJM
6,399,A34Y1FT0MTD7C9
7,395,ASVNSWIXBV72Q
8,373,A12R54MKO17TW0
9,342,A1QEWOSV05RYEO


In [13]:
# Ids of the users with at least 10 reviews in the filtered data
users = user_df.loc[user_df["count"] >= 10, USER_ID].values

In [50]:
# Keep only the rows written by the selected users and renumber them from 0
is_selected_user = filtered_df[USER_ID].isin(users)
pos_df = filtered_df[is_selected_user].reset_index(drop=True)
pos_df

# Export the filtered dataset

# pos_df.to_csv(DATA_PATH+DATA_NAME+".csv", header=False)

Unnamed: 0,reviewerID,asin,overall,unixReviewTime,reviewText,Binary
0,A2GANR9I6XHTU9,0780018664,4.0,1169337600,Filmed in 1931 this is the first serial killer...,1
1,ANCOMAI0I7LVG,0780018664,5.0,1213574400,My feelings for this iconic piece of filmmakin...,1
2,A1GGOC9PVDXW7Z,0780018664,5.0,1268524800,The title M comes from a chalk mark (for 'murd...,1
3,A62G4QX6XQVLP,0780018664,5.0,1165190400,This film is easily in the Top 5 of Fritz Lang...,1
4,A2ILOYARQVO4K1,0780018664,4.0,989193600,"Dark, disturbingly satirical, humorous exactly...",1
5,A2XKQHB8VCUCJK,0780018664,5.0,1385942400,&#34;M&#34; is a German expressionist film dir...,1
6,AIMR915K4YCN,0780018664,5.0,1083196800,"While watching this story unfold, I found myse...",1
7,A1GHUN5HXMHZ89,0780018664,5.0,1300406400,"I don't know if it is in writing somewhere, bu...",1
8,A2C7BOQVFH1HLE,0780018664,4.0,1302825600,This is the first Criterion release that I've ...,1
9,A25FDX17O3QKLT,0780018664,5.0,1115769600,This famous Fritz Lang classic is about a chil...,1


In [51]:
# Item id of every row of pos_df (one entry per review, so ids repeat)
items = pos_df[ITEM_ID].values

# Get the final DF with reviews

In [53]:
# Select every review in `df` whose user is in `users` and whose item is in
# `items`. The explicit .copy() gives cur_df its own data, so adding the
# "review" column to it afterwards no longer raises SettingWithCopyWarning.
cur_df = df.loc[(df[USER_ID].isin(users)) & (df[ITEM_ID].isin(items))].copy()

Unnamed: 0,reviewerID,asin,overall,unixReviewTime,reviewText,Binary
232,A2GANR9I6XHTU9,0780018664,4.0,1169337600,Filmed in 1931 this is the first serial killer...,1
233,ANCOMAI0I7LVG,0780018664,5.0,1213574400,My feelings for this iconic piece of filmmakin...,1
234,A1GGOC9PVDXW7Z,0780018664,5.0,1268524800,The title M comes from a chalk mark (for 'murd...,1
236,A62G4QX6XQVLP,0780018664,5.0,1165190400,This film is easily in the Top 5 of Fritz Lang...,1
237,A2ILOYARQVO4K1,0780018664,4.0,989193600,"Dark, disturbingly satirical, humorous exactly...",1
238,A2XKQHB8VCUCJK,0780018664,5.0,1385942400,&#34;M&#34; is a German expressionist film dir...,1
240,AIMR915K4YCN,0780018664,5.0,1083196800,"While watching this story unfold, I found myse...",1
244,A1GHUN5HXMHZ89,0780018664,5.0,1300406400,"I don't know if it is in writing somewhere, bu...",1
258,A2C7BOQVFH1HLE,0780018664,4.0,1302825600,This is the first Criterion release that I've ...,1
262,A25FDX17O3QKLT,0780018664,5.0,1115769600,This famous Fritz Lang classic is about a chil...,1


# Process review

In [54]:
def process_reviewText(df):
    """Add a "review" column holding the cleaned token list of each review.

    Each REVIEW_TEXT value is split with nltk.word_tokenize; every token is
    lowercased and stripped of string.punctuation characters; tokens that are
    not purely alphabetic are dropped; English stopwords are dropped.

    Stopwords are matched after punctuation has been stripped, so contraction
    pieces such as "n't" become "nt" and are kept.

    Args:
        df: DataFrame with a REVIEW_TEXT column of strings.

    Returns:
        The same DataFrame object; the "review" column is added in place.
    """
    # Built once and shared by every review.
    table = str.maketrans('', '', string.punctuation)
    en_stopwords = set(stopwords.words('english'))

    def clean(text):
        # One pass per review: lowercase -> strip punctuation -> keep
        # alphabetic tokens that are not stopwords.
        stripped = (w.lower().translate(table) for w in nltk.word_tokenize(text))
        return [w for w in stripped if w.isalpha() and w not in en_stopwords]

    df["review"] = df[REVIEW_TEXT].apply(clean)

    return df

In [55]:
cur_df = process_reviewText(cur_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

In [56]:
cur_df

Unnamed: 0,reviewerID,asin,overall,unixReviewTime,reviewText,Binary,review
232,A2GANR9I6XHTU9,0780018664,4.0,1169337600,Filmed in 1931 this is the first serial killer...,1,"[filmed, first, serial, killer, movie, story, ..."
233,ANCOMAI0I7LVG,0780018664,5.0,1213574400,My feelings for this iconic piece of filmmakin...,1,"[feelings, iconic, piece, filmmaking, mirror, ..."
234,A1GGOC9PVDXW7Z,0780018664,5.0,1268524800,The title M comes from a chalk mark (for 'murd...,1,"[title, comes, chalk, mark, murderer, placed, ..."
236,A62G4QX6XQVLP,0780018664,5.0,1165190400,This film is easily in the Top 5 of Fritz Lang...,1,"[film, easily, top, fritz, lang, best, films, ..."
237,A2ILOYARQVO4K1,0780018664,4.0,989193600,"Dark, disturbingly satirical, humorous exactly...",1,"[dark, disturbingly, satirical, humorous, exac..."
238,A2XKQHB8VCUCJK,0780018664,5.0,1385942400,&#34;M&#34; is a German expressionist film dir...,1,"[german, expressionist, film, directed, auteur..."
240,AIMR915K4YCN,0780018664,5.0,1083196800,"While watching this story unfold, I found myse...",1,"[watching, story, unfold, found, quite, roller..."
244,A1GHUN5HXMHZ89,0780018664,5.0,1300406400,"I don't know if it is in writing somewhere, bu...",1,"[nt, know, writing, somewhere, guess, would, i..."
258,A2C7BOQVFH1HLE,0780018664,4.0,1302825600,This is the first Criterion release that I've ...,1,"[first, criterion, release, ever, seen, say, q..."
262,A25FDX17O3QKLT,0780018664,5.0,1115769600,This famous Fritz Lang classic is about a chil...,1,"[famous, fritz, lang, classic, child, murderer..."


# Index the words (the tokens will not be changed any further)

In [64]:
# Fit the Keras tokenizer on the cleaned reviews (one list of tokens per review)
text = cur_df["review"].tolist()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)

In [65]:
# Two-column table of the tokenizer's vocabulary: one (word, index) pair per row
word_index_pairs = list(tokenizer.word_index.items())
df_word_index = pd.DataFrame(word_index_pairs, columns=['word', 'index'])

In [67]:
df_word_index

Unnamed: 0,word,index
0,album,1
1,quot,2
2,song,3
3,nt,4
4,one,5
5,like,6
6,songs,7
7,great,8
8,music,9
9,good,10


# Get Token List

In [70]:
token_list = cur_df["review"].tolist()

In [71]:
token_list[0]

['filmed',
 'first',
 'serial',
 'killer',
 'movie',
 'story',
 'serial',
 'killer',
 'terrorizes',
 'german',
 'town',
 'follow',
 'story',
 'get',
 'see',
 'city',
 'going',
 'mad',
 'hunt',
 'killer',
 'killer',
 'work',
 'eventually',
 'lorre',
 'fleeing',
 'life',
 'killer',
 'strikes',
 'hes',
 'always',
 'whistling',
 'haunting',
 'tune',
 'eerie',
 'blind',
 'man',
 'sells',
 'balloons',
 'ends',
 'using',
 'keen',
 'observances',
 'hearing',
 'whistling',
 'putting',
 'together',
 'eventually',
 'aiding',
 'capture',
 'bringing',
 'killer',
 'justicethe',
 'imagery',
 'haunting',
 'whistling',
 'creepy',
 'really',
 'grates',
 'nerves',
 'scariness',
 'dont',
 'see',
 'kind',
 'deal',
 'ive',
 'also',
 'heard',
 'film',
 'somewhat',
 'models',
 'story',
 'german',
 'serial',
 'killer',
 'peter',
 'kurten',
 'aka',
 'vampire',
 'dusseldorf',
 'commiting',
 'assaults',
 'murders',
 'children',
 'fritz',
 'denied',
 'movie',
 'anything',
 'kurten',
 'yet',
 'similiarity',
 'film'

In [73]:
cur_df[REVIEW_TEXT].tolist()[0]

'Filmed in 1931 this is the first serial killer movie...M is the story of a serial killer who terrorizes a German town. As you follow the story you get to see the city going mad, the hunt for the killer, the killer at work and then eventually Lorre fleeing for his life.. Before the killer strikes hes always whistling this haunting tune, very eerie. A blind man who sells balloons ends up using his keen observances and hearing of the whistling and putting 2 and 2 together...eventually aiding in his capture and bringing the killer to justice.The imagery is haunting,the whistling is creepy and really grates on your nerves, the scariness is more of a what you dont see kind of deal  ... Ive also have heard this film somewhat models the story of German serial killer Peter Kurten aka the Vampire of Dusseldorf who in 1929 was commiting assaults and murders of children. Fritz denied the movie having anything to do with Kurten yet there is a similiarity between the film and the murders that occur

In [81]:
token_list[4]

['dark',
 'disturbingly',
 'satirical',
 'humorous',
 'exactly',
 'would',
 'nt',
 'expect',
 'populated',
 'strangelooking',
 'people',
 'would',
 'result',
 'george',
 'grosz',
 'film',
 'director']

In [80]:
cur_df[REVIEW_TEXT].tolist()[4]

"Dark, disturbingly satirical, humorous exactly where you wouldn't expect it to be, and populated by strange-looking people, this would be the result if George Grosz had been a film director."