In [58]:
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from collections import Counter
import nltk
import re
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.tag import PerceptronTagger
from nltk.data import find
from tqdm import tqdm

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/wuga/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
df = pd.read_json('data/video/Amazon_Instant_Video_5.json', lines=True)

In [4]:
df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,B000H00VBQ,"[0, 0]",2,I had big expectations because I love English ...,"05 3, 2014",A11N155CW1UV02,AdrianaM,A little bit boring for me,1399075200
1,B000H00VBQ,"[0, 0]",5,I highly recommend this series. It is a must f...,"09 3, 2012",A3BC8O2KCL29V2,Carol T,Excellent Grown Up TV,1346630400
2,B000H00VBQ,"[0, 1]",1,This one is a real snoozer. Don't believe anyt...,"10 16, 2013",A60D5HQFOTSOM,"Daniel Cooper ""dancoopermedia""",Way too boring for me,1381881600
3,B000H00VBQ,"[0, 0]",4,Mysteries are interesting. The tension betwee...,"10 30, 2013",A1RJPIGRSNX4PW,"J. Kaplan ""JJ""",Robson Green is mesmerizing,1383091200
4,B000H00VBQ,"[1, 1]",5,"This show always is excellent, as far as briti...","02 11, 2009",A16XRPF40679KG,Michael Dobey,Robson green and great writing,1234310400


In [20]:
df['UserID'] = df.reviewerID.astype('category').cat.rename_categories(range(1, df.reviewerID.nunique()+1))

In [22]:
df['ItemID'] = df.asin.astype('category').cat.rename_categories(range(1, df.asin.nunique()+1))

In [25]:
df.drop(['asin', 'reviewerID'], axis=1)

Unnamed: 0,helpful,overall,reviewText,reviewTime,reviewerName,summary,unixReviewTime,UserID,ItemID
0,"[0, 0]",2,I had big expectations because I love English ...,"05 3, 2014",AdrianaM,A little bit boring for me,1399075200,76,1
1,"[0, 0]",5,I highly recommend this series. It is a must f...,"09 3, 2012",Carol T,Excellent Grown Up TV,1346630400,3098,1
2,"[0, 1]",1,This one is a real snoozer. Don't believe anyt...,"10 16, 2013","Daniel Cooper ""dancoopermedia""",Way too boring for me,1381881600,3958,1
3,"[0, 0]",4,Mysteries are interesting. The tension betwee...,"10 30, 2013","J. Kaplan ""JJ""",Robson Green is mesmerizing,1383091200,1060,1
4,"[1, 1]",5,"This show always is excellent, as far as briti...","02 11, 2009",Michael Dobey,Robson green and great writing,1234310400,254,1
5,"[12, 12]",5,I discovered this series quite by accident. Ha...,"10 11, 2011",Z Hayes,I purchased the series via streaming and loved...,1318291200,994,1
6,"[0, 0]",3,"It beats watching a blank screen. However, I j...","10 15, 2013","Jimmy C. Saunders ""Papa Smurf""",It takes up your time.,1381795200,984,2
7,"[0, 0]",3,"There are many episodes in this series, so I p...","12 29, 2013",JohnnyC,A reasonable way to kill a few minutes,1388275200,4877,2
8,"[0, 0]",5,This is the best of the best comedy Stand-up. ...,"02 26, 2014",Kansas,kansas001,1393372800,3724,2
9,"[0, 0]",3,Not bad. Didn't know any of the comedians but...,"04 2, 2014",Louis V. Borsellino,Entertaining Comedy,1396396800,4934,2


In [35]:
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""


In [48]:
tagger = PerceptronTagger()
pos_tag = tagger.tag
# Create phrase tree
chunker = nltk.RegexpParser(grammar)

In [61]:


# Noun Phrase Extraction Support Functions
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

# generator, generate leaves one by one
def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter = lambda t: t.label()=='NP' or t.label()=='JJ' or t.label()=='RB'):
        yield subtree.leaves()

# stemming, lematizing, lower case... 
def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
    word = word.lower()
#     word = stemmer.stem(word)
#     word = lemmatizer.lemmatize(word)
    return word

# stop-words and length control
def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword."""
    accepted = bool(2 <= len(word) <= 40
        and word.lower() not in stopwords)
    return accepted

# generator, create item once a time
def get_terms(tree):
    for leaf in leaves(tree):
        term = [normalise(w) for w,t in leaf if acceptable_word(w) ]
        # Phrase only
        if len(term)>1:
            yield term

In [62]:
# Flatten phrase lists to get tokens for analysis
def flatten(npTokenList):
    finalList =[]
    for phrase in npTokenList:
        token = ''
        for word in phrase:
            token += word + ' '
        finalList.append(token.rstrip())
    return finalList

In [65]:
# Revise the previous dataframe transform function...
def newDataFrameTransformation(reviewDF, k=50):
    reviews = reviewDF['reviewText'].values
    
    # Top-k frequent terms
    counter = Counter()
    for i, review in tqdm(enumerate(reviews)):
            counter.update(flatten([word
                            for word 
                            in get_terms(chunker.parse(pos_tag(re.findall(r'\w+', review)))) 
                            ]))
    topk = counter.most_common(k)        
    
    #Find out if a particular review has the word from topk list
    freqReview = []
    for i in tqdm(range(len(reviews))):
        tempCounter = Counter(flatten([word 
                                       for word 
                                       in get_terms(chunker.parse(pos_tag(re.findall(r'\w+',reviews[i]))))]))
        topkinReview = [1 if tempCounter[word] > 0 else 0 for (word,wordCount) in topk]
        freqReview.append(topkinReview)
        
        
    #Prepare freqReviewDf
    freqReviewDf = pd.DataFrame(freqReview)
    dfName = []
    for c in topk:
        dfName.append(c[0])
    freqReviewDf.columns = dfName
    finalreviewDf = reviewDF.join(freqReviewDf)
    return topk, finalreviewDf

In [66]:
new_df = newDataFrameTransformation(df)

37126it [03:39, 169.16it/s]
100%|██████████| 37126/37126 [03:33<00:00, 174.12it/s]


In [69]:
new_df[1].head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,UserID,...,new york,special features,many people,good movie,pilot episode,huge fan,walking dead,good guys,good thing,top notch
0,B000H00VBQ,"[0, 0]",2,I had big expectations because I love English ...,"05 3, 2014",A11N155CW1UV02,AdrianaM,A little bit boring for me,1399075200,76,...,0,0,0,0,0,0,0,0,0,0
1,B000H00VBQ,"[0, 0]",5,I highly recommend this series. It is a must f...,"09 3, 2012",A3BC8O2KCL29V2,Carol T,Excellent Grown Up TV,1346630400,3098,...,0,0,0,0,0,0,0,0,0,0
2,B000H00VBQ,"[0, 1]",1,This one is a real snoozer. Don't believe anyt...,"10 16, 2013",A60D5HQFOTSOM,"Daniel Cooper ""dancoopermedia""",Way too boring for me,1381881600,3958,...,0,0,0,0,0,0,0,0,0,0
3,B000H00VBQ,"[0, 0]",4,Mysteries are interesting. The tension betwee...,"10 30, 2013",A1RJPIGRSNX4PW,"J. Kaplan ""JJ""",Robson Green is mesmerizing,1383091200,1060,...,0,0,0,0,0,0,0,0,0,0
4,B000H00VBQ,"[1, 1]",5,"This show always is excellent, as far as briti...","02 11, 2009",A16XRPF40679KG,Michael Dobey,Robson green and great writing,1234310400,254,...,0,0,0,0,0,0,0,0,0,1


In [34]:


num_words = 500
top_words = [word for (word, freq) in word_counter.most_common(num_words)]
print(top_words)

['the', 'and', 'a', 'to', 'of', 'i', 'is', 'it', 'this', 'in', 'that', 's', 'for', 'but', 'with', 'show', 'as', 'on', 'was', 'you', 'are', 't', 'not', 'have', 'they', 'season', 'be', 'one', 'like', 'all', 'so', 'he', 'series', 'good', 'more', 'his', 'there', 'just', 'has', 'at', 'an', 'who', 'if', 'from', 'great', 'what', 'my', '34', 'or', 'can', 'some', 'out', 'about', 'by', 'very', 'really', 'watch', 'love', 'up', 'characters', 'will', 'her', 'we', 'well', 'would', 'story', 'me', 'their', 'movie', 'see', 'episode', 'when', 'get', 'time', 'she', 'film', 'much', 'first', 'do', 'how', 'watching', 'no', 'into', 'them', 'than', 'only', 'people', 'were', 'had', 'even', 'other', 'episodes', 'don', 'been', 'character', 'shows', 'which', 'too', 'new', 'also', 'think', 'because', 'way', 'two', 'these', 'make', 'interesting', 'most', 'its', 'better', 'then', 'could', 'after', 'm', 'know', 'tv', 'still', 'little', 'many', 'over', 'him', 'best', 'acting', 'now', 'being', 'did', 'am', 'while', 'go