# Dynamic Aspect Extraction for `camera` Reviews Part B

Han, Kehang (hkh12@mit.edu)

As a follow-up demonstration, this notebook focuses on extracting aspects from the `AmazonReviews` dataset, which contains many more reviews of cameras.

## Set up

In [1]:
import json
import nltk
import string
import sys
sys.path.insert(0,'../')

from utilities import Product, AspectPattern

## s1: load raw data from `AmazonReviews` datasets

In [2]:
# Identifier of the product whose reviews are analysed; it is also reused
# later to name the output files.
product_name = 'B00AW2P98E'

# Reviews for a product live in a JSON file named after the product.
reviewJsonFile = product_name + '.json'
reviews_path = '../data/trainingFiles/AmazonReviews/cameras/' + reviewJsonFile

product = Product(name=product_name)
product.loadReviewsFromJsonFile(reviews_path)

## s2: define aspect patterns

In [3]:
# Collection of patterns that sentences are matched against to pull out
# candidate aspects.
aspectPatterns = []

# Pattern 1: a <JJ> tag followed by an <NN.?> tag (presumably adjective + noun
# in an NLTK-style chunk grammar -- confirm against AspectPattern).
# aspectTagIndices presumably selects which matched tokens form the aspect;
# here only the token at index 1.
pattern_name = 'adj_nn'
pattern_structure ="""
adj_nn:{<JJ><NN.?>}
"""
aspectTagIndices = [1]
# Pass pattern_name instead of repeating the literal so the variable and the
# pattern's actual name cannot drift apart.
aspectPattern = AspectPattern(name=pattern_name, structure=pattern_structure, aspectTagIndices=aspectTagIndices)
aspectPatterns.append(aspectPattern)

# Pattern 2: two consecutive <NN.?> tags; both tokens (indices 0 and 1) are
# listed in aspectTagIndices.
pattern_name = 'nn_nn'
pattern_structure ="""
nn_nn:{<NN.?><NN.?>}
"""
aspectTagIndices = [0,1]
aspectPattern = AspectPattern(name=pattern_name, structure=pattern_structure, aspectTagIndices=aspectTagIndices)
aspectPatterns.append(aspectPattern)

## s3: match sentence to pattern to extract aspects

In [4]:
# Walk every sentence of every review in order; each one is POS-tagged first
# and then matched against the aspect patterns defined above.
all_sentences = (
    sentence
    for review in product.reviews
    for sentence in review.sentences
)
for sentence in all_sentences:
    sentence.pos_tag()
    sentence.matchDaynamicAspectPatterns(aspectPatterns)

## s4: statistical analysis on aspects extracted across all reviews

In [5]:
# Count how many times each extracted aspect occurs across all sentences of
# all reviews. Counter replaces the hand-written "if key in dict" tally; the
# result is converted back to a plain dict so later cells see the same type.
from collections import Counter

word_dict = dict(Counter(
    aspect
    for review in product.reviews
    for sentence in review.sentences
    for aspect in sentence.dynamic_aspects
))

In [6]:
# Rank (aspect, count) pairs from most to least frequent; the sort is stable,
# so aspects with equal counts keep the dict's iteration order.
word_sorted = sorted(word_dict.items(), key=lambda pair: pair[1], reverse=True)
# Show the 15 most frequent aspects.
word_sorted[:15]

[(u'camera', 15),
 (u'pictures', 12),
 (u'cameras', 7),
 (u'screen', 7),
 (u'size', 5),
 (u'zoom', 4),
 (u'feature', 3),
 (u'color', 3),
 (u'party', 3),
 (u'shots', 3),
 (u'price', 3),
 (u'friday', 3),
 (u'features', 3),
 (u'point', 3),
 (u'charger', 3)]

## s5: save most frequent dynamic aspects

In [7]:
import json
# Write the 15 most frequent aspects to disk as JSON. The context manager
# guarantees the file is closed even if serialisation raises.
with open('../data/word_list/{0}_wordlist.txt'.format(product_name), 'w') as word_output:
    json.dump(word_sorted[:15], word_output)

## s6: stemming analysis

In [8]:
from nltk.stem import SnowballStemmer
# Snowball stemmer configured for English; used below to merge aspect words
# that share the same stem.
stemmer = SnowballStemmer('english')

In [9]:
# Merge the per-word counts of all words that reduce to the same stem.
stemmedWord_dict = {}
for surface_form, occurrences in word_dict.items():
    stem = stemmer.stem(surface_form)
    # Start from 0 the first time a stem is seen, then accumulate.
    stemmedWord_dict[stem] = stemmedWord_dict.get(stem, 0) + occurrences

In [10]:
# Rank (stem, count) pairs from most to least frequent; the sort is stable,
# so stems with equal counts keep the dict's iteration order.
stemmedWord_sorted = sorted(stemmedWord_dict.items(), key=lambda pair: pair[1], reverse=True)
# Show the 15 most frequent stems.
stemmedWord_sorted[:15]

[(u'camera', 22),
 (u'pictur', 13),
 (u'screen', 7),
 (u'featur', 6),
 (u'size', 5),
 (u'shot', 4),
 (u'zoom', 4),
 (u'photo', 3),
 (u'button', 3),
 (u'coolpix s6400', 3),
 (u'friday', 3),
 (u'angl', 3),
 (u'point', 3),
 (u'color', 3),
 (u'batteri', 3)]

In [11]:
# Save the 15 most frequent stemmed words to disk as JSON. The context
# manager guarantees the file is closed even if serialisation raises.
with open('../data/word_list/{0}_stemmedwordlist.txt'.format(product_name), 'w') as stemmedWord_output:
    json.dump(stemmedWord_sorted[:15], stemmedWord_output)