In [43]:
from pymongo import MongoClient
from srs.database import connect_to_db
from srs.utilities import Sentence, tokenize
from nltk import pos_tag
from collections import Counter
import math

In [44]:
def get_category_dict():
    """Build a dictionary whose key is the category tuple, and the value is a list of product_ids:"""
    client, db = connect_to_db()
    cursor = db.product_collection.find()
    category_dict = {}
    i = 0
    for product in cursor:
        i += 1   
        if i % 10000 == 0:
            print i
        category = product['category']
        category_short = tuple(category[:3]) #generally category is 4-tuple. Now limit to the first three tuple
        product_id = product['product_id']

        if category_short not in category_dict:
            category_dict[category_short] = [product_id]
        else:
            category_dict[category_short].append(product_id)
    client.close()
    
    return category_dict

def show_category(category_dict, min_product_num):
    """Sort the categories according to the number of products in that category, and print them from top"""
    category_list_sorted = []
    category_list = []
    for key in category_dict:
        length = len(category_dict[key])
        category_list.append([length, key])
    category_list_sorted = sorted(category_list, key=lambda tup: tup[0], reverse=True)

    for category_data in category_list_sorted:
        if category_data[0] > min_product_num:
            print category_data

In [None]:
category_dict = get_category_dict()

In [42]:
# Shows all the main categories (up to 3rd level) and the number of product it contains:
min_product_num = 1000
show_categoty(category_dict, min_product_num)

[201512, (u'Cell Phones & Accessories', u'Cases', u'Basic Cases')]
[54232, (u'Electronics', u'Computers & Accessories', u'Laptop & Netbook Computer Accessories')]
[50234, (u'Electronics', u'Computers & Accessories', u'Cables & Accessories')]
[46040, (u'Electronics', u'Accessories & Supplies', u'Audio & Video Accessories')]
[40292, (u'Electronics', u'Camera & Photo', u'Accessories')]
[36801, (u'Electronics', u'Computers & Accessories', u'Touch Screen Tablet Accessories')]
[26273, (u'Electronics', u'Computers & Accessories', u'Computer Components')]
[25569, (u'Electronics', u'Portable Audio & Video', u'MP3 Players & Accessories')]
[25256, (u'Cell Phones & Accessories', u'Accessories', u'Accessory Kits')]
[16506, (u'Cell Phones & Accessories', u'Accessories', u'Chargers')]
[15195, (u'Cell Phones & Accessories', u'Accessories', u'Screen Protectors')]
[14752, (u'Electronics', u'Car & Vehicle Electronics', u'Car Electronics')]
[11840, (u'Electronics', u'Computers & Accessories', u'Data Stora

In [45]:
def get_sentence_from_category(category):
    """Obtain all the review sentences from a category tuple:"""
    client, db = connect_to_db()
    product_id_list = category_dict[category]
    sentence_list = []
    review_num = 0
    for product_id in product_id_list:
        query_res = list(db.product_collection.find({"product_id": product_id}))
        contents = query_res[0]["contents"]
        sentence_list += contents
        review_num += len(query_res[0]["review_ids"])
    print "Number of products {0}\nNumber of reviews: {1}\nNumber of sentences: {2}".format(len(product_id_list), review_num, len(sentence_list))
    client.close()
    return sentence_list


def get_tf_idf(sentence_list):
    """Get tf-idf score for each word
       The dictionary records for each word as a key, the [num_word, num_doc] value, where num_word means the number of 
       that word in the sentence_list, and num_doc means the number of sentences this word appears in.
    """
    word_statistics = {}
    i = 0
    print "Number of sentences processed:"
    # Getting each word's statistics: [num_word, num_doc]
    for sentence in sentence_list:
        i += 1
        if i % 10000 == 0:
            print i
        tokens = tokenize(sentence, stem = False)
        tokens_count = Counter(tokens)
        for word in tokens_count:        
            if word not in word_statistics:
                word_statistics[word] = [tokens_count[word], 1]
            else:
                word_statistics[word][0] += tokens_count[word]
                word_statistics[word][1] += 1
      
    total_num_doc = len(sentence_list)
    word_tf_idf = []
    
    # Getting the maximum word frequency:
    max_word_freq = 0
    for word in word_statistics:
        if word_statistics[word][0] > max_word_freq:
            max_word_freq = word_statistics[word][0]
    
    # Getting the tf-idf score for each word
    for word in word_statistics:
        tf = float(word_statistics[word][0]) / max_word_freq
        num_doc = word_statistics[word][1]
        idf = math.log(float(total_num_doc)/(0 + num_doc)) 
        word_tf_idf.append([word, tf * idf, tf, idf])
            
    # Sorting the word_tf_idf list:
    word_tf_idf = sorted(word_tf_idf, key=lambda tup: tup[1], reverse=True)
    return word_tf_idf


def get_aspect_cadidate(word_tf_idf, tag_list = ["NN"], threshold = 0.8):
    '''Get cadidate aspects from word_tf_idf. Only words whose tag belong to tag_list and score > threshold will pass'''
    aspect_cadidate = []
    for word_data in word_tf_idf:
        word = word_data[0]
        tf_idf = word_data[1]
        word_tag = pos_tag(word)[0]      
        # If any of the tag string (e.g. "NN") in the given tag_list appears in the word's tag (e.g. "NNS")
        if any(tag in word_tag for tag in tag_list) and word_data[1] >= threshold:
            aspect_cadidate_data = [word, tf_idf, word_tag[1]]
            aspect_cadidate.append(aspect_cadidate_data)
            print [word, '%0.2f' % tf_idf, word_tag[1]]
        if word_data[1] < threshold:
            break
            
    return aspect_cadidate

In [39]:
# Sample categories, with decreasing number of sentences
category = (u'Electronics', u'Computers & Accessories', u'Tablets') #2439 products, 99311 reviews, 781032 sentences
category = (u'Cell Phones & Accessories', u'Cell Phones', u'No-Contract Cell Phones') # 743 products, 32200 reviews, 221681 sentences
category = (u'Cell Phones & Accessories', u'Accessories', u'Bluetooth Speakers') #609 products, 38847 reviews, 233048 sentences
# category = (u'Electronics', u'Portable Audio & Video', u'Portable DVD Players') #222 products, 3977 reviews, 22476 sentences
sentence_list = get_sentence_from_category(category)

Number of products 609
Number of reviews: 38847
Number of sentences: 233048


In [None]:
word_tf_idf= get_tf_idf(sentence_list)

In [41]:
tag_list = ["NN","VB"]  # Include in VB since some nouns are mis-classified as verbs
aspect_candidate = get_aspect_cadidate(word_tf_idf, tag_list, 0.8)

[u'speaker', '1.97', 'NN']
[u'sound', '1.90', 'NN']
[u'great', '1.46', 'NN']
[u'bluetooth', '1.34', 'NN']
[u'good', '1.28', 'NN']
[u'phone', '1.21', 'NN']
[u'quality', '1.11', 'NN']
[u'music', '1.01', 'NN']
[u'volume', '0.96', 'NN']
[u'like', '0.90', 'NN']
[u'product', '0.89', 'NN']
[u'device', '0.87', 'NN']
[u'battery', '0.86', 'VB']
[u'little', '0.85', 'NN']
[u'easy', '0.83', 'VB']
[u'speakers', '0.82', 'NN']
