In [31]:
from pymongo import MongoClient
from srs.database import connect_to_db
from srs.utilities import Sentence, tokenize
from nltk import pos_tag
from collections import Counter
import math
import word2vec
import os
import numpy as np
# Loading Word2Vec model
current_directory = os.path.dirname(os.path.realpath("__file__"))
model_path = os.path.join(current_directory[:-6], 'srs/predictor_data/text8.bin')
model = word2vec.load(model_path)

### The following functions accumulate all the sentences by category:

In [11]:
def get_category_dict():
    """Build a dictionary whose key is the category tuple, and the value is a list of product_ids:"""
    client, db = connect_to_db()
    cursor = db.product_collection.find()
    category_dict = {}
    i = 0
    for product in cursor:
        i += 1   
        if i % 10000 == 0:
            print i
        category = product['category']
        category_short = tuple(category[:3]) #generally category is 4-tuple. Now limit to the first three tuple
        product_id = product['product_id']

        if category_short not in category_dict:
            category_dict[category_short] = [product_id]
        else:
            category_dict[category_short].append(product_id)
    client.close()
    
    return category_dict

def show_category(category_dict, min_product_num):
    """Sort the categories according to the number of products in that category, and print them from top"""
    category_list_sorted = []
    category_list = []
    for key in category_dict:
        length = len(category_dict[key])
        category_list.append([length, key])
    category_list_sorted = sorted(category_list, key=lambda tup: tup[0], reverse=True)

    for category_data in category_list_sorted:
        if category_data[0] > min_product_num:
            print category_data

In [None]:
category_dict = get_category_dict()

In [109]:
# Shows all the main categories (up to 3rd level) and the number of product it contains:
min_product_num = 1000
show_category(category_dict, min_product_num)

[201512, (u'Cell Phones & Accessories', u'Cases', u'Basic Cases')]
[54232, (u'Electronics', u'Computers & Accessories', u'Laptop & Netbook Computer Accessories')]
[50234, (u'Electronics', u'Computers & Accessories', u'Cables & Accessories')]
[46040, (u'Electronics', u'Accessories & Supplies', u'Audio & Video Accessories')]
[40292, (u'Electronics', u'Camera & Photo', u'Accessories')]
[36801, (u'Electronics', u'Computers & Accessories', u'Touch Screen Tablet Accessories')]
[26273, (u'Electronics', u'Computers & Accessories', u'Computer Components')]
[25569, (u'Electronics', u'Portable Audio & Video', u'MP3 Players & Accessories')]
[25256, (u'Cell Phones & Accessories', u'Accessories', u'Accessory Kits')]
[16506, (u'Cell Phones & Accessories', u'Accessories', u'Chargers')]
[15195, (u'Cell Phones & Accessories', u'Accessories', u'Screen Protectors')]
[14752, (u'Electronics', u'Car & Vehicle Electronics', u'Car Electronics')]
[11840, (u'Electronics', u'Computers & Accessories', u'Data Stora

### The following three functions collects sentences from one category, obtain each word's tf-idf score, and choose aspect candidates:

In [94]:
def get_sentence_from_category(category):
    """Obtain all the review sentences from a category tuple:"""
    client, db = connect_to_db()
    product_id_list = category_dict[category]
    sentence_list = []
    review_num = 0
    for product_id in product_id_list:
        query_res = list(db.product_collection.find({"product_id": product_id}))
        contents = query_res[0]["contents"]
        sentence_list += contents
        review_num += len(query_res[0]["review_ids"])
    print "Number of products {0}\nNumber of reviews: {1}\nNumber of sentences: {2}".format(len(product_id_list), review_num, len(sentence_list))
    client.close()
    return sentence_list


def get_tf_idf(sentence_list):
    """Get tf-idf score for each word
       The dictionary records for each word as a key, the [num_word, num_doc] value, where num_word means the number of 
       that word in the sentence_list, and num_doc means the number of sentences this word appears in.
    """
    word_statistics = {}
    i = 0
    print "Number of sentences processed:"
    # Getting each word's statistics: [num_word, num_doc]
    for sentence in sentence_list:
        i += 1
        if i % 10000 == 0:
            print i
        tokens = tokenize(sentence, stem = False)
        tokens_count = Counter(tokens)
        for word in tokens_count:        
            if word not in word_statistics:
                word_statistics[word] = [tokens_count[word], 1]
            else:
                word_statistics[word][0] += tokens_count[word]
                word_statistics[word][1] += 1
      
    total_num_doc = len(sentence_list)
    word_tf_idf = []
    
    # Getting the maximum word frequency:
    max_word_freq = 0
    for word in word_statistics:
        if word_statistics[word][0] > max_word_freq:
            max_word_freq = word_statistics[word][0]
    
    # Getting the tf-idf score for each word
    for word in word_statistics:
        tf = float(word_statistics[word][0]) / max_word_freq
        num_doc = word_statistics[word][1]
        idf = math.log(float(total_num_doc)/(0 + num_doc)) 
        word_tf_idf.append([word, tf * idf, tf, idf])
            
    # Sorting the word_tf_idf list:
    word_tf_idf = sorted(word_tf_idf, key=lambda tup: tup[1], reverse=True)
    return word_tf_idf


def get_aspect_cadidate(word_tf_idf, tag_list = ["NN"], score_threshold = 0.8):
    '''Get cadidate aspects from word_tf_idf. Only words whose tag belong to tag_list and score > threshold will pass'''
    aspect_cadidate = []
    for word_data in word_tf_idf:
        word = word_data[0]
        tf_idf = word_data[1]
        word_tag = pos_tag(word)[0]      
        # If any of the tag string (e.g. "NN") in the given tag_list appears in the word's tag (e.g. "NNS")
        if any(tag in word_tag for tag in tag_list) and word_data[1] >= score_threshold:
            aspect_cadidate_data = [word, tf_idf, word_tag[1]]
            aspect_cadidate.append(aspect_cadidate_data)
            print [word, '%0.2f' % tf_idf, word_tag[1]]
        if word_data[1] < score_threshold:
            break
            
    return aspect_cadidate

In [110]:
# Sample categories, with decreasing number of sentences
category = ("Electronics", "Camera & Photo", "Digital Cameras") #Control Group: 7916 products, 203836 reviews, 1724928 sentences
# category = (u'Electronics', u'Computers & Accessories', u'Tablets') #2439 products, 99311 reviews, 781032 sentences
# category = (u'Cell Phones & Accessories', u'Cell Phones', u'No-Contract Cell Phones') # 743 products, 32200 reviews, 221681 sentences
# category = (u'Cell Phones & Accessories', u'Accessories', u'Bluetooth Speakers') #609 products, 38847 reviews, 233048 sentences
# category = (u'Electronics', u'Portable Audio & Video', u'Portable DVD Players') #222 products, 3977 reviews, 22476 sentences
sentence_list = get_sentence_from_category(category)

Number of products 7916
Number of reviews: 203836
Number of sentences: 1724928


In [None]:
word_tf_idf= get_tf_idf(sentence_list)

In [150]:
tag_list = ["NN","VB"]  # Include in VB since some nouns are mis-classified as verbs
aspect_candidate_list = get_aspect_cadidate(word_tf_idf, tag_list, score_threshold = 0.3)

[u'camera', '1.22', 'VB']
[u'pictures', '0.65', 'NN']
[u'great', '0.63', 'NN']
[u'good', '0.56', 'NN']
[u'quality', '0.50', 'NN']
[u'like', '0.46', 'NN']
[u'lens', '0.44', 'NN']
[u'get', '0.44', 'NN']
[u'take', '0.43', 'VB']
[u'canon', '0.41', 'VB']
[u'zoom', '0.40', 'NN']
[u'cameras', '0.38', 'VB']
[u'photos', '0.37', 'NN']
[u'battery', '0.37', 'VB']
[u'easy', '0.36', 'VB']
[u'really', '0.36', 'NN']
[u'video', '0.36', 'NN']
[u'time', '0.36', 'NN']
[u'picture', '0.35', 'NN']
[u'even', '0.34', 'NN']
[u'digital', '0.33', 'NN']
[u'better', '0.33', 'NN']
[u'much', '0.32', 'NN']
[u'flash', '0.32', 'NN']
[u'bought', '0.32', 'NN']
[u'price', '0.31', 'NN']
[u'light', '0.30', 'NN']


### The following functions start from a seed_word list, find the word list that serves as a dict for each seed_word:

In [148]:
def get_similarity(word1, word2):
    """Find the similarity between two words, which equals the dot product of their vectors"""
    if word1 in model:
        word1_vec = model[word1]
    else:
        word1_vec = np.zeros([100])
        print word1_vec
    if word2 in model:
        word2_vec = model[word2]
    else:
        word2_vec = np.zeros([100])
    similarity = np.dot(word1_vec, word2_vec)
    return similarity

def get_word_list_from_aspect_candidates(seed_word, word_tf_idf, similarity_threshold, score_threshold):
    """Method 1: directly find the word list from all words whose similarity with the seed_word and tf-idf score are above 
    certain threshold"""
    word_list = []
    for word_data in word_tf_idf:
        word = word_data[0]
        tf_idf = word_data[1]
        if tf_idf > score_threshold:
            similarity = get_similarity(seed_word, word)
            if similarity > similarity_threshold:
                word_list.append([word, similarity, tf_idf])              
    word_list_sorted = sorted(word_list, key=lambda tup: tup[1], reverse=True)
    return word_list_sorted


def get_word_list_by_tf_idf(seed_word_list, sentence_list, num_words_in_list):
    """Method 2: Find the word_list who can distinguish the chosen sentences from other sentences"""
    
    # For each seed_word in seed_word_list, get the word_data for all sentences. word_data has 4 fields [num_word_total, num_doc_total, 
    # num_word_in_topic, num_doc_in_topic], the first two are from all sentences, and the latter two are from the sentences 
    # that contain the seed_word.   
    num_seed_word = len(seed_word_list)
    word_statistics_dic_list = [{} for i in range(num_seed_word)]
    
    num_sentence_topic_list = [0 for k in range(num_seed_word)]
    num_sentence_total_list = [0 for k in range(num_seed_word)]
    i = 0
    print "Number of sentences processed:"
    for sentence in sentence_list: 
        i += 1
        if i % 10000 == 0:
            print i
        tokens = tokenize(sentence, stem = False)
        tokens_count = Counter(tokens)
        for word in tokens_count:
            # check for each seed_word:
            for k in range(num_seed_word):
                seed_word = seed_word_list[k]
                if seed_word in sentence: 
                    num_sentence_topic_list[k] += 1
                    num_sentence_total_list[k] += 1
                    if word not in word_statistics_dic_list[k]:
                        word_statistics_dic_list[k][word] = [tokens_count[word], 1, tokens_count[word], 1]
                    else:
                        word_statistics_dic_list[k][word][0] += tokens_count[word]
                        word_statistics_dic_list[k][word][1] += 1
                        word_statistics_dic_list[k][word][2] += tokens_count[word]
                        word_statistics_dic_list[k][word][3] += 1
                else:
                    num_sentence_total_list[k] += 1
                    if word not in word_statistics_dic_list[k]:
                        word_statistics_dic_list[k][word] = [tokens_count[word], 1, 0, 0]
                    else:
                        word_statistics_dic_list[k][word][0] += tokens_count[word]
                        word_statistics_dic_list[k][word][1] += 1

    # Get the maximum word frequency for each seed_word group:
    word_tf_idf_ratio_list = [[] for k in range(num_seed_word)]
    max_num_word_total_list =[0 for k in range(num_seed_word)]
    max_num_word_topic_list =[0 for k in range(num_seed_word)] 
    for k in range(num_seed_word):    
        for word in word_statistics_dic_list[k]:
            word_data = word_statistics_dic_list[k][word]
            if word_data[0] > max_num_word_total_list[k]:
                max_num_word_total_list[k] = word_data[0]
            if word_data[2] > max_num_word_topic_list[k]:
                max_num_word_topic_list[k] = word_data[2]
    
    # Get tf_idf adjusted ratio for each word, to measure how this word can distinguish the topic sentences:
    for k in range(num_seed_word):        
        for word in word_statistics_dic_list[k]:
            word_data = word_statistics_dic_list[k][word]
            num_word_total = word_data[0]
            num_word_topic = word_data[2]
            num_doc_total = word_data[1]   
            num_doc_topic = word_data[3] 
        
            if num_doc_topic == 0 or num_doc_total == 0:
                word_tf_idf_ratio_list[k].append([word, 0])
                continue

            tf_topic = float(num_word_topic) / max_num_word_topic_list[k]
            tf_total = float(num_word_total) / max_num_word_total_list[k]
            tf_ratio = (tf_topic/num_word_topic) / (tf_total/num_doc_total)

            idf_topic = math.log(float(num_sentence_topic_list[k]) / num_doc_topic)
            idf_total = math.log(float(num_sentence_total_list[k]) / num_doc_total) 

            word_tf_idf_ratio_list[k].append([word, tf_topic * math.log(tf_ratio) * idf_total, tf_topic, tf_total, tf_ratio, idf_topic, idf_total])

        word_tf_idf_ratio_list[k].sort(key=lambda tup: tup[1], reverse=True)

    word_tf_idf_ratio_list = [item[:num_words_in_list] for item in word_tf_idf_ratio_list]
    
    return word_tf_idf_ratio_list

In [116]:
# Get the word list from all words whose similarity with the seed_word and tf-idf score are above a certain threshold
word_list1 = get_word_list_from_aspect_candidates('pictures', word_tf_idf, similarity_threshold = 0.25, score_threshold = 0.3)
word_list1

[[u'pictures', 0.99999997268218732, 0.6495218245632511],
 [u'photos', 0.86083627362442172, 0.37072616666455493],
 [u'picture', 0.61262880792408558, 0.35469257714975955],
 [u'video', 0.53999254308776479, 0.3572299834316976],
 [u'flash', 0.47092313744007452, 0.3188676566055583],
 [u'zoom', 0.40573301108039994, 0.40380737966801256],
 [u'cameras', 0.36868705644628258, 0.3786131470313853],
 [u'digital', 0.35883936250750026, 0.33130751879204],
 [u'camera', 0.35455396198588562, 1.223331569583933],
 [u'like', 0.26126584994220436, 0.46283471998808356]]

In [None]:
seed_word_list = ["battery","pictures","price","zoom","ease of use","detection","design","video","quality","screen","size"]
word_list2 = get_word_list_by_tf_idf(seed_word_list, sentence_list, num_words_in_list = 15)
word_list2

In [None]:
word_list_ours = {
    "battery": ["batteri", "charger"], 
    "pictures": ["pictur", "imag", "shot"], 
    "price": ["cheap","cheaper", "expens", "afford", "price"], 
    "zoom": ["zoom", "len"], 
    "ease of use": ["easi", "simpl", "easili", "simpli", "use"], 
    "detection": ["detect", "auto", "mode", "smart", "focus"], 
    "design": ["design", "nice", "beatiful", "color", "pretti"], 
    "video": ["clear", "video"], 
    "quality": ["qualiti"], 
    "screen": ["screen", "display"], 
    "size": ["size", "big", "small", "fit", "carri", "pocket", "bulki"]
}

In [147]:
# Show word_list for each seed_word
num_seed_word = len(word_list2)
word_dic = {}
for k in range(num_seed_word):
    word_dic[seed_word_list[k]] = [str(item[0]) for item in word_list2[k]]
for word in word_dic:
    print '"%s": %s'%(word, word_dic[word])

"battery": ['battery', 'life', 'camera', 'charger', 'charge', 'batteries', 'extra', 'good', 'use', 'long', 'card', 'one', 'pictures', 'rechargeable', 'charged']
"pictures": ['pictures', 'camera', 'takes', 'take', 'great', 'good', 'taking', 'quality', 'use', 'took', 'easy', 'taken', 'get', 'clear', 'like']
"price": ['price', 'camera', 'great', 'good', 'quality', 'range', 'priced', 'features', "it's", 'cameras', 'one', 'better', 'best', 'amazon', 'worth']
"zoom": ['zoom', 'x', 'camera', 'optical', 'lens', 'great', 'good', 'video', 'pictures', 'digital', 'quality', 'mm', 'use', 'image', 'zooming']
"ease of use": ['ease', 'use', 'quality', 'camera', 'picture', 'great', 'features', 'size', 'pictures', 'love', 'good', 'price', "it's", 'image', 'like']
"detection": ['detection', 'face', 'smile', 'mode', 'camera', 'focus', 'features', 'works', 'blink', 'phase', 'auto', '+', 'image', 'like', 'feature']
"design": ['design', 'designed', 'camera', 'lens', 'quality', 'use', 'like', "it's", 'good', 