In [1]:
from pymongo import MongoClient, ASCENDING
from srs.database import connect_to_db
from srs.utilities import Sentence, tokenize
from nltk import pos_tag
from collections import Counter
import math
import word2vec
import os
import numpy as np
import random
import copy
import gzip
import ast
# Loading Word2Vec model
current_directory = os.path.dirname(os.path.realpath("__file__"))
model_path = os.path.join(current_directory[:-6], 'srs/predictor_data/text8.bin')
model = word2vec.load(model_path)
def sort_list(list, sort_index, reverse = True):
    list_sorted = sorted(list, key=lambda tup: tup[sort_index], reverse = reverse)
    return list_sorted

### Obtain the prod_dict:

In [2]:
def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        yield ast.literal_eval(l)


def construct_prod_dict(meta_file_path_list):
    """return a dictionary for product metadata"""
    prod_dict = {}
    for meta_file_path in meta_file_path_list:
        metaParser = parse(meta_file_path)
        client, db = connect_to_db()
        i = 0       
        print "Building the product dictionary for %s" % meta_file_path
        for meta in metaParser:
            i+=1
            if i % 100000 == 0:
                print i
            product_id = meta['asin']
            category = meta['categories'][0]
            product_name = ""
            brand = ""
            if 'title' in meta:
                inter = meta['title'].split()
                if len (inter) > 1:
                    product_name_short = inter[0] + ' ' + inter[1]
                else:
                    product_name_short = inter[0]
            if 'brand' in meta:
                brand = meta['brand']
            prod_dict[product_id]={'category': category, 'product_name': product_name_short, 'brand': brand}
        print i
    return prod_dict

In [6]:
Electronics_Meta_Path = '../../Datasets/Full_Reviews/meta_Electronics.json.gz'
Phone_Meta_Path = '../../Datasets/Full_Reviews/meta_Cell_Phones_and_Accessories.json.gz'

prod_dict = construct_prod_dict([Electronics_Meta_Path,Phone_Meta_Path])

Building the product dictionary for ../../Datasets/Full_Reviews/meta_Electronics.json.gz
100000
200000
300000
400000
498196
Building the product dictionary for ../../Datasets/Full_Reviews/meta_Cell_Phones_and_Accessories.json.gz
100000
200000
300000
346793


### The following functions accumulate all the sentences by category

In [14]:
def get_category_dict(prod_dict):
    """Build a dictionary whose key is the category tuple, and the value is a list of product_ids:"""
    client, db = connect_to_db()
    cursor = db.product_collection.find()
    category_dict = {}
    i = 0
    for product in cursor:
        i += 1   
        if i % 100000 == 0:
            print i
        category = product['category']
        category_short = tuple(category[:4]) #generally category is 4-tuple. Now limit to the first three tuple
        product_id = product['product_id']
        product_name = ""
        brand = ""
        if product_id in prod_dict:
            product_info = prod_dict[product_id]
            if 'product_name' in product_info:
                product_name = product_info['product_name']
            if 'brand' in product_info:
                brand = product_info['brand']

        if category_short not in category_dict:
            category_dict[category_short] = {"product_id": [product_id], "brand_list": [], "product_name_list": []}
        else:
            category_dict[category_short]['product_id'].append(product_id)
            
        if len(product_name) > 0:
            category_dict[category_short]['product_name_list'].append(product_name)
        if len(brand) > 0:
            if brand not in category_dict[category_short]['brand_list']:
                category_dict[category_short]['brand_list'].append(brand)
            
    client.close()
    print i
  
    return category_dict


def sort_category_dict(category_dict, isPrint = False):
    """Sort the categories according to the number of products in that category, and print them from top"""
    category_list_sorted = []
    category_list = []

    for key in category_dict:
        length = len(category_dict[key]['product_id'])
        category_list.append([key,length,key[:3],0])
    category_list_sorted = sorted(category_list, key=lambda tup: (tup[2],tup[1]), reverse=True)
    
    category_list_sorted_dict = {}
    for Id in range(len(category_list_sorted)):
        category_list_sorted[Id][3]=Id
        category = category_list_sorted[Id][0]
        category_dict[category]["category_id"] = Id
        category_list_sorted_dict[Id] = category_list_sorted[Id][:3]
    
    if isPrint:
        for Id in range(len(category_list_sorted)):
            print Id, category_list_sorted_dict[Id][:2]
        
    return category_list_sorted_dict


def combine_category_custom(category_dict_raw, category_list_sorted_dict):
    category_dict = copy.deepcopy(category_dict_raw)
    print "Number of categories in original set: %g"%len(category_dict_raw)
    print "Combined category ID:"
    f = open('Aspect_and_wordlist_txt/combined_dict.txt','r')
    for line in f:
        combine_info = eval(line)
        print combine_info
        if len(combine_info) > 0:
            Id_to_combine = combine_info[0]
            name_info = combine_info[1]
            category_name_combined = category_list_sorted_dict[name_info[0]][0][:name_info[1]]
            category_id = category_dict_raw[category_list_sorted_dict[name_info[0]][0]]["category_id"]
            new_prod_id_list = []
            new_product_name_list = []
            new_brand_list = []
            for Id in Id_to_combine:
                category_name = category_list_sorted_dict[Id][0]
                new_prod_id_list += category_dict[category_name]["product_id"]
                new_product_name_list += category_dict[category_name]["product_name_list"]
                new_brand_list += category_dict[category_name]["brand_list"]
                category_dict.pop(category_name, 0)
            category_dict[category_name_combined] = {"category_id": category_id,"product_id": new_prod_id_list,\
                        "product_name_list": new_product_name_list, "brand_list": new_brand_list}
    f.close()
    print "Number of categories in the new dict: %g"%len(category_dict)
      
    return category_dict


def combine_small_category(category_dict_raw, category_list_sorted, prod_num_threshold = 100, shrink_level = 3):
    category_dict = copy.deepcopy(category_dict_raw)
    i = 0
    for i in range(len(category_list_sorted)):
        i += 1
        category_name = category_list_sorted[-i][1]
        prod_num = category_list_sorted[-i][0]
        if prod_num > prod_num_threshold:
            break
        if len(category_name) > shrink_level:
            category_name_shrink = category_name[:shrink_level]
            if category_name_shrink in category_dict:
                category_dict[category_name_shrink] += category_dict[category_name]
                category_dict.pop(category_name,0)
                print "{0} combined into {1}".format(category_name_shrink, category_name)
            else:
                print "{0} not combined".format(category_name_shrink)
        else:
            print "{0} length not enough.".format(category_name)
    
    return category_dict


def save_category_dict_to_db(category_dict, dropPrevious = False):
    client, db = connect_to_db()
    db_category_data = db.category_data
    if dropPrevious == True:
        db_category_data.delete_many({})
    for category in category_dict:
        query = {"category_id": category_dict[category]["category_id"]}
        update_field = {"category": list(category),\
                        "prod_id_list": category_dict[category]["product_id"], \
                        "brand_list":  category_dict[category]["brand_list"],\
                        "product_name_list": category_dict[category]["product_name_list"]}
        db_category_data.update_one(query, {"$set": update_field}, True)
        
    client.close()


def show_category_dict_info(category_dict, min_prod_num = 1000):
    new_list = []
    for category in category_dict:
        new_list.append([len(category_dict[category]["product_id"]),category,category_dict[category]["category_id"]])
    
    new_list = sorted(new_list, key=lambda tup: tup[0], reverse=True)
    
    for item in new_list:
        if int(item[0]) < min_prod_num:        
            break
        print "{0},{1},{2}".format(item[0],item[1],item[2])


def get_sentence_from_category(category_list):
    """Obtain all the review sentences from a list of category tuple:"""
    if isinstance(category_list, dict):
        category_lists = [category_list]
    else:
        category_lists = category_list
    
    category_content_list = []
    
    for category in category_lists:
        print "{0}:".format(category)
        client, db = connect_to_db()
        product_id_list = category_dict[category]["product_id"]
        category_contents = {"category": category,"sentence_list": [], "brand_list": category_dict[category]["brand_list"],\
                            "product_name_list": category_dict[category]["product_name_list"]}
        review_num = 0
        for product_id in product_id_list:
            query_res = list(db.product_collection.find({"product_id": product_id}))
            contents = query_res[0]["contents"]
            category_contents['sentence_list'] += contents
            review_num += len(query_res[0]["review_ids"])
        print "  ({0}, {1}, {2})".format(len(product_id_list), review_num, len(category_contents['sentence_list']))      
        category_content_list.append(category_contents)
        
    client.close()

    return category_content_list


def get_sentence_from_category_ensemble(category_dict, max_prod_chosen = 500, min_product_level = 500):
    client, db = connect_to_db()
    full_sentence_list = []
    print "Getting product categories: (num_sentence_chosen, category):"
    for category in category_dict:
        if len(category_dict[category]) < min_product_level:
            continue
        product_id_list = category_dict[category]["product_id"]
        random.shuffle(product_id_list)
        new_sentence = []
        for product_id in product_id_list[:max_prod_chosen]:
            query_res = list(db.product_collection.find({"product_id": product_id}))
            contents = query_res[0]["contents"]
            new_sentence += contents
        print len(new_sentence),category
        full_sentence_list += new_sentence
    client.close()
    print "Number of sentences: {0}".format(len(full_sentence_list))
    
    all_category_content = {"sentence_list": full_sentence_list}
    return all_category_content

In [12]:
category_dict_raw = get_category_dict(prod_dict)

100000
200000
300000
400000
500000
600000
700000
793315


In [15]:
category_list_sorted_dict = sort_category_dict(category_dict_raw, isPrint = False)
category_dict = combine_category_custom(category_dict_raw, category_list_sorted_dict)
save_category_dict_to_db(category_dict, dropPrevious = False)

Number of categories in original set: 512
Combined category ID:
[[16, 17], [17, 2]]
[[20, 21, 22, 23, 26, 28, 29, 30, 31, 32], [22, 3]]
[[38, 39, 40, 41, 42], [38, 3]]
[[105, 106], [105, 3]]
[[108, 109, 110, 111, 112], [112, 3]]
[[139, 140, 141], [139, 3]]
[[176, 177, 178, 179, 180, 181, 182, 183, 184], [176, 3]]
[[277, 278, 279, 280], [277, 3]]
[[282, 283, 284, 285, 286, 287, 288], [282, 3]]
[[297, 298, 299, 300, 301], [297, 3]]
[[302, 303, 304], [302, 3]]
[[308, 309, 310, 311, 312, 313, 314], [308, 3]]
[[316, 321], [316, 3]]
[[322, 323, 324, 325, 326, 327, 328, 329, 330, 331], [322, 3]]
[[356, 357, 358, 359, 360], [356, 3]]
[[362, 363, 364, 365, 366], [366, 3]]
[[367, 368, 369, 370], [367, 3]]
[[371, 372, 373, 374, 375, 376, 377], [371, 3]]
[[379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393], [379, 3]]
[[399, 400], [399, 3]]
[[417, 418, 419, 420, 421], [417, 3]]
[[427, 428, 429, 430], [427, 2]]
[[433, 434, 435, 436, 437, 438], [433, 2]]
[[439, 440, 441, 442, 

In [22]:
show_category_dict_info(category_dict, min_prod_num = 1000)

207742,(u'Cell Phones & Accessories', u'Cases', u'Waterproof Cases'),439
25256,(u'Cell Phones & Accessories', u'Accessories', u'Accessory Kits'),474
25245,(u'Electronics', u'Computers & Accessories', u'Touch Screen Tablet Accessories', u'Cases & Sleeves'),129
21726,(u'Electronics', u'Portable Audio & Video', u'MP3 Players & Accessories', u'MP3 Player Accessories'),78
16275,(u'Electronics', u'Computers & Accessories', u'Laptop & Netbook Computer Accessories', u'Batteries'),155
15453,(u'Electronics', u'Camera & Photo', u'Accessories', u'Batteries & Chargers'),332
15195,(u'Cell Phones & Accessories', u'Accessories', u'Screen Protectors'),450
15051,(u'Electronics', u'Computers & Accessories', u'Laptop & Netbook Computer Accessories', u'Chargers & Adapters'),156
13548,(u'Electronics', u'Computers & Accessories', u'Laptop & Netbook Computer Accessories', u'Bags & Cases'),157
13115,(u'Electronics', u'Accessories & Supplies', u'Audio & Video Accessories', u'Cables & Interconnects'),401
11747,(

In [None]:
# all_category_content = get_sentence_from_category_ensemble(category_dict, max_prod_chosen = 1000, min_product_level = 0)
# get_tf_idf(all_category_content, is_idf_db = False)

### Aspect extraction: the following functions collects sentences from one category, obtain each word's tf-idf score, and choose aspect candidates:

In [245]:
def get_tf_idf(sentence_list, is_idf_db = True):
    """Get tf-idf score for each word
       The dictionary records for each word as a key, the [num_word, num_doc] value, where num_word means the number of 
       that word in the sentence_list, and num_doc means the number of sentences this word appears in.
    """
    word_statistics = {}
    db_word_score_list = db.word_score_list
    i = 0
    print "Number of sentences processed:"
    # Getting each word's statistics: [num_word, num_doc]
    for sentence in sentence_list:
        i += 1
        if i % 50000 == 0:
            print i
        tokens = tokenize(sentence, stem = False)
        tokens_count = Counter(tokens)
        for word in tokens_count:        
            if word not in word_statistics:
                word_statistics[word] = [tokens_count[word], 1]
            else:
                word_statistics[word][0] += tokens_count[word]
                word_statistics[word][1] += 1
      
    total_num_doc = len(sentence_list)
    word_tf_idf = []
    
    # Getting the maximum word frequency:
    max_word_freq = 0
    for word in word_statistics:
        if word_statistics[word][0] > max_word_freq:
            max_word_freq = word_statistics[word][0]
    
    # Getting the tf-idf score for each word
    print "Calculating tf-idf:"
    i = 0
    for word in word_statistics:
        i += 1
        if i % 10000 == 0:
            print i
        tf = float(word_statistics[word][0]) / max_word_freq 
        # Calculating idf:     
        num_doc = word_statistics[word][1]
        idf_category = math.log(float(total_num_doc)/(0 + num_doc))
        if is_idf_db == True:
            word_score = get_word_score_from_db(word, db_word_score_list)
            idf_db = word_score[2]
        else:
            idf_db = 1
        word_tf_idf.append([word, tf * idf_db, tf, idf_category, idf_db])
            
    # Sorting the word_tf_idf list:
    word_tf_idf = sorted(word_tf_idf, key=lambda tup: tup[1], reverse=True)
    return word_tf_idf, word_statistics


def save_word_score_to_db(word_tf_idf):
    client, db = connect_to_db()
    db.word_score_list.delete_many({})
    db.word_score_list.create_index([("word", ASCENDING)])
    i = 0
    for word_data in word_tf_idf:
        i += 1
        if i % 10000 == 0:
            print i
        word = word_data[0]
        word_score = word_data[1:]
        query = {"word": word}
        update_field = {"word_score": word_score}
        
        db.word_score_list.update_one(query, {"$set": update_field}, True)
    client.close()
    print "Total number of words: %g"%len(word_tf_idf)

    
def get_word_score_from_db(word, db_word_score_list):
    query = db_word_score_list.find({"word": word})
    if len(query[0]) > 0:
        return query[0]["word_score"]
    else:
        return [0,0,0]


def get_aspect_cadidate(word_tf_idf, tag_list = ["NN"], score_threshold = 0.8):
    '''Get cadidate aspects from word_tf_idf. Only words whose tag belong to tag_list and score > threshold will pass'''
    aspect_cadidate = []
    j = 0
    for word_data in word_tf_idf:
        word = word_data[0]
        tf_idf = word_data[1]
        word_tag = pos_tag([word])[0][1]
        # If any of the tag string (e.g. "NN") in the given tag_list appears in the word's tag (e.g. "NNS")
        if any(tag in word_tag for tag in tag_list) and (word_data[1] >= score_threshold \
                                                         or (word_data[1] < score_threshold and j <= 20)):
            j +=1
            aspect_cadidate_data = [word, tf_idf, word_tag]
            aspect_cadidate.append(aspect_cadidate_data)
            print [word, '%0.2f' % tf_idf, word_tag]
        if word_data[1] < score_threshold:
            break
            
    return aspect_cadidate

In [241]:
# Sample categories, with decreasing number of sentences
# category = ("Electronics", "Camera & Photo", "Digital Cameras") #Control Group: 7916 products, 203836 reviews, 1724928 sentences
# category = (u'Electronics', u'Computers & Accessories', u'Tablets') #2439 products, 99311 reviews, 781032 sentences
category = (u'Cell Phones & Accessories', u'Cell Phones', u'No-Contract Cell Phones') # 743 products, 32200 reviews, 221681 sentences
# category = (u'Cell Phones & Accessories', u'Accessories', u'Bluetooth Speakers') #609 products, 38847 reviews, 233048 sentences
# category = (u'Electronics', u'Portable Audio & Video', u'Portable DVD Players') #222 products, 3977 reviews, 22476 sentences
sentence_list = get_sentence_from_category(category)

Number of products 743
Number of reviews: 32200
Number of sentences: 221681


In [272]:
word_tf_idf, _ = get_tf_idf(sentence_list)

Number of sentences processed:
50000
100000
150000
200000
Calculating tf-idf:
10000
20000
30000


In [227]:
tag_list = ["NN"]  # Include in VB since some nouns are mis-classified as verbs
aspect_candidate_list = get_aspect_cadidate(word_tf_idf, tag_list, score_threshold = 0.2)

[u'phone', '3.07', 'NN']
[u'minutes', '0.53', 'NNS']
[u'tracfone', '0.52', 'NN']
[u'use', '0.46', 'NN']
[u'phones', '0.45', 'NNS']
[u'service', '0.44', 'NN']
[u'battery', '0.40', 'NN']
[u'screen', '0.39', 'NN']
[u"it's", '0.36', 'NN']
[u'text', '0.35', 'NN']
[u'apps', '0.35', 'NN']
[u'card', '0.33', 'NN']
[u'call', '0.32', 'NN']
[u'time', '0.31', 'NN']
[u"don't", '0.30', 'NN']
[u'calls', '0.30', 'NNS']
[u'lg', '0.28', 'NN']
[u'android', '0.28', 'NN']
[u'virgin', '0.28', 'NN']
[u'mobile', '0.26', 'NN']
[u'g', '0.26', 'NN']
[u"i'm", '0.24', 'NN']
[u'plan', '0.24', 'NN']
[u'cell', '0.24', 'NN']
[u'price', '0.23', 'NN']
[u'bought', '0.23', 'NN']
[u'love', '0.22', 'NN']
[u'life', '0.22', 'NN']
[u'month', '0.22', 'NN']
[u"i've", '0.22', 'NN']
[u'want', '0.22', 'NN']
[u'works', '0.22', 'NNS']
[u'need', '0.22', 'NN']
[u'keyboard', '0.21', 'NN']
[u'work', '0.21', 'NN']
[u'data', '0.21', 'NNS']
[u'number', '0.21', 'NN']
[u'camera', '0.21', 'NN']


In [250]:
for item in aspect_candidate_list:
    print '"%s",'%item[0],

"phone", "minutes", "tracfone", "use", "phones", "service", "battery", "screen", "it's", "text", "apps", "card", "call", "time", "don't", "calls", "lg", "android", "virgin", "mobile", "g", "i'm", "plan", "cell", "price", "bought", "love", "life", "month", "i've", "want", "works", "need", "keyboard", "work", "data", "number", "camera",


### The following functions start from a seed_word list, find the word list that serves as a dict for each seed_word:

In [274]:
def get_similarity(word1, word2):
    """Find the similarity between two words, which equals the dot product of their vectors"""
    similarity = 0
    word1=word1.lower()
    word2=word2.lower()
    if word1 in model and word2 in model:
        word1_vec = model[word1]
        word2_vec = model[word2]
        similarity = np.dot(word1_vec, word2_vec)
    return similarity

def get_word_list_from_aspect_candidates(seed_word, word_tf_idf, similarity_threshold, score_threshold):
    """Method 1: directly find the word list from all words whose similarity with the seed_word and tf-idf score are above 
    certain threshold"""
    word_list = []
    for word_data in word_tf_idf:
        word = word_data[0]
        tf_idf = word_data[1]
        if tf_idf > score_threshold:
            similarity = get_similarity(seed_word, word)
            if similarity > similarity_threshold:
                word_list.append([word, similarity, tf_idf])              
    word_list_sorted = sorted(word_list, key=lambda tup: tup[1], reverse=True)
    return word_list_sorted


def get_word_list_by_tf_idf(seed_word_list, sentence_list, num_words_in_list, sim_slope = 0.5, sim_intercept = 0.2):
    """Method 2: Find the word_list who can distinguish the chosen sentences from other sentences"""
    
    # For each seed_word in seed_word_list, get the word_data for all sentences. word_data has 4 fields [num_word_total, num_doc_total, 
    # num_word_in_topic, num_doc_in_topic, similarity with seed_word], the first two are from all sentences, and the latter two are from the sentences 
    # that contain the seed_word.   
    num_seed_word = len(seed_word_list)
    word_statistics_dic_list = [{} for i in range(num_seed_word)]
    
    num_sentence_topic_list = [0 for k in range(num_seed_word)]
    num_sentence_total_list = [0 for k in range(num_seed_word)]
    i = 0
#     print "Number of sentences processed:"
    for sentence in sentence_list: 
        i += 1
        if i % 50000 == 0:
            print i
        tokens = tokenize(sentence, stem = False)
        tokens_count = Counter(tokens)
        for word in tokens_count:
            # check for each seed_word:
            for k in range(num_seed_word):
                seed_word = seed_word_list[k]
                if seed_word in sentence: 
                    num_sentence_topic_list[k] += 1
                    num_sentence_total_list[k] += 1
                    if word not in word_statistics_dic_list[k]:
                        word_statistics_dic_list[k][word] = [tokens_count[word], 1, tokens_count[word], 1]
                    else:
                        word_statistics_dic_list[k][word][0] += tokens_count[word]
                        word_statistics_dic_list[k][word][1] += 1
                        word_statistics_dic_list[k][word][2] += tokens_count[word]
                        word_statistics_dic_list[k][word][3] += 1
                else:
                    num_sentence_total_list[k] += 1
                    if word not in word_statistics_dic_list[k]:
                        word_statistics_dic_list[k][word] = [tokens_count[word], 1, 0, 0]
                    else:
                        word_statistics_dic_list[k][word][0] += tokens_count[word]
                        word_statistics_dic_list[k][word][1] += 1

    # Get the maximum word frequency for each seed_word group:
    word_tf_idf_ratio_list = [[] for k in range(num_seed_word)]
    max_num_word_total_list =[0 for k in range(num_seed_word)]
    max_num_word_topic_list =[0 for k in range(num_seed_word)] 
    for k in range(num_seed_word):    
        for word in word_statistics_dic_list[k]:
            word_data = word_statistics_dic_list[k][word]
            if word_data[0] > max_num_word_total_list[k]:
                max_num_word_total_list[k] = word_data[0]
            if word_data[2] > max_num_word_topic_list[k]:
                max_num_word_topic_list[k] = word_data[2]
    
    # Get tf_idf adjusted ratio for each word, to measure how this word can distinguish the topic sentences:
    for k in range(num_seed_word):        
        for word in word_statistics_dic_list[k]:
            word_data = word_statistics_dic_list[k][word]
            num_word_total = word_data[0]
            num_word_topic = word_data[2]
            num_doc_total = word_data[1]   
            num_doc_topic = word_data[3]
        
            if num_doc_topic == 0 or num_doc_total == 0:
                word_tf_idf_ratio_list[k].append([word, 0, 0, 0])
                continue

            tf_topic = float(num_word_topic) / max_num_word_topic_list[k]
            tf_total = float(num_word_total) / max_num_word_total_list[k]
            tf_ratio = (tf_topic/num_word_topic) / (tf_total/num_doc_total)

            idf_topic = math.log(float(num_sentence_topic_list[k]) / num_doc_topic)
            idf_total = math.log(float(num_sentence_total_list[k]) / num_doc_total) 

            word_tf_idf_ratio_list[k].append([word, 0, 0, tf_topic * math.log(tf_ratio) * idf_total ** 2, tf_topic, tf_total, tf_ratio, idf_topic, idf_total])

        word_tf_idf_ratio_list[k].sort(key=lambda tup: tup[3], reverse=True)
 
    for k in range(num_seed_word): 
        for j in range(100):
            word_data = word_tf_idf_ratio_list[k][j]
            word = word_data[0]
            similarity = get_similarity(word, seed_word_list[k])
            sim_amplify = sim_intercept + similarity * sim_slope
            word_tf_idf_ratio_list[k][j][2] = sim_amplify
            argument = 1 + word_tf_idf_ratio_list[k][j][3] * sim_amplify
            if argument > 0:
                word_tf_idf_ratio_list[k][j][1] = math.log(argument) 
            else:
                word_tf_idf_ratio_list[k][j][1] = -1
        word_tf_idf_ratio_list[k].sort(key=lambda tup: tup[1], reverse=True)

    word_tf_idf_ratio_list = [item[:num_words_in_list] for item in word_tf_idf_ratio_list]
    word_list = {}
    for k in range(num_seed_word):
        word_list[seed_word_list[k]] = [[word_data[0],word_data[1]] for word_data in word_tf_idf_ratio_list[k][:num_words_in_list]]
    return word_list, word_tf_idf_ratio_list

In [253]:
seed_word_list = ["battery","pictures","price","zoom","ease of use","detection","design","video","quality","screen","size"] # Camera
seed_word_list = ['screen', "battery", "price", "keyboard", "wifi", "games", "touch", "quality","camera","video"] # Tablets
seed_word_list = ["service", "battery", "screen", "text", "apps", "card", "call", "plan", "price", "keyboard","data", "camera"] # no contract phones
sim_slope = 1
sim_intercept = 0.2
wordlist_dict, _ = get_word_list_by_tf_idf(seed_word_list, sentence_list, 10, sim_slope, sim_intercept)
num_seed_word = len(word_list)
for aspect in word_list:
    print '"{0}": '.format(aspect)
    print '  ',
    for word_data in word_list[aspect]:
        print '"%s", %0.2f; '%(word_data[0], word_data[1]),
    print

50000
100000
150000
200000
"service": 
   "service", 4.37;  "customer", 2.67;  "services", 2.01;  "phone", 1.90;  "minutes", 1.60;  "mobile", 1.52;  "month", 1.31;  "net", 1.28;  "days", 1.27;  "year", 1.20; 
"battery": 
   "battery", 4.35;  "charge", 1.83;  "phone", 1.83;  "life", 1.63;  "hours", 1.31;  "drain", 1.30;  "use", 1.29;  "remove", 1.26;  "screen", 1.23;  "fast", 1.17; 
"text": 
   "text", 4.75;  "texts", 3.57;  "messages", 2.75;  "message", 2.68;  "texting", 2.57;  "phone", 2.46;  "data", 2.33;  "keyboard", 2.28;  "calls", 2.02;  "web", 2.00; 
"screen": 
   "screen", 4.30;  "touch", 2.76;  "touchscreen", 2.05;  "phone", 1.96;  "screens", 1.94;  "keyboard", 1.90;  "camera", 1.55;  "button", 1.51;  "buttons", 1.33;  "size", 1.29; 
"apps": 
   "apps", 4.64;  "download", 2.29;  "phone", 2.15;  "memory", 2.13;  "app", 2.01;  "sd", 2.00;  "google", 1.99;  "card", 1.83;  "install", 1.80;  "storage", 1.64; 
"camera": 
   "camera", 4.77;  "video", 2.57;  "flash", 2.49;  "phone", 2.

In [266]:
def writeWordlistDictToDB(category, wordlist_dict):
    client, db = connect_to_db()
    query = {"category": list(category)}
    update_field = {"wordlist_dict": wordlist_dict}
    db.category_collection.update_one(query, {"$set": update_field}, True)
    client.close()

def getWordlistDictFromDB(category):
    client, db = connect_to_db()
    category_collection = db.category_collection
    query_res = list(category_collection.find({"category": category}))
    disconnect_db(client)

    if len(query_res) < 1:
        raise Exception('Category: {0} not found in database'.format(category))
    elif len(query_res) > 1:
        raise Exception('Category: {0} found multiple occurances in database'.format(category))

    result = query_res[0]
    wordlistDictWithWeights = result['wordlist_dict']
    wordlistDict = {}
    for aspect in wordlistDictWithWeights:
        wordlistDict[aspect] = [sublist[0] for sublist in wordlistDictWithWeights[aspect]]

    return wordlistDict

In [265]:
category = (u'Cell Phones & Accessories', u'Cell Phones', u'No-Contract Cell Phones')
writeWordlistDictToDB(category, wordlist_dict)