# Attribute-based Text Generation from Amazon Product Reviews

## Library Imports

In [None]:
from numpy import zeros, concatenate

from IPython.display import display, HTML

## Notification JavaScript snippets

In [None]:
def browser_alert(message):
    """Show a blocking browser ``alert`` dialog containing *message*.

    The message is concatenated verbatim into a JavaScript string literal,
    so it must not contain double quotes or other characters that would
    terminate that literal.
    """
    script_tag = '<script type="text/javascript">alert("' + message + '");</script>'
    display(HTML(script_tag))
    
def browser_notify(message):
    """Raise a desktop notification titled "Jupyter Notification".

    Emits a script tag that constructs a JavaScript ``Notification`` whose
    body is *message* and whose icon is the Jupyter logo URL below.

    This function never requests notification permission itself, so the
    browser must already allow notifications for the notebook page.

    The message is concatenated verbatim into a JavaScript string literal,
    so it must not contain double quotes or other characters that would
    terminate that literal.
    """
    # The pieces are joined inside the call parentheses, so no explicit
    # line-continuation characters are needed.
    display(HTML('<script type="text/javascript">var notification=new Notification("' +
                 'Jupyter Notification",{icon:"http://blog.jupyter.org/content/' +
                 'images/2015/02/jupyter-sq-text.png",body:"' + message +
                 '"});</script>'))

In [None]:
# browser_notify("test")

## Reading Data

In [None]:
# Root directory of the review dataset. This is a hard-coded absolute path;
# adjust it for your environment. Keep the trailing slash: the individual
# file paths are formed by plain string concatenation onto this value.
dataset_path = "/home/v2john/Tools/attr-reviews-dataset/"

In [None]:
# Full paths of the dataset splits and the vocabulary file, each formed by
# appending the file name to dataset_path.
dev_set_path, train_set_path, test_set_path, vocab_path = (
    dataset_path + file_name
    for file_name in ("dev.txt", "train.txt", "test.txt", "vocab.txt")
)

In [None]:
def build_vocab_index(vocab_file_path):
    """Map each vocabulary word to an integer index.

    The first whitespace-separated token of every non-blank line is taken
    as the word; any further tokens on the line are ignored. Indices start
    at 0 and increase by one per non-blank line, in file order.

    Blank or whitespace-only lines are skipped and do not consume an index.
    If a word occurs on more than one line, the index of its last
    occurrence is kept.

    Args:
        vocab_file_path: path of the vocabulary text file.

    Returns:
        dict mapping word (str) to index (int).
    """
    vocab_index = dict()
    current_index = 0

    with open(vocab_file_path) as vocab_file:
        for line in vocab_file:
            tokens = line.split()
            if not tokens:
                # Nothing to index on an empty line; indexing tokens[0]
                # here would raise IndexError.
                continue
            vocab_index[tokens[0]] = current_index
            current_index += 1

    return vocab_index


def build_user_and_product_indices(review_file_path):
    """Assign a dense integer index to every user ID and product ID in a review file.

    Each non-blank line is split on whitespace; the first field is read as
    the user ID and the second as the product ID. IDs are numbered from 0
    in order of first appearance, separately for users and products.

    Blank or whitespace-only lines are skipped.

    Args:
        review_file_path: path of the review text file.

    Returns:
        Tuple ``(user_index, product_index)`` of dicts mapping ID (str) to
        index (int).

    Raises:
        IndexError: if a non-blank line has fewer than two fields.
    """
    user_index = dict()
    product_index = dict()

    with open(review_file_path) as review_file:
        for line in review_file:
            split_line = line.split()
            if not split_line:
                # Skip empty lines instead of failing on split_line[0].
                continue

            user_id = split_line[0]
            product_id = split_line[1]

            # len(index) is always the next unused index, so setdefault
            # numbers unseen IDs consecutively and leaves known ones alone.
            user_index.setdefault(user_id, len(user_index))
            product_index.setdefault(product_id, len(product_index))

    return user_index, product_index


def build_rating_index():
    """Return a dict mapping the float ratings 1.0 .. 5.0 to the indices 0 .. 4."""
    return {float(position + 1): position for position in range(5)}

In [None]:
# Word -> integer index, read from the vocabulary file.
vocab_index = build_vocab_index(vocab_path)

In [None]:
# NOTE: both indices are built from the dev set only, so a user or product ID
# that does not occur in dev.txt has no index and cannot be one-hot encoded.
user_index, product_index = build_user_and_product_indices(dev_set_path)

In [None]:
# Ratings 1.0 .. 5.0 -> indices 0 .. 4.
rating_index = build_rating_index()

In [None]:
# Signal in the browser that the index-building cells above have finished.
browser_notify("Indices built")

## Build attribute and text-embedding versions of the training data

In [None]:
def get_one_hot_vector_embedding(term_index, term):
    """Return a one-hot vector for *term* under the given index mapping.

    The vector has ``len(term_index)`` entries, all zero except for a 1 at
    position ``term_index[term]``.

    Raises:
        KeyError: if *term* is not a key of *term_index*.
    """
    hot_position = term_index[term]
    one_hot = zeros(len(term_index))
    one_hot[hot_position] = 1
    return one_hot

In [None]:
def get_attribute_vectors_and_text_embedding_sequences(review_file_path):
    """Build one attribute vector per review line of a review file.

    Each non-blank line is split on whitespace; the first three fields are
    read as user ID, product ID and rating. The attribute vector is the
    concatenation of the one-hot encodings of those three values, looked up
    in the module-level ``user_index``, ``product_index`` and
    ``rating_index``.

    Despite the function name, only attribute vectors are returned; the
    review text (the fields after the rating) is not used yet.

    Blank or whitespace-only lines are skipped.

    Args:
        review_file_path: path of the review text file.

    Returns:
        list of 1-D numpy arrays, one per review, in file order.

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
        ValueError: if the rating field cannot be parsed as a float.
        KeyError: if the user ID, product ID or rating has no entry in the
            corresponding index.
    """
    x_train_att = list()

    with open(review_file_path) as review_file:
        for line in review_file:
            split_line = line.split()
            if not split_line:
                # Skip empty lines instead of failing on split_line[0].
                continue

            user_id = split_line[0]
            product_id = split_line[1]
            rating = float(split_line[2])
            # Fields from split_line[3] onward hold the review text, which
            # is not consumed here; reviews with no text are still encoded.

            attribute_vector = concatenate((
                get_one_hot_vector_embedding(user_index, user_id),
                get_one_hot_vector_embedding(product_index, product_id),
                get_one_hot_vector_embedding(rating_index, rating),
            ))

            x_train_att.append(attribute_vector)

    return x_train_att

In [None]:
# Attribute vectors are built from the dev set, the same file that was used to
# build user_index and product_index.
x_train_att = get_attribute_vectors_and_text_embedding_sequences(dev_set_path)