In [1]:
# !pip install nltk pandas numpy scipy keras tensorflow scikit-learn
# nltk.download('stopwords')
# nltk.download('wordnet')

In [2]:
import json
import nltk
import string
import copy
import pandas as pd
import numpy as np
import keras.backend as K

from keras import regularizers
from keras.models import Model
from numpy import linalg as LA
from nltk.corpus import stopwords
from scipy.special import gammaln
from keras.models import Sequential
from scipy.sparse import csr_matrix
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.layers import Dense, Activation, Embedding, LSTM

Using TensorFlow backend.
Using TensorFlow backend.


In [3]:
def preprocess(pd):
    """Clean a pandas Series of raw text and return a Series of cleaned strings.

    Steps, in order: lower-case, replace every ASCII punctuation character
    with a space, tokenize with the module-level ``w_tokenizer``, lemmatize
    each token with the module-level ``lemmatizer``, drop tokens found in the
    module-level ``stop_words``, and join the survivors with single spaces.

    Parameters
    ----------
    pd : pandas.Series of str
        The texts to clean. The name is kept for backward compatibility; note
        that it hides the ``pandas`` module alias inside this function.

    Returns
    -------
    pandas.Series of str
        One cleaned, space-joined string per input row.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    # Escape the punctuation before putting it in a character class; unescaped,
    # the `\]` pair inside string.punctuation is read as an escaped bracket and
    # the backslash itself is never stripped.
    punct_pattern = '[{}]'.format(re.escape(string.punctuation))
    # Set membership is O(1); the stop-word collection is consulted per token.
    stop_set = set(stop_words)

    text = pd.str.lower()
    # regex=True is explicit because the default of Series.str.replace changed
    # to a literal match in pandas 2.0, which would leave punctuation in place.
    text = text.str.replace(punct_pattern, ' ', regex=True)
    tokens = text.apply(
        lambda doc: [lemmatizer.lemmatize(tok) for tok in w_tokenizer.tokenize(doc)]
    )
    tokens = tokens.apply(lambda toks: [tok for tok in toks if tok not in stop_set])
    return tokens.str.join(' ')

In [4]:
def get_x_lstm(max_vocab, vocab):
    """Turn documents into a padded matrix of integer word indices.

    A Keras ``Tokenizer`` is fitted on ``vocab`` and the same documents are
    then converted to index sequences and padded with ``pad_sequences``
    (default settings).

    Parameters
    ----------
    max_vocab : int
        Passed to the tokenizer as ``num_words``, the cap on the vocabulary
        used when converting texts to sequences.
    vocab : iterable of str
        The documents; used both to fit the tokenizer and as the texts that
        are converted.

    Returns
    -------
    The array returned by ``pad_sequences`` for the converted documents.
    """
    # `nb_words` is the Keras 1 spelling; Keras 2 renamed it to `num_words`.
    tokenizer = Tokenizer(num_words=max_vocab, lower=True, split=' ')
    tokenizer.fit_on_texts(vocab)
    sequences = tokenizer.texts_to_sequences(vocab)
    return pad_sequences(sequences)

In [5]:
# Shared NLP helpers read as globals by preprocess():
#   stop_words  - NLTK's English stop-word list
#   w_tokenizer - splits a string on whitespace
#   lemmatizer  - WordNet-based lemmatizer
stop_words = stopwords.words('english')
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

In [6]:
# Load the review dump: the file holds one JSON object per line.
review_records = []
# Explicit UTF-8 so decoding does not depend on the platform's locale default.
with open("Automotive_5.json", 'r', encoding="utf-8") as f:
    # Iterate the handle directly instead of readlines() so the raw text is
    # never held in memory twice.
    for line in f:
        # A blank (e.g. trailing) line is not a JSON document; skip it rather
        # than let json.loads raise.
        if line.strip():
            review_records.append(json.loads(line))
dataset = pd.DataFrame(review_records)

In [7]:
# Distinct reviewer and product identifiers. An identifier's position in these
# lists is used later as its row (user) / column (item) in R_rating.
U_user_ids = list(dataset["reviewerID"].unique())
V_item_ids = list(dataset["asin"].unique())

In [8]:
# Number of distinct users (M) and distinct items (N).
M_users_count, N_items_count = len(U_user_ids), len(V_item_ids)

In [9]:
# Display (number of users, number of items) as the cell output.
M_users_count, N_items_count

(2928, 1835)

(2928, 1835)

In [10]:
# Dense user x item rating matrix; a cell stays 0 when no review exists for
# that (user, item) pair. Later cells treat `R_rating > 0` as "observed".
R_rating = np.zeros((M_users_count, N_items_count))

# Build id -> position lookups once. Calling list.index() inside the loop
# rescans the whole id list for every review.
user_row = {user_id: row for row, user_id in enumerate(U_user_ids)}
item_col = {item_id: col for col, item_id in enumerate(V_item_ids)}

for reviewer_id, rating, item_id in dataset[["reviewerID", "overall", "asin"]].values:
    R_rating[user_row[reviewer_id], item_col[item_id]] = rating

# Rescale by the constant 5 (presumably the top of the rating scale, giving
# values in [0, 1] - confirm against the data).
R_rating = R_rating/5

In [11]:
# Gather every review text of an item into one document per item, in the
# column order defined by V_item_ids.
item_col = {item_id: col for col, item_id in enumerate(V_item_ids)}
reviews_per_item = [[] for _ in range(N_items_count)]

for item_id, review_text in dataset[["asin", "reviewText"]].values:
    reviews_per_item[item_col[item_id]].append(review_text)

# Join with a space: plain concatenation glues the last word of one review to
# the first word of the next, creating tokens that exist in neither review.
D_combined_reviews = pd.Series([" ".join(texts) for texts in reviews_per_item])
D_combined_reviews = preprocess(D_combined_reviews)
D_combined_reviews.shape

(1835,)

(1835,)

## PMF

In [12]:
# Hyper-parameters shared by the cells below.
lambda_u = 0.1          # weight of the user-factor penalty in the loss / gradient
lambda_v = 0.1          # weight of the item-factor penalty in the loss / gradient
numtopics = 5           # number of latent factors / topics
MAX_VOCAB_SIZE = 100    # vocabulary cap for the tokenizer, embedding and Phi_weights

In [13]:
# Seed NumPy's global generator so the random starting point (and everything
# drawn from that generator afterwards) is identical on every
# Restart Kernel -> Run All.
np.random.seed(0)

# Latent factors, one column per user / item, uniform in [0, 1).
U_user_weights = np.random.rand(numtopics, M_users_count)
V_item_weights = np.random.rand(numtopics, N_items_count)

In [14]:
# Display the shapes of the two factor matrices as the cell output.
U_user_weights.shape, V_item_weights.shape

((5, 2928), (5, 1835))

((5, 2928), (5, 1835))

## HFT

In [15]:
# NOTE(review): presumably the Dirichlet hyper-parameters of the topic model;
# no other cell in this view reads `alpha` or `beta` - confirm they are needed.
alpha = np.ones(numtopics) * (10/numtopics)
beta = 0.01

In [16]:
# `lda_virgin` is a project-local module that is not part of this notebook.
# The sampler built here is later driven with `sampler.run(...)` and queried
# through `get_nmz()`, `get_nzw()` and `phi()`.
# NOTE(review): this import sits outside the notebook's import cell.
import lda_virgin
sampler = lda_virgin.LdaSampler(numtopics)

## LSTM

In [17]:
# LSTM encoder settings.
p_embedding_lstm = 200   # embedding dimension passed to the Embedding layer
lstm_out = 128           # number of LSTM units
# NOTE(review): the training cell calls model.fit(..., batch_size=128) with a
# literal, so this value is currently not used there.
batch_size = 8

In [18]:
# Padded word-index matrix of the per-item review documents; this is the
# input of the LSTM model.
X = get_x_lstm(MAX_VOCAB_SIZE, D_combined_reviews.values)



In [19]:
# Document encoder: word indices -> embedding -> LSTM -> latent vector.
model = Sequential([
    Embedding(MAX_VOCAB_SIZE, p_embedding_lstm, input_length=X.shape[1]),
    LSTM(lstm_out, dropout = 0.2),
    # The output width must equal the number of latent factors, because the
    # network is fitted against V_item_weights.T (one row of `numtopics`
    # values per item); a literal 5 silently breaks when numtopics changes.
    Dense(numtopics, activation='tanh', name ="doc_latent_vector", kernel_regularizer=regularizers.l2()),
])
# NOTE(review): 'accuracy' is reported alongside a mean-squared-error
# regression target; confirm the metric is wanted.
model.compile(loss = 'mean_squared_error', optimizer='rmsprop', metrics = ['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [20]:
# Print the layer-by-layer summary of the encoder.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2103, 200)         20000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               168448    
_________________________________________________________________
doc_latent_vector (Dense)    (None, 5)                 645       
Total params: 189,093
Trainable params: 189,093
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2103, 200)         20000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               168448    
_________________________________________________________________
doc_

In [21]:
def get_last_layer_op():
    """Return the activations of the ``doc_latent_vector`` layer for ``X``.

    A throw-away ``Model`` is wired from the global ``model``'s input to the
    output of its layer named ``doc_latent_vector`` and asked to predict on
    the global ``X``.
    """
    latent_layer = model.get_layer('doc_latent_vector')
    latent_model = Model(inputs=model.input, outputs=latent_layer.output)
    return latent_model.predict(X)

### Loss

In [22]:
def get_l1():
    """Sum of squared rating errors over the observed ratings.

    A rating counts as observed when ``R_rating[i][j] > 0``; its prediction is
    the dot product of user column ``i`` of ``U_user_weights`` and item column
    ``j`` of ``V_item_weights``.

    Returns
    -------
    float
        The summed squared residual (0.0 when nothing is observed).
    """
    # One matrix product replaces the Python double loop that evaluated a
    # separate np.dot for every (user, item) cell.
    observed = R_rating > 0
    predicted = np.dot(U_user_weights.T, V_item_weights)
    residual = (R_rating - predicted)[observed]
    return float(np.dot(residual, residual))

In [23]:
def get_l3():
    """Frobenius norm of the user factor matrix ``U_user_weights``.

    NOTE(review): this is the plain norm, while get_gradient_U uses a
    ``2 * lambda_u * U`` term, which is the derivative of the *squared* norm -
    confirm which one the loss is meant to report.
    """
    user_norm = LA.norm(U_user_weights, ord='fro')
    return user_norm

In [24]:
def get_l4():
    """Frobenius norm of the gap between item factors and encoder output.

    The gap is ``V_item_weights.T - get_last_layer_op()``; the encoder is
    evaluated anew on every call.
    """
    gap = V_item_weights.T - get_last_layer_op()
    return LA.norm(gap, ord='fro')

In [25]:
def get_total_loss():
    """Combine the loss terms: ``l1 + lambda_u * l3 + lambda_v * l4``."""
    data_term = get_l1()
    user_penalty = lambda_u * get_l3()
    item_penalty = lambda_v * get_l4()
    return data_term + user_penalty + item_penalty

## Gradient V

In [26]:
def processReviews(reviews, window=5, MAX_VOCAB_SIZE=1000):
    """Build a dense document-term count matrix from ``reviews``.

    Parameters
    ----------
    reviews : iterable of str
        The documents to vectorize.
    window, MAX_VOCAB_SIZE :
        Accepted for backward compatibility but not used: the vectorizer is
        created with its default vocabulary settings, so the vocabulary is
        NOT capped at ``MAX_VOCAB_SIZE``.

    Returns
    -------
    wordOccurenceMatrix : numpy.ndarray
        Dense count matrix, one row per document, one column per word.
    vocabulary : dict
        Maps each word to its column index.
    words : list
        The words in column order.
    """
    vectorizer = CountVectorizer(analyzer="word", tokenizer=None)
    train_data_features = vectorizer.fit_transform(reviews)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        words = list(vectorizer.get_feature_names_out())
    else:
        words = vectorizer.get_feature_names()

    vocabulary = dict(zip(words, np.arange(len(words))))
    wordOccurenceMatrix = train_data_features.toarray()
    return wordOccurenceMatrix, vocabulary, words

In [27]:
# Document-term counts of the per-item review documents, plus the
# word -> column mapping and the word list in column order.
matrix, vocabulary, words = processReviews(D_combined_reviews.values)

In [28]:
def get_gradient_V(sampler, lstm_last_layer):
    """Update direction for the item factors, one row per item.

    The training loop applies the result with ``V_item_weights += lr * g.T``,
    so every term is expressed as a descent direction:

    * data term: for each observed rating (``R_rating > 0``) the residual
      ``R - u.v`` times the user's factor vector;
    * encoder term: ``-2 * lambda_v * (v_j - lstm_last_layer[j])``, pulling
      each item vector towards the encoder output;
    * topic term: ``lambda_t * peakiness`` times the per-item sum of
      ``n_jk - N_j * dist_jk``, added to every component of the row.

    Parameters
    ----------
    sampler :
        Object providing ``get_nmz()`` and ``phi()``; both results are indexed
        by item row here.
    lstm_last_layer :
        Encoder output, indexed ``[item]`` and subtracted from
        ``V_item_weights.T``.

    Returns
    -------
    numpy.ndarray
        Array with one row per item and one column per latent factor.

    Notes
    -----
    The previous implementation accumulated into ``[0]*5`` with ``+=``; on a
    Python list that *extends* the list with the array's elements instead of
    adding element-wise, so rows came out ragged. It also wrote the two
    penalty terms with the sign of the loss gradient while the data term was
    already a descent direction, which made the ``+=`` step push the item
    vectors away from the encoder output.

    NOTE(review): the topic term sums over topics before it is applied, as the
    original did. If the per-item counts sum to ``N_j`` and ``dist`` rows sum
    to 1 this scalar is ~0; a per-topic vector may have been intended -
    confirm against the sampler's definitions of ``get_nmz()`` and ``phi()``.
    """
    peakiness = 1
    lambda_t = 0.01
    n_items = R_rating.shape[1]

    param_Nj = np.asarray(matrix.sum(axis=1), dtype=float).ravel()[:n_items]
    param_njk = np.asarray(sampler.get_nmz(), dtype=float)[:n_items]
    dt_distribution = np.asarray(sampler.phi(), dtype=float)[:n_items]

    # Residuals of observed ratings only; unobserved cells contribute nothing.
    observed = R_rating > 0
    residual = np.where(
        observed, R_rating - np.dot(U_user_weights.T, V_item_weights), 0.0
    )

    # (items x users) . (users x factors) -> one accumulated vector per item.
    step = np.dot(residual.T, U_user_weights.T)
    step -= 2 * lambda_v * (V_item_weights.T - lstm_last_layer)

    topic_pull = (param_njk - param_Nj[:, np.newaxis] * dt_distribution).sum(axis=1)
    step += (lambda_t * peakiness * topic_pull)[:, np.newaxis]
    return step

## Gradient U

In [29]:
def get_gradient_U():
    """Update direction for the user factors, one row per user.

    The training loop applies the result with ``U_user_weights += lr * g.T``,
    so both terms are expressed as a descent direction:

    * data term: for each observed rating (``R_rating > 0``) the residual
      ``R - u.v`` times the item's factor vector;
    * penalty term: ``-2 * lambda_u * u_i``, shrinking each user vector.

    Returns
    -------
    numpy.ndarray
        Array with one row per user and one column per latent factor.

    Notes
    -----
    The previous implementation accumulated into ``[0]*5`` with ``+=``; on a
    Python list that *extends* the list with the array's elements instead of
    adding element-wise, so rows came out ragged. It also added the penalty
    term with a positive sign, which under the ``+=`` update grows the user
    vectors instead of regularising them.
    """
    # Residuals of observed ratings only; unobserved cells contribute nothing.
    observed = R_rating > 0
    residual = np.where(
        observed, R_rating - np.dot(U_user_weights.T, V_item_weights), 0.0
    )

    # (users x items) . (items x factors) -> one accumulated vector per user.
    step = np.dot(residual, V_item_weights.T)
    step -= 2 * lambda_u * U_user_weights.T
    return step

## Gradient phi

In [30]:
def get_gradient_Phi(sampler, Phi_weights):
    """Gradient-style update for the word/topic weights ``Phi_weights``.

    For each word ``w`` below ``MAX_VOCAB_SIZE`` and topic ``k`` below
    ``numtopics`` the entry is

        counts[w, k] - totals[k] * exp(Phi[w, k]) / sum_k' exp(Phi[w, k'])

    where ``counts`` is ``sampler.get_nzw()`` and ``totals`` is the row sum of
    its transpose.

    NOTE(review): ``get_nzw()`` is indexed here as ``[word, topic]`` and the
    softmax normalises over the topics of one word - confirm both against the
    sampler's array layout and the intended model.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(MAX_VOCAB_SIZE, numtopics)``.
    """
    word_topic_counts = sampler.get_nzw()
    topic_totals = sampler.get_nzw().T.sum(axis=1)

    rows = []
    for w in range(MAX_VOCAB_SIZE):
        normaliser = np.exp(Phi_weights[w]).sum()
        rows.append([
            word_topic_counts[w, k] - (topic_totals[k] * np.exp(Phi_weights[w, k])/normaliser)
            for k in range(numtopics)
        ])
    return np.array(rows)

# RUN

In [31]:
# Step sizes for the factor updates (pmf) and the Phi update (hft).
learning_rate_pmf = 0.01
learning_rate_hft = 0.01
# NOTE(review): `maxiter_hft` is not read by any cell in this view.
maxiter_hft = 10
# Word/topic weights, uniform in [0, 1), one row per vocabulary slot.
Phi_weights = np.random.rand(MAX_VOCAB_SIZE, numtopics)

In [32]:
# Alternating optimisation: refresh the topic sampler, then take a few joint
# steps on the encoder, the user/item factors and the topic weights.
# The factor arrays are updated in place, so re-running this cell continues
# from the current state rather than from the initial weights.
iterations = 10
for i in range(iterations):
    print("="*30, "\niteration", i)
    sampling = sampler.run(matrix)
    # NOTE(review): the recorded run of this cell died with "'LdaSampler'
    # object has no attribute 'nmz'" right after this call. That is what
    # happens if run() is a generator function, because its body only executes
    # while it is iterated. Exhaust the result when it is an iterator; this is
    # a no-op for a run() that returns anything else. Confirm in lda_virgin.
    if hasattr(sampling, "__next__"):
        for _sweep in sampling:
            pass

    # Distinct name for the inner counter: reusing `i` overwrote the outer
    # iteration number while the inner steps were running.
    for inner_step in range(5):
        # NOTE(review): batch_size is a literal here; the `batch_size`
        # variable set in the LSTM config cell is not used.
        model.fit(X, V_item_weights.T, epochs=1, batch_size=128)
        lstm_last_layer = get_last_layer_op()

        print("\nExtracting Gradients...")
        gradient_v = get_gradient_V(sampler, lstm_last_layer)
        gradient_u = get_gradient_U()
        gradient_phi = get_gradient_Phi(sampler, Phi_weights)

        print("\nUpdating Gradients...")
        U_user_weights += learning_rate_pmf * gradient_u.T
        V_item_weights += learning_rate_pmf * gradient_v.T
        Phi_weights += learning_rate_hft * gradient_phi

        print(get_l1(), get_l3(), get_l4())

iteration 0
Instructions for updating:
Use tf.cast instead.
iteration 0
Instructions for updating:
Use tf.cast instead.
Epoch 1/1
Epoch 1/1

Extracting Gradients...

Extracting Gradients...


AttributeError: 'LdaSampler' object has no attribute 'nmz'

AttributeError: 'LdaSampler' object has no attribute 'nmz'