In [4]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import sys
#change to any directory you have to store the repo
sys.path.insert(1, '/home/ec2-user/SageMaker/github/aspect_topic_modeling')

from src.features.metric import diversity, get_topic_coherence
from models.atten_model import MODEL_ATT_COMP
import swifter
from src.models.utils import sinkhorn_torch
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
D = pd.read_csv("/home/ec2-user/SageMaker/github/aspect_topic_modeling/src/data/train.txt",header = None)
D.iloc[:,0] = D.iloc[:,0].astype(str)
sentences = [item.split() for item in D.iloc[:,0]]
#generate word 2 vec models
w2vmodel = Word2Vec(sentences,vector_size=200, window=10, negative = 5)




In [5]:
#generate input for the model
vocab = list(set([j for i in D.values for j in i[0].split(' ') if j in w2vmodel.wv]))
vocab = [''] + vocab
word_track = {i: ind for ind, i in enumerate(vocab)}
index_track = {ind: i for ind, i in enumerate(vocab)}
#pad the input

vocab_tensor = torch.Tensor([[0] * 200]  + [w2vmodel.wv[i] for i in vocab[1:]])
vocab_ind = [torch.LongTensor([word_track[it] for it in i if it in word_track][:16]) for i in sentences]
input = torch.nn.utils.rnn.pad_sequence(vocab_ind, batch_first=True, padding_value=0)

#preprocessing to calculate the coherehence
mlb = MultiLabelBinarizer()
XX = mlb.fit_transform([[word_track[it] for it in i if it in word_track]  for i in sentences] + [[0]])


In [31]:
sentences[1]

['like',
 'roll',
 'tiny',
 'order',
 'anyway',
 'often',
 'get',
 'order',
 'wrong',
 'stray',
 'menu']

In [16]:
import torch
import torch.nn as nn
from torch.nn.functional import normalize
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class MODEL_ATT_COMP(nn.Module):
    def __init__(self, d_word, d_key, d_value, n_topic, n, embeddings):
        super(MODEL_ATT_COMP, self).__init__()
        self.embeddings = nn.Embedding(len(embeddings), 200)
        self.embeddings.weight = torch.nn.Parameter(embeddings)
        self.embeddings.weight.requires_grad = False
        self.K = nn.Linear(d_word,d_key)
        self.Q = nn.Linear(d_word,d_key)
        self.V = nn.Linear(d_word,d_value)
        self.V2T = nn.Linear(d_value,n_topic)
        self.T2L = nn.Linear(n, 1)
        self.L2T = nn.Linear(1, n)
        self.soft1 = nn.Softmax(dim = 2)
        self.T2V = nn.Linear(n_topic,d_value)
        self.V2W = nn.Linear(d_value,d_word)
        self.sqrtdk = torch.tensor([d_key**0.5]).to(device)
        self.soft2 = nn.Softmax(dim = 1)
    
    def loss_max_margin_neg_sample(self, x):
        """Maximize word level embedding reconstruction with negative sampling
        """
        word_repre_x = normalize(self.word_repre, dim = 2) #batch n dvalue
        value_recon_x = normalize(self.value_recon, dim = 2) #batch n dvalue
        sim_matrix = torch.matmul(word_repre_x, value_recon_x.transpose(2,1)) #batch n n
        sim_x = torch.diagonal(sim_matrix, 0, 1, 2) #batch n 
        ns = torch.randperm(sim_x.shape[1]) # n 
        loss =  1 - sim_x + torch.diagonal(sim_matrix[:, ns], 0, 1, 2)
        loss = loss.mean(1)  #batch 
        return loss

    def loss_word_prediction_no_self(self, x):
        """Maximize word distribution reconstruction without given word
        """
        word_recon_no_self_normalized = normalize(self.word_recon_no_self, dim = 2) #batch n d_word
        x_normalized = normalize(x, dim = 2).transpose(2,1) #batch d_word n 
        sim_matrix = torch.matmul(word_recon_no_self_normalized, x_normalized) #batch n n
        return 1 - torch.diagonal(sim_matrix, 0, 1, 2).mean(1) #batch 

    def reconstruction_loss(self):
        """Sparsity/Entropy loss to make sure each word goes to 1 topic
        """
        distribution = self.topic
        return - torch.log(distribution) * distribution
    
    def sinkhorn_distance(self, lambda_sh = 10):
        """Make sure sentence goes to less topics and documents are uniformly distributed to each topics
        """
        d1, d2, d3 = self.topic.shape
        sentence_topic = self.topic.reshape(d1, d2)
        a = torch.ones(sentence_topic.shape[0]).to(device) * sentence_topic.shape[1] / sentence_topic.shape[0] #batch dimension
        b = torch.ones(sentence_topic.shape[1]).to(device)  #topics dimension 
        #print(a.shape, b.shape)
        return sinkhorn_torch( - torch.log(sentence_topic + 1e-6), a, b, lambda_sh).sum()
    
    
    def similarity_loss(self):
        """If word has high attention on another word, they should have similar topics distribution
        """
        d1, d2, d3 = self.att_weight.shape
        normal_weights = self.att_weight.reshape(-1, d3) # batch * n n
        #print(normal_weights.shape)
        samples = torch.multinomial(normal_weights, 1).reshape(-1) #batch * n
        normalize_weights = normalize(self.topic_weight, dim = 2)
        #print(normalize_weights.shape)
        topic_similarity = torch.matmul(normalize_weights, normalize_weights.transpose(1,2)).reshape(d1*d2, -1) #batch n n
        #print(topic_similarity.shape, samples.shape)
        return 1 - topic_similarity[torch.arange(topic_similarity.shape[0]), samples].reshape(d1, d2).mean(1) #batch n
         
    def word_topics(self):
        x = self.embeddings.weight
        self.soft2 = nn.Softmax(dim = 1)
        self.k = self.K(x).transpose(0,1) #d_key n 
        self.q = self.Q(x) #n d_key
        self.att_score = torch.matmul(self.q, self.k) #n n
        self.att_weight = self.soft2(self.att_score/self.sqrtdk) #n n, row sum = 1
        self.v = self.V(x) #n d_key
        self.word_repre = torch.matmul(self.att_weight, self.v) #batch n d_value
        self.topic_score = self.V2T(self.word_repre) #n n_topic
        self.word2topic = self.soft2(self.topic_score) #n n_topic, row sum = 1       
        return self.word2topic
    
    def forward(self,x):
        '''
        x: tensor, batch by n
        Output: a dictionary that contains different loss
        '''
        x = self.embeddings(x) #batch n d_word
        self.k = self.K(x).transpose(2,1) #batch d_key n 
        self.q = self.Q(x) #batch n d_key
        self.att_score = torch.matmul(self.q, self.k) #batch n n
        self.att_weight = self.soft1(self.att_score/self.sqrtdk) #batch n n, row sum = 1
        self.v = self.V(x) #batch n d_key
        self.word_repre = torch.matmul(self.att_weight, self.v) #batch n d_value
        self.topic_score = self.V2T(self.word_repre) #batch n n_topic
        self.topic_weight = self.soft1(self.topic_score) #batch n n_topic , row sum = 1
        #print(self.topic_weight.shape)
        self.topic = self.soft2(self.T2L(self.topic_weight.transpose(2,1))) #batch n_topic 1
        #print(self.topic.shape)
        self.topic_recon = self.L2T(self.topic).transpose(2,1) #batch  n n_topic
        #print(self.topic_recon.shape)
        self.value_recon = self.T2V(self.topic_recon) #batch n d_value
        self.word_recon = self.V2W(self.word_repre)#batch n d_word
        #no self computation, effectively masked
        #print(self.k.shape, self.att_score.shape, self.att_weight.shape, self.word_repre.shape, self.topic_weight.shape)
        self.att_score_no_self = self.att_score -  torch.diag(torch.zeros(self.att_score.shape[1])+torch.tensor(float('inf'))).to(device)#batch n n
        self.att_weight_no_self = self.soft1(self.att_score_no_self/self.sqrtdk) #batch n n 
        self.word_repre_no_self = torch.matmul(self.att_weight_no_self, self.v)#batch n d_key
        self.word_recon_no_self = self.V2W(self.word_repre_no_self) #batch n d_word
        word_pred_loss = self.loss_word_prediction_no_self(x).sum()
        margin_loss = self.loss_max_margin_neg_sample(x).sum()
        recon_loss = self.reconstruction_loss().sum()
        sim_loss = self.similarity_loss().sum()
        sinkhorn_loss = self.sinkhorn_distance()
        return {
            'loss' : word_pred_loss + margin_loss,
            'margin_loss': margin_loss,
            'word_loss': word_pred_loss,
            'reconstruct_loss': recon_loss,
            'similarity_loss': sim_loss,
            'sinkhorn_loss': sinkhorn_loss
            
        }

In [27]:
#--------------------------------------------
#model param
#--------------------------------------------
d_word = w2vmodel.vector_size
n_topic = 14
batch_size = 256
d_key = 50
d_value = 50
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


#---------------------------------------
#train model
#---------------------------------------
np.random.seed(10)
model = MODEL_ATT_COMP(d_key = d_key, d_word = d_word, n_topic = n_topic, d_value = d_value, n = 16, embeddings = vocab_tensor)
model.to(device)
learning_rate = 5 * 1e-5
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
margin_loss, word_loss, similarity_loss, sinkhorn_loss = 0, 0, 0, 0
for i in range(4000):
    idx_batch = np.random.choice(np.arange(D.shape[0]),batch_size, replace = False)
    x = input[idx_batch].to(device)
    d = model.forward(x)
    loss = d['loss']  +  d['sinkhorn_loss'] + d['similarity_loss']
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    margin_loss += d['margin_loss']
    word_loss += d['word_loss']
    similarity_loss += d['similarity_loss']
    sinkhorn_loss += d['sinkhorn_loss']
    if i % 100 == 0:
        print(margin_loss/100, word_loss/100, similarity_loss/100, sinkhorn_loss/100)
        margin_loss, word_loss, similarity_loss, sinkhorn_loss = 0, 0, 0, 0

tensor(2.5602, device='cuda:0', grad_fn=<DivBackward0>) tensor(2.5661, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0002, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.3695, device='cuda:0', grad_fn=<DivBackward0>)
tensor(255.9141, device='cuda:0', grad_fn=<DivBackward0>) tensor(251.8075, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0181, device='cuda:0', grad_fn=<DivBackward0>) tensor(36.9463, device='cuda:0', grad_fn=<DivBackward0>)
tensor(255.7092, device='cuda:0', grad_fn=<DivBackward0>) tensor(240.7245, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0865, device='cuda:0', grad_fn=<DivBackward0>) tensor(36.9461, device='cuda:0', grad_fn=<DivBackward0>)
tensor(255.1251, device='cuda:0', grad_fn=<DivBackward0>) tensor(230.5776, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.1839, device='cuda:0', grad_fn=<DivBackward0>) tensor(36.9458, device='cuda:0', grad_fn=<DivBackward0>)
tensor(254.6742, device='cuda:0', grad_fn=<DivBackward0>) tensor(223.4379, device='cuda:0

tensor(222.1388, device='cuda:0', grad_fn=<DivBackward0>) tensor(201.1265, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0034, device='cuda:0', grad_fn=<DivBackward0>) tensor(36.9467, device='cuda:0', grad_fn=<DivBackward0>)
tensor(225.7466, device='cuda:0', grad_fn=<DivBackward0>) tensor(201.5566, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0031, device='cuda:0', grad_fn=<DivBackward0>) tensor(36.9468, device='cuda:0', grad_fn=<DivBackward0>)
tensor(220.2372, device='cuda:0', grad_fn=<DivBackward0>) tensor(201.2594, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0028, device='cuda:0', grad_fn=<DivBackward0>) tensor(36.9468, device='cuda:0', grad_fn=<DivBackward0>)
tensor(217.3984, device='cuda:0', grad_fn=<DivBackward0>) tensor(200.8024, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0026, device='cuda:0', grad_fn=<DivBackward0>) tensor(36.9468, device='cuda:0', grad_fn=<DivBackward0>)


In [235]:
emb[1]

tensor([0.0796, 0.0745, 0.0747, 0.0716, 0.0590, 0.0697, 0.0756, 0.0811, 0.0744,
        0.0712, 0.0703, 0.0635, 0.0646, 0.0703], grad_fn=<SelectBackward>)

In [227]:
topics

[['dark',
  'lighting',
  'banquette',
  'window',
  'lit',
  'scene',
  'brick',
  'ceiling',
  'wood',
  'wall'],
 ['10', 'min', 'tasty', '20', 'arrived', '30', '15', '45', 'waited', 'yummy'],
 ['exposed',
  'floor',
  'window',
  'banquette',
  'lit',
  'scene',
  'wall',
  'ceiling',
  'wood',
  'brick'],
 ['american',
  'authentic',
  'simple',
  'cuisine',
  'traditional',
  'presentation',
  'fare',
  'combination',
  'flavor',
  'ingredient'],
 ['cocktail',
  'dessert',
  'sangria',
  'course',
  'appetizer',
  'wine',
  'fixe',
  'prix',
  'selection',
  'entree'],
 ['chair',
  'floor',
  'banquette',
  'window',
  'lit',
  'scene',
  'brick',
  'ceiling',
  'wood',
  'wall'],
 ['creative',
  'thai',
  'fare',
  'quality',
  'american',
  'fusion',
  'asian',
  'japanese',
  'authentic',
  'cuisine'],
 ['crab',
  'tender',
  'sweet',
  'mashed',
  'garlic',
  'cake',
  'dry',
  'banana',
  'cream',
  'chocolate'],
 ['sunday',
  'great',
  'friendly',
  'attentive',
  'birthday

In [28]:
#report results for coherence and diversity
del emb
gc.collect()
emb = model.word_topics().cpu()
topics = [[vocab[j] for j in i] for i in emb.argsort(0)[-10:, :].t().detach().cpu().numpy() ]
coherences= get_topic_coherence(XX, topics, word_track)
np.mean(diversity(topics)), coherences

(0.34285714285714286, -0.014083204172458997)

In [25]:
margin_loss, word_loss, similarity_loss, sinkhorn_loss = 0, 0, 0, 0
for i in range(4000):
    idx_batch = np.random.choice(np.arange(D.shape[0]),batch_size, replace = False)
    x = input[idx_batch].to(device)
    d = model.forward(x)
    loss = d['loss'] +  d['sinkhorn_loss'] + d['similarity_loss']
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    margin_loss += d['margin_loss']
    word_loss += d['word_loss']
    similarity_loss += d['similarity_loss']
    sinkhorn_loss += d['sinkhorn_loss']
    if i % 100 == 0:
        print(margin_loss/100, word_loss/100, similarity_loss/100, sinkhorn_loss/100)
        margin_loss, word_loss, similarity_loss, sinkhorn_loss = 0, 0, 0, 0


tensor(2.4330, device='cuda:0', grad_fn=<DivBackward0>) tensor(1.9773, device='cuda:0', grad_fn=<DivBackward0>) tensor(5.5597e-05, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.3695, device='cuda:0', grad_fn=<DivBackward0>)
tensor(175.3267, device='cuda:0', grad_fn=<DivBackward0>) tensor(200.1168, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0054, device='cuda:0', grad_fn=<DivBackward0>) tensor(36.9463, device='cuda:0', grad_fn=<DivBackward0>)
tensor(170.5996, device='cuda:0', grad_fn=<DivBackward0>) tensor(199.7471, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0054, device='cuda:0', grad_fn=<DivBackward0>) tensor(36.9463, device='cuda:0', grad_fn=<DivBackward0>)
tensor(177.2196, device='cuda:0', grad_fn=<DivBackward0>) tensor(199.7489, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0054, device='cuda:0', grad_fn=<DivBackward0>) tensor(36.9462, device='cuda:0', grad_fn=<DivBackward0>)
tensor(164.9298, device='cuda:0', grad_fn=<DivBackward0>) tensor(199.4720, device='cu

KeyboardInterrupt: 

In [15]:
emb[0]

tensor([0.0023, 0.0069, 0.0038, 0.0069, 0.0022, 0.0052, 0.9222, 0.0010, 0.0029,
        0.0031, 0.0044, 0.0296, 0.0082, 0.0012], grad_fn=<SelectBackward>)

In [26]:
del emb
import gc
gc.collect()
emb = model.word_topics().cpu()
topics = [[vocab[j] for j in i] for i in emb.argsort(0)[-10:, :].t().detach().cpu().numpy() ]
coherences= get_topic_coherence(XX, topics, word_track)
np.mean(diversity(topics)), coherences

(0.37857142857142856, -0.48443663506943274)

In [22]:
topics = [[vocab[j] for j in i] for i in emb.argsort(0)[-10:, :].t().detach().cpu().numpy() ]
print(np.mean(diversity(topics)))
coherences= get_topic_coherence(XX, topics, word_track)
coherences

0.42857142857142855


0.15679243820582905

In [81]:
for i in range(2000):
    idx_batch = np.random.choice(np.arange(D.shape[0]),batch_size, replace = False)
    x = input[idx_batch].to(device)
    d = model.forward(x)
    loss = d['loss'] + d['similarity_loss'] + 10 * d['sinkhorn_loss']
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if i % 100 == 0:
        print(d['loss'], d['margin_loss'], d['word_loss'], d['similarity_loss'], d['sinkhorn_loss'])


tensor(360.7344, device='cuda:0', grad_fn=<AddBackward0>) tensor(146.9851, device='cuda:0', grad_fn=<SumBackward0>) tensor(213.7492, device='cuda:0', grad_fn=<SumBackward0>) tensor(0.6848, device='cuda:0', grad_fn=<SumBackward0>) tensor(36.9289, device='cuda:0', grad_fn=<SumBackward0>)
tensor(353.1934, device='cuda:0', grad_fn=<AddBackward0>) tensor(141.2190, device='cuda:0', grad_fn=<SumBackward0>) tensor(211.9745, device='cuda:0', grad_fn=<SumBackward0>) tensor(3.8667, device='cuda:0', grad_fn=<SumBackward0>) tensor(36.0786, device='cuda:0', grad_fn=<SumBackward0>)
tensor(307.4585, device='cuda:0', grad_fn=<AddBackward0>) tensor(91.4383, device='cuda:0', grad_fn=<SumBackward0>) tensor(216.0202, device='cuda:0', grad_fn=<SumBackward0>) tensor(6.4207, device='cuda:0', grad_fn=<SumBackward0>) tensor(35.5880, device='cuda:0', grad_fn=<SumBackward0>)
tensor(294.3574, device='cuda:0', grad_fn=<AddBackward0>) tensor(81.3318, device='cuda:0', grad_fn=<SumBackward0>) tensor(213.0256, device='

In [82]:
del emb
gc.collect()
emb = model.word_topics().cpu()
topics = [[vocab[j] for j in i] for i in emb.argsort(0)[-10:, :].t().detach().cpu().numpy() ]
coherences= get_topic_coherence(XX, topics, word_track)
np.mean(diversity(topics)), coherences

(0.5428571428571428, -0.03780059480179641)

In [158]:
collections.Counter([j for i in topics for j in i]).most_common()

[('tuna', 4),
 ('mushroom', 4),
 ('roasted', 4),
 ('tomato', 4),
 ('grilled', 4),
 ('sauce', 4),
 ('business', 3),
 ('people', 3),
 ('around', 3),
 ('decided', 3),
 ('tender', 3),
 ('gras', 3),
 ('salad', 3),
 ('onion', 3),
 ('bathroom', 2),
 ('seating', 2),
 ('young', 2),
 ('movie', 2),
 ('crowd', 2),
 ('chair', 2),
 ('lounge', 2),
 ('street', 2),
 ('open', 2),
 ('see', 2),
 ('ago', 2),
 ('since', 2),
 ('creamy', 2),
 ('better', 2),
 ('cheap', 2),
 ('aged', 1),
 ('smooth', 1),
 ('oven', 1),
 ('color', 1),
 ('touch', 1),
 ('warm', 1),
 ('flower', 1),
 ('beautifully', 1),
 ('wood', 1),
 ('presentation', 1),
 ('cuisine', 1),
 ('citysearch', 1),
 ('week', 1),
 ('year', 1),
 ('advance', 1),
 ('wife', 1),
 ('wedding', 1),
 ('event', 1),
 ('celebrate', 1),
 ('greeted', 1),
 ('evening', 1),
 ('anniversary', 1),
 ('birthday', 1),
 ('reservation', 1),
 ('burnt', 1),
 ('hamburger', 1),
 ('medium', 1),
 ('plate', 1),
 ('dessert', 1),
 ('bland', 1),
 ('stick', 1),
 ('slice', 1),
 ('bbq', 1),
 ('tr