In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import sys
#change to any directory you have to store the repo
sys.path.insert(1, '/home/ec2-user/SageMaker/github/aspect_topic_modeling')

from src.features.metric import diversity, get_topic_coherence
from models.atten_model import MODEL_ATT_COMP
import swifter
from src.models.utils import get_wordnet_pos, remove_stopWords, get_emb, generate_emb, train, get_common_words, generate_bow


In [2]:
D = pd.read_csv("/home/ec2-user/SageMaker/github/aspect_topic_modeling/src/data/train.txt",header = None)
D.iloc[:,0] = D.iloc[:,0].astype(str)
sentences = [item.split() for item in D.iloc[:,0]]
#generate word 2 vec models
w2vmodel = Word2Vec(sentences,vector_size=200, window=10, negative = 5)




In [7]:
#generate input
vocab = list(set([j for i in D.values for j in i[0].split(' ') if j in w2vmodel.wv]))
vocab = [''] + vocab
word_track = {i: ind for ind, i in enumerate(vocab)}
index_track = {ind: i for ind, i in enumerate(vocab)}
D['index_num'] = D.swifter.apply(
            lambda x: [word_track[i] for i in x[0].split() if i in word_track], axis=1)
# X, indices = generate_bow(df = D, common_words = vocab)


In [95]:
XX.shape

(279885, 13590)

In [94]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
XX = mlb.fit_transform([[word_track[it] for it in i if it in word_track] + [0]  for i in sentences])


In [8]:
#pad the input

vocab_tensor = torch.Tensor([[0] * 200]  + [w2vmodel.wv[i] for i in vocab[1:]])
vocab_ind = [torch.LongTensor([word_track[it] for it in i if it in word_track][:16]) for i in sentences]
input = torch.nn.utils.rnn.pad_sequence(vocab_ind, batch_first=True, padding_value=0)

In [None]:
import torch
import torch.nn as nn
from torch.nn.functional import normalize
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class MODEL_ATT_COMP(nn.Module):
    def __init__(self, d_word, d_key, d_value,n_topic, embeddings):
        super(MODEL_ATT_COMP, self).__init__()
        self.embeddings = nn.Embedding(len(embeddings), 200)
        self.embeddings.weight = torch.nn.Parameter(embeddings)
        self.embeddings.weight.requires_grad = False
        self.K = nn.Linear(d_word,d_key)
        self.Q = nn.Linear(d_word,d_key)
        self.V = nn.Linear(d_word,d_value)
        self.V2T = nn.Linear(d_value,n_topic)
        self.soft1 = nn.Softmax(dim = 2)
        self.T2V = nn.Linear(n_topic,d_value)
        self.V2W = nn.Linear(d_value,d_word)
        self.sqrtdk = torch.tensor([d_key**0.5]).to(device)
    
    def loss_max_margin_neg_sample(self, x):
        word_repre_x = normalize(self.word_repre, dim = 2) #batch n dvalue
        value_recon_x = normalize(self.value_recon, dim = 2) #batch n dvalue
        sim_matrix = torch.matmul(word_repre_x, value_recon_x.transpose(2,1)) #batch n n
        sim_x = torch.diagonal(sim_matrix, 0, 1, 2) #batch n 
        ns = torch.randperm(sim_x.shape[1]) # n 
        loss =  1 - sim_x + torch.diagonal(sim_matrix[:, ns], 0, 1, 2)
        loss = loss.mean(1)  #batch 
        return loss

    def loss_word_prediction_no_self(self, x):
        word_recon_no_self_normalized = normalize(self.word_recon_no_self, dim = 2) #batch n d_word
        x_normalized = normalize(x, dim = 2).transpose(2,1) #batch d_word n 
        sim_matrix = torch.matmul(word_recon_no_self_normalized, x_normalized) #batch n n
        return 1 - torch.diagonal(sim_matrix, 0, 1, 2).mean(1) #batch 

    def reconstruction_loss(self):
        distribution = self.topic_weight
        return - torch.log(distribution) * distribution
    
    def similarity_loss(self):
        d1, d2, d3 = self.att_weight.shape
        normal_weights = self.att_weight.reshape(-1, d3) # batch * n n
        samples = torch.multinomial(normal_weights, 1).reshape(-1) #batch * n
        normalize_weights = normalize(self.topic_weight, dim = 2)
        topic_similarity = torch.matmul(normalize_weights, normalize_weights.transpose(1,2)).reshape(d1*d2, -1) #batch n n
        #print(topic_similarity.shape, samples.shape)
        return 1 - topic_similarity[torch.arange(topic_similarity.shape[0]), samples].reshape(d1, d2).mean(1) #batch n
         
    def word_topics(self):
        x = self.embeddings.weight
        self.soft2 = nn.Softmax(dim = 1)
        self.k = self.K(x).transpose(0,1) #d_key n 
        self.q = self.Q(x) #n d_key
        self.att_score = torch.matmul(self.q, self.k) #n n
        self.att_weight = self.soft2(self.att_score/self.sqrtdk) #n n, row sum = 1
        self.v = self.V(x) #n d_key
        self.word_repre = torch.matmul(self.att_weight, self.v) #batch n d_value
        self.topic_score = self.V2T(self.word_repre) #n n_topic
        self.word2topic = self.soft2(self.topic_score) #n n_topic, row sum = 1       
        return self.word2topic
    
    def forward(self,x):
        '''
        x: tensor, n by d_word
        '''
        x = self.embeddings(x) #batch n d_word
        self.k = self.K(x).transpose(2,1) #batch d_key n 
        self.q = self.Q(x) #batch n d_key
        self.att_score = torch.matmul(self.q, self.k) #batch n n
        self.att_weight = self.soft1(self.att_score/self.sqrtdk) #batch n n, row sum = 1
        self.v = self.V(x) #batch n d_key
        self.word_repre = torch.matmul(self.att_weight, self.v) #batch n d_value
        self.topic_score = self.V2T(self.word_repre) #batch n n_topic
        self.topic_weight = self.soft1(self.topic_score) #batch n n_topic, row sum = 1
        self.value_recon = self.T2V(self.topic_weight) #batch n d_value
        self.word_recon = self.V2W(self.word_repre)#batch n d_word
        #no self computation, effectively masked
        #print(self.k.shape, self.att_score.shape, self.att_weight.shape, self.word_repre.shape, self.topic_weight.shape)
        self.att_score_no_self = self.att_score -  torch.diag(torch.zeros(self.att_score.shape[1])+torch.tensor(float('inf'))).to(device)#batch n n
        self.att_weight_no_self = self.soft1(self.att_score_no_self/self.sqrtdk) #batch n n 
        self.word_repre_no_self = torch.matmul(self.att_weight_no_self, self.v)#batch n d_key
        self.word_recon_no_self = self.V2W(self.word_repre_no_self) #batch n d_word
        word_pred_loss = self.loss_word_prediction_no_self(x).sum()
        margin_loss = self.loss_max_margin_neg_sample(x).sum()
        recon_loss = self.reconstruction_loss().mean(1).sum()
        sim_loss = self.similarity_loss().sum()
        return {
            'loss' : word_pred_loss + margin_loss,
            'margin_loss': margin_loss,
            'word_loss': word_pred_loss,
            'reconstruct_loss': recon_loss,
            'similarity_loss': sim_loss
            
        }

In [65]:
#--------------------------------------------
#model param
#--------------------------------------------
d_word = w2vmodel.vector_size
n_topic = 14
batch_size = 256
d_key = 50
d_value = 50
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


#---------------------------------------
#train model
#---------------------------------------
np.random.seed(10)
model = MODEL_ATT_COMP(d_key = d_key, d_word = d_word, n_topic = n_topic, d_value = d_value, embeddings = vocab_tensor)
model.to(device)
learning_rate = 5 * 1e-5
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for i in range(2000):
    idx_batch = np.random.choice(np.arange(D.shape[0]),batch_size, replace = False)
    x = input[idx_batch].to(device)
    d = model.forward(x)
    loss = d['loss']
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if i % 100 == 0:
        print(d['loss'], d['margin_loss'], d['word_loss'])


tensor(512.6777, device='cuda:0', grad_fn=<AddBackward0>) tensor(256.0007, device='cuda:0', grad_fn=<SumBackward0>) tensor(256.6771, device='cuda:0', grad_fn=<SumBackward0>)
tensor(502.6732, device='cuda:0', grad_fn=<AddBackward0>) tensor(255.9948, device='cuda:0', grad_fn=<SumBackward0>) tensor(246.6784, device='cuda:0', grad_fn=<SumBackward0>)
tensor(490.3693, device='cuda:0', grad_fn=<AddBackward0>) tensor(255.7669, device='cuda:0', grad_fn=<SumBackward0>) tensor(234.6024, device='cuda:0', grad_fn=<SumBackward0>)
tensor(477.7612, device='cuda:0', grad_fn=<AddBackward0>) tensor(254.5385, device='cuda:0', grad_fn=<SumBackward0>) tensor(223.2227, device='cuda:0', grad_fn=<SumBackward0>)
tensor(470.8660, device='cuda:0', grad_fn=<AddBackward0>) tensor(250.4775, device='cuda:0', grad_fn=<SumBackward0>) tensor(220.3885, device='cuda:0', grad_fn=<SumBackward0>)
tensor(454.6784, device='cuda:0', grad_fn=<AddBackward0>) tensor(239.2022, device='cuda:0', grad_fn=<SumBackward0>) tensor(215.476

In [101]:
#report results for coherence and diversity
del emb
gc.collect()
emb = model.word_topics().cpu()
topics = [[vocab[j] for j in i] for i in emb.argsort(0)[-10:, :].t().detach().cpu().numpy() ]
coherences= get_topic_coherence(XX, topics, word_track)
np.mean(diversity(topics)), coherences

(0.6428571428571429, -0.2500997513313553)

In [102]:
for i in range(2000):
    idx_batch = np.random.choice(np.arange(D.shape[0]),batch_size, replace = False)
    x = input[idx_batch].to(device)
    d = model.forward(x)
    loss = d['loss'] + d['similarity_loss']
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if i % 100 == 0:
        print(d['loss'], d['margin_loss'], d['word_loss'], d['similarity_loss'])


tensor(342.0538, device='cuda:0', grad_fn=<AddBackward0>) tensor(126.5413, device='cuda:0', grad_fn=<SumBackward0>) tensor(215.5125, device='cuda:0', grad_fn=<SumBackward0>) tensor(18.0474, device='cuda:0', grad_fn=<SumBackward0>)
tensor(314.5013, device='cuda:0', grad_fn=<AddBackward0>) tensor(94.5565, device='cuda:0', grad_fn=<SumBackward0>) tensor(219.9447, device='cuda:0', grad_fn=<SumBackward0>) tensor(16.4558, device='cuda:0', grad_fn=<SumBackward0>)
tensor(301.7924, device='cuda:0', grad_fn=<AddBackward0>) tensor(88.4152, device='cuda:0', grad_fn=<SumBackward0>) tensor(213.3773, device='cuda:0', grad_fn=<SumBackward0>) tensor(17.3890, device='cuda:0', grad_fn=<SumBackward0>)
tensor(325.7821, device='cuda:0', grad_fn=<AddBackward0>) tensor(108.7282, device='cuda:0', grad_fn=<SumBackward0>) tensor(217.0539, device='cuda:0', grad_fn=<SumBackward0>) tensor(18.8047, device='cuda:0', grad_fn=<SumBackward0>)
tensor(345.7769, device='cuda:0', grad_fn=<AddBackward0>) tensor(127.5820, dev

In [103]:
del emb
gc.collect()
emb = model.word_topics().cpu()
topics = [[vocab[j] for j in i] for i in emb.argsort(0)[-10:, :].t().detach().cpu().numpy() ]
coherences= get_topic_coherence(XX, topics, word_track)
np.mean(diversity(topics)), coherences

ValueError: dimension mismatch

(0.31428571428571433, 0.10776733456458878)