# In [1]:
import pandas as pd
import numpy as np


def sampleFromDirichlet(alpha):
    """Draw one probability vector from a Dirichlet distribution.

    alpha: concentration parameters, one per component.
    Returns the sampled vector produced by ``np.random.dirichlet``.
    """
    sample = np.random.dirichlet(alpha)
    return sample
    
    
def sampleFromCategorical(theta):
    """Sample a single index with probability proportional to ``theta``.

    theta: non-negative weights; they are rescaled to sum to one first.
    Returns the position of the single success in a one-trial multinomial
    draw.
    """
    probabilities = theta / np.sum(theta)
    one_hot = np.random.multinomial(1, probabilities)
    return one_hot.argmax()


class JST(object):
    """Joint sentiment/topic model fitted with a collapsed Gibbs sampler.

    Every token of every document carries a (topic, sentiment) assignment.
    The sampler keeps count tables of those assignments and repeatedly
    resamples each token from its conditional distribution.  Words whose
    lexicon score is at least 0.1 (or at most -0.1) have their sentiment
    label pinned to 1 (or 0) while resampling.
    """

    def __init__(self, n_topics=5, beta=0.01, gamma=0.01, sentilab=2):
        '''
        n_topics: number of topics.
        beta: smoothing added to the word counts of each (topic, sentiment).
        gamma: smoothing added to the sentiment counts of each
            (document, topic).
        sentilab: number of sentiment labels.  Lexicon priors use labels 0
            and 1, so at least 2 labels are needed when a lexicon is used.

        alpha, the smoothing added to the per-document topic counts, is
        derived as 50 / n_topics.
        '''
        self.n_topics = n_topics
        # Float literal keeps true division even under Python 2.
        self.alpha = 50.0 / n_topics
        self.beta = beta
        self.gamma = gamma
        self.sentilab = sentilab

    def read_corpus(self, corpus):
        """Load the documents from the first column of a header-less CSV.

        Each row is split on whitespace and its first token is dropped
        (presumably a document identifier -- confirm against the data).
        Sets ``docs`` (token lists), ``vocs`` (all tokens in reading order),
        ``voc`` (set of distinct tokens), ``n_docs``/``doc_size`` and
        ``voc_size``.
        """
        self.voc = set()
        self.vocs = []
        self.docs = []
        frame = pd.read_csv(corpus, index_col=None, header=None)
        # Series.tolist() replaces the removed ``pd.np`` alias.
        for line in frame.iloc[:, 0].tolist():
            tokens = line.strip().split()[1:]
            self.voc.update(tokens)
            self.vocs.extend(tokens)
            self.docs.append(tokens)
        # Sizes are computed once, after the loop.
        self.n_docs = len(self.docs)
        self.doc_size = self.n_docs
        self.voc_size = len(self.voc)

    def w_id(self):
        """Build the ``word2id`` / ``id2word`` lookup tables.

        Ids follow the sorted vocabulary so they are reproducible between
        runs (set iteration order is not).
        """
        self.word2id = {}
        self.id2word = {}
        for index, word in enumerate(sorted(self.voc)):
            self.word2id[word] = index
            self.id2word[index] = word

    def read_score(self, pos_score):
        """Load the sentiment lexicon into ``self.score`` (word -> score).

        Two CSV layouts (both with a header row) are accepted:

        * wide: one column per word, scores in the first data row;
        * long: exactly two columns, words first and scores second.

        NOTE(review): the lexicon format is not documented anywhere in this
        file; the wide layout is the only one the original lookup
        ``self.score[word]`` could address -- confirm against the real data.
        """
        frame = pd.read_csv(pos_score, index_col=None)
        long_layout = (
            frame.shape[1] == 2
            and not pd.api.types.is_numeric_dtype(frame.iloc[:, 0])
        )
        if long_layout:
            self.score = dict(zip(frame.iloc[:, 0], frame.iloc[:, 1]))
        elif len(frame):
            # Flatten {column: {row: value}} down to {column: value}.
            self.score = frame.iloc[0].to_dict()
        else:
            self.score = {}

    def _update_counts(self, d, v, t, s, delta):
        """Add ``delta`` to every count table touched by one token."""
        self.n_dt[d, t] += delta
        self.n_d[d] += delta
        self.n_dts[d, t, s] += delta
        self.n_vts[v, t, s] += delta
        self.n_ts[t, s] += delta

    def _initialize_(self, corpus, pos_score):
        """Read the inputs and draw a random initial assignment.

        Builds the word ids, the per-document id sequences (``doc_ids``),
        the zeroed count tables, the lexicon-derived ``priorSentiment`` map
        (word id -> sentiment label) and an initial (topic, sentiment) for
        every token.
        """
        self.read_corpus(corpus)
        self.read_score(pos_score)
        # Ids must exist before the count tables can be indexed by word.
        self.w_id()
        self.doc_ids = [[self.word2id[w] for w in doc] for doc in self.docs]

        self.n_dt = np.zeros((self.n_docs, self.n_topics))
        self.n_dts = np.zeros((self.n_docs, self.n_topics, self.sentilab))
        self.n_d = np.zeros((self.n_docs))
        self.n_vts = np.zeros((self.voc_size, self.n_topics, self.sentilab))
        self.n_ts = np.zeros((self.n_topics, self.sentilab))
        self.topics = {}
        self.sentiments = {}
        self.priorSentiment = {}

        alphaVec = self.alpha * np.ones(self.n_topics)
        gammaVec = self.gamma * np.ones(self.sentilab)

        for word, index in self.word2id.items():
            posScore = self.score.get(word)
            if posScore is None:
                # Word is not covered by the lexicon: no prior.
                continue
            if posScore >= 0.1:
                self.priorSentiment[index] = 1
            elif posScore <= -0.1:
                self.priorSentiment[index] = 0

        for d, word_ids in enumerate(self.doc_ids):
            topicDistribution = sampleFromDirichlet(alphaVec)
            sentimentDistribution = np.zeros((self.n_topics, self.sentilab))
            for t in range(self.n_topics):
                sentimentDistribution[t, :] = sampleFromDirichlet(gammaVec)
            for i, v in enumerate(word_ids):
                t = sampleFromCategorical(topicDistribution)
                s = sampleFromCategorical(sentimentDistribution[t, :])
                self.topics[(d, i)] = t
                self.sentiments[(d, i)] = s
                self._update_counts(d, v, t, s, 1)

    def conditionalDistribution(self, d, v):
        """
        Calculates the (topic, sentiment) probability for word v in document d
        Returns:    a matrix (numTopics x numSentiments) storing the probabilities
        """
        # Topic given document.
        firstFactor = (self.n_dt[d] + self.alpha) / \
            (self.n_d[d] + self.n_topics * self.alpha)
        # Sentiment given (document, topic).
        secondFactor = (self.n_dts[d, :, :] + self.gamma) / \
            (self.n_dt[d, :] + self.sentilab * self.gamma)[:, np.newaxis]
        # Word given (topic, sentiment); n_vts.shape[0] is the vocabulary size.
        thirdFactor = (self.n_vts[v, :, :] + self.beta) / \
            (self.n_ts + self.n_vts.shape[0] * self.beta)
        probabilities_ts = firstFactor[:, np.newaxis] * secondFactor * thirdFactor
        return probabilities_ts / np.sum(probabilities_ts)

    def _print_top_words(self, weights, K):
        """Print the K highest-weighted words of each (topic, sentiment)."""
        for t in range(self.n_topics):
            for s in range(self.sentilab):
                topWordIndices = weights[:, t, s].argsort()[-1:-(K + 1):-1]
                # id2word replaces indexing into the (unordered) voc set.
                print(t, s, [self.id2word[i] for i in topWordIndices])

    def getTopKWordsByLikelihood(self, K):
        """
        Prints the top K discriminative words for each topic t and sentiment s
        ie words v for which p(t, s | v) is maximum
        """
        pseudocounts = np.copy(self.n_vts)
        normalizer = np.sum(pseudocounts, axis=(1, 2), keepdims=True)
        # A zero total would yield NaN, which argsort ranks highest.
        normalizer[normalizer == 0] = 1.0
        pseudocounts /= normalizer
        self._print_top_words(pseudocounts, K)

    def getTopKWords(self, K):
        """
        Prints the top K discriminative words for each topic t and sentiment s
        ie words v for which p(v | t, s) is maximum
        """
        pseudocounts = np.copy(self.n_vts)
        normalizer = np.sum(pseudocounts, axis=0, keepdims=True)
        # A (topic, sentiment) pair without tokens would divide by zero.
        normalizer[normalizer == 0] = 1.0
        pseudocounts /= normalizer
        self._print_top_words(pseudocounts, K)

    def run(self, corpus, pos_score, maxIters=50):
        """
        Runs Gibbs sampler for sentiment-LDA

        corpus: path or buffer of the document CSV (see read_corpus).
        pos_score: path or buffer of the lexicon CSV (see read_score).
        maxIters: number of full sweeps over all tokens.
        """
        self._initialize_(corpus, pos_score)
        print("Initialize done")
        for iteration in range(maxIters):
            print("Starting iteration %d of %d" % (iteration + 1, maxIters))
            for d, word_ids in enumerate(self.doc_ids):
                for i, v in enumerate(word_ids):
                    # Take the token out of the counts before resampling it.
                    self._update_counts(
                        d, v, self.topics[(d, i)], self.sentiments[(d, i)], -1)

                    probabilities_ts = self.conditionalDistribution(d, v)
                    if v in self.priorSentiment:
                        # Lexicon word: sentiment is fixed, sample topic only.
                        s = self.priorSentiment[v]
                        t = sampleFromCategorical(probabilities_ts[:, s])
                    else:
                        ind = sampleFromCategorical(probabilities_ts.flatten())
                        t, s = np.unravel_index(ind, probabilities_ts.shape)

                    self.topics[(d, i)] = t
                    self.sentiments[(d, i)] = s
                    self._update_counts(d, v, t, s, 1)

# In [2]:
# Build a model with the constructor defaults (5 topics, 2 sentiment labels).
jst=JST()