###  Import Libraries

In [1]:
import numpy as np
import re
import pandas as pd
from pprint import pprint
import os
import urllib
import tarfile
MAX_VOCAB_SIZE = 100000
vocabSize = 100000

# nltk & Sentiwordnet
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer  
from nltk import word_tokenize,sent_tokenize, pos_tag
from nltk.corpus import sentiwordnet as swn

#sklearn & lemmatization
from sklearn.feature_extraction.text import CountVectorizer
st = PorterStemmer()
lemmatizer = WordNetLemmatizer()


In [1]:
# import numpy as np
# import re
# import pandas as pd


# from nltk.corpus import stopwords
# from nltk.stem.porter import PorterStemmer
# from nltk.stem import WordNetLemmatizer  
# from sklearn.feature_extraction.text import CountVectorizer
# from nltk import word_tokenize,sent_tokenize, pos_tag
# from nltk.corpus import sentiwordnet as swn

# from pprint import pprint
 
# lemmatizer = WordNetLemmatizer()

# MAX_VOCAB_SIZE = 50000


# import os
# import urllib
# import tarfile
# vocabSize = 50000

In [2]:
def sampleFromDirichlet(alpha):
    """
    Draw one sample from a Dirichlet distribution.

    alpha: concentration parameters of the distribution (length d)
    Returns:
    a length-d vector drawn with numpy's global random state
    """
    draw = np.random.dirichlet(alpha)
    return draw

In [3]:
def sampleFromCategorical(theta):
    """
    Draw a single index from a categorical (multinoulli) distribution.

    theta: non-negative weights (length d); they are rescaled to sum to one
    before sampling, so they need not be normalised by the caller
    Returns:
    the sampled index ind, with 0 <= ind < d
    """
    probabilities = theta / np.sum(theta)
    # A single multinomial trial yields a one-hot vector; its argmax is the
    # position of the drawn category.
    one_hot = np.random.multinomial(1, probabilities)
    return one_hot.argmax()

In [4]:
def word_indices(wordOccuranceVec):
    """
    Expand a bag-of-words count vector into a stream of word indices.

    Each vocabulary index whose count is non-zero is yielded as many times
    as its (integer-truncated) count, in increasing index order, so the
    number of yielded items equals the document length.
    """
    nonzero_positions = wordOccuranceVec.nonzero()[0]
    for position in nonzero_positions:
        occurrences = int(wordOccuranceVec[position])
        yield from (position for _ in range(occurrences))

### Sentiment LDA

In [14]:
class SentimentLDAGibbsSampler:
    """
    Gibbs sampler for a joint topic/sentiment LDA model over text reviews.

    Every word occurrence of every document carries a (topic, sentiment)
    assignment. ``run`` initialises those assignments and then resamples
    them for a fixed number of sweeps. The reporting helpers label the
    sentiment indices as 0 = neg, 1 = pos, 2 = inc/unc, 3 = com.
    """

    # Column-name suffixes of the per-document sentiment table, by sentiment index.
    _SENTIMENT_SUFFIXES = ("neg", "pos", "unc", "com")
    # Labels printed by getTopKWords, by sentiment index.
    _SENTIMENT_LABELS = ("neg", "pos", "inc", "com")

    def __init__(self, numTopics, alpha, beta, gamma, numSentiments=4):
        """
        numTopics: Number of topics in the model
        numSentiments: Number of sentiments (default 4)
        alpha: Hyperparameter for Dirichlet prior on topic distribution
        per document
        beta: Hyperparameter for Dirichlet prior on vocabulary distribution
        per (topic, sentiment) pair
        gamma: Hyperparameter for Dirichlet prior on sentiment distribution
        per (document, topic) pair
        """
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.numTopics = numTopics
        self.numSentiments = numSentiments

        # Filled by _initialize_ and kept on the instance for inspection.
        self.word_sentiment_dict = {}
        self.topic_sentiment_dict = {}
        self.topic_distribution_dict = {}
        self.topic_distribution_dict_2 = {}

    def _vocabulary(self):
        """Return the fitted vectorizer's vocabulary, ordered by column index."""
        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement and fall back only on releases that predate it.
        getter = getattr(self.vectorizer, "get_feature_names_out", None)
        if getter is None:
            getter = self.vectorizer.get_feature_names
        return getter()

    def processSingleReview(self, review, d=None):
        """
        Convert a raw review to a space-separated string of lemmatised words.

        Non-letters are replaced by spaces, the text is lower-cased, English
        stop words are dropped and the remaining words are lemmatised.
        d: unused; accepted so callers may pass the review's position
        """
        # Building the stop-word set reads the NLTK corpus, so do it once
        # per instance instead of once per review.
        stops = getattr(self, "_stops", None)
        if stops is None:
            stops = self._stops = set(stopwords.words("english"))

        letters_only = re.sub(r"[^a-zA-Z]", " ", review)
        words = letters_only.lower().split()
        return " ".join(lemmatizer.lemmatize(w) for w in words if w not in stops)

    def processReviews(self, reviews, saveAs=None, saveOverride=False):
        """
        Turn raw reviews into a dense (numDocs x vocabSize) word-count matrix.

        reviews: sized iterable of raw review strings
        saveAs: optional path of a dill cache holding [matrix, vectorizer];
        when the file exists and saveOverride is false it is loaded instead
        of recomputing, otherwise the fresh result is written to it
        saveOverride: recompute (and overwrite the cache) even if it exists
        Returns: the word-count matrix; self.vectorizer is set as a side effect
        """
        import os
        import dill
        if not saveOverride and saveAs and os.path.isfile(saveAs):
            # NOTE: unpickling can execute arbitrary code; only point saveAs
            # at cache files produced by this class.
            with open(saveAs, 'rb') as cacheFile:
                wordOccurenceMatrix, self.vectorizer = dill.load(cacheFile)
            return wordOccurenceMatrix

        processed_reviews = []
        for i, review in enumerate(reviews):
            if (i + 1) % 1000 == 0:
                print("Review %d of %d" % (i + 1, len(reviews)))
            processed_reviews.append(self.processSingleReview(review, i))

        self.vectorizer = CountVectorizer(analyzer="word",
                                          tokenizer=None,
                                          preprocessor=None,
                                          stop_words="english",
                                          max_features=MAX_VOCAB_SIZE)

        train_data_features = self.vectorizer.fit_transform(processed_reviews)
        wordOccurenceMatrix = train_data_features.toarray()
        if saveAs:
            with open(saveAs, 'wb') as cacheFile:
                dill.dump([wordOccurenceMatrix, self.vectorizer], cacheFile)

        return wordOccurenceMatrix

    def _initialize_(self, reviews, saveAs=None, saveOverride=False):
        """
        Build the count matrix and draw the initial (topic, sentiment)
        assignment of every word occurrence.

        Sets self.wordOccuranceMatrix (numDocs x vocabSize bag-of-words
        matrix), the count arrays n_dt, n_dts, n_d, n_vts, n_ts, the
        assignment dicts self.topics / self.sentiments keyed by
        (document, word position), and self.priorSentiment, which maps a
        vocabulary index to sentiment 1 (positive) or 0 (negative) for words
        whose mean SentiWordNet score is at least 0.1 and dominates the
        opposite polarity.

        Returns: a DataFrame with one row per document and one column
        "topic_<t>_<neg|pos|unc|com>" per (topic, sentiment) pair, holding
        the sentiment distribution (in percent) drawn from the Dirichlet
        prior for that document and topic. These are the initial draws, not
        estimates produced by the sampler.
        """
        self.wordOccuranceMatrix = self.processReviews(reviews, saveAs, saveOverride)
        numDocs, vocabSize = self.wordOccuranceMatrix.shape

        self.n_dt = np.zeros((numDocs, self.numTopics))
        self.n_dts = np.zeros((numDocs, self.numTopics, self.numSentiments))
        self.n_d = np.zeros((numDocs))
        self.n_vts = np.zeros((vocabSize, self.numTopics, self.numSentiments))
        self.n_ts = np.zeros((self.numTopics, self.numSentiments))
        self.topics = {}
        self.sentiments = {}
        self.priorSentiment = {}

        alphaVec = self.alpha * np.ones(self.numTopics)
        gammaVec = self.gamma * np.ones(self.numSentiments)

        # Positive, negative, objective and compound (pos - neg) score of
        # every vocabulary word, averaged over its SentiWordNet synsets.
        for i, word in enumerate(self._vocabulary()):
            # senti_synsets may hand back a one-shot iterator, so materialise
            # it once and reuse it for all three scores.
            synsets = list(swn.senti_synsets(word))
            if synsets:
                posScore = np.mean([s.pos_score() for s in synsets])
                negScore = np.mean([s.neg_score() for s in synsets])
                objScore = np.mean([s.obj_score() for s in synsets])
            else:
                # Words unknown to SentiWordNet keep NaN scores (the value
                # np.mean([]) produced) without the empty-slice warning; NaN
                # fails both comparisons below, so no prior is recorded.
                posScore = negScore = objScore = np.nan

            compScore = posScore - negScore

            self.word_sentiment_dict[word] = [posScore * 100, negScore * 100, objScore * 100, compScore * 100]

            if posScore >= 0.1 and posScore > negScore:
                self.priorSentiment[i] = 1
            elif negScore >= 0.1 and negScore > posScore:
                self.priorSentiment[i] = 0

        # Topic and sentiment distribution of every document.
        suffixes = self._SENTIMENT_SUFFIXES
        list_of_dict_for_sentiment = []
        for d in range(numDocs):
            topicDistribution = sampleFromDirichlet(alphaVec)
            sentimentDistribution = np.zeros((self.numTopics, self.numSentiments))
            self.topic_distribution_dict["Dokument " + str(d+1)] = topicDistribution*100

            for t in range(self.numTopics):
                sentimentDistribution[t, :] = sampleFromDirichlet(gammaVec)
                percentages = sentimentDistribution[t, :] * 100
                self.topic_sentiment_dict["Document " + str(d+1) + " - " + "Topic " + str(t+1)] = percentages

                for sen, value in enumerate(percentages.tolist()):
                    if sen < len(suffixes):
                        self.topic_distribution_dict_2["topic_" + str(t+1) + "_" + suffixes[sen]] = value
                    else:
                        # Sentiment indices beyond the four labelled ones
                        # have no column name.
                        print("Nicht")

            list_of_dict_for_sentiment.append(self.topic_distribution_dict_2)
            self.topic_distribution_dict_2 = {}

            for i, w in enumerate(word_indices(self.wordOccuranceMatrix[d, :])):
                # Draw the topic from the document's topic distribution.
                # Previously this draw was commented out, so `t` leaked from
                # the loop above and every word started in the last topic.
                t = sampleFromCategorical(topicDistribution)
                s = sampleFromCategorical(sentimentDistribution[t, :])

                self.topics[(d, i)] = t
                self.sentiments[(d, i)] = s
                self.n_dt[d, t] += 1
                self.n_dts[d, t, s] += 1
                self.n_d[d] += 1
                self.n_vts[w, t, s] += 1
                self.n_ts[t, s] += 1

        df_2 = pd.DataFrame(data=list_of_dict_for_sentiment)
        return df_2

    def conditionalDistribution(self, d, v):
        """
        Calculates the (topic, sentiment) probability for word v in document d
        Returns:    a matrix (numTopics x numSentiments) storing the probabilities
        """
        probabilities_ts = np.ones((self.numTopics, self.numSentiments))
        # p(topic | document)
        firstFactor = (self.n_dt[d] + self.alpha) / \
            (self.n_d[d] + self.numTopics * self.alpha)
        # p(sentiment | document, topic)
        secondFactor = (self.n_dts[d, :, :] + self.gamma) / \
            (self.n_dt[d, :] + self.numSentiments * self.gamma)[:, np.newaxis]
        # p(word | topic, sentiment); n_vts.shape[0] is the vocabulary size
        thirdFactor = (self.n_vts[v, :, :] + self.beta) / \
            (self.n_ts + self.n_vts.shape[0] * self.beta)
        probabilities_ts *= firstFactor[:, np.newaxis]
        probabilities_ts *= secondFactor * thirdFactor
        probabilities_ts /= np.sum(probabilities_ts)

        return probabilities_ts

    def getTopKWordsByLikelihood(self, K):
        """
        Prints the top K discriminative words for every topic t and
        sentiment s, ie words v for which p(t, s | v) is maximum
        """
        pseudocounts = np.copy(self.n_vts)
        # Normalise per word, over all (topic, sentiment) pairs.
        normalizer = np.sum(pseudocounts, (1, 2))
        pseudocounts /= normalizer[:, np.newaxis, np.newaxis]
        vocab = self._vocabulary()
        for t in range(self.numTopics):
            for s in range(self.numSentiments):
                topWordIndices = pseudocounts[:, t, s].argsort()[::-1][:K]
                print(t, s, [vocab[i] for i in topWordIndices])

    def getTopKWords(self, K):
        """
        Prints the top K words for every topic t and sentiment s,
        ie words v for which p(v | t, s) is maximum. Returns nothing.
        """
        pseudocounts = np.copy(self.n_vts)
        # Normalise per (topic, sentiment) pair, over the vocabulary.
        normalizer = np.sum(pseudocounts, (0))
        pseudocounts /= normalizer[np.newaxis, :, :]
        vocab = self._vocabulary()
        labels = self._SENTIMENT_LABELS
        for t in range(self.numTopics):
            print("-------------------------  score of TopWord for Topic " + str(t+1) + "  -----------------------")
            for s in range(self.numSentiments):
                topWordIndices = pseudocounts[:, t, s].argsort()[::-1][:K]
                # Sentiments beyond the four labelled ones print their index.
                sent = labels[s] if s < len(labels) else s
                print("Topic " + str(t+1), " words | " + str(sent), [vocab[i] for i in topWordIndices])
            print("\n")

    def run(self, reviews, maxIters=30, saveAs=None, saveOverride=False):
        """
        Runs Gibbs sampler for sentiment-LDA

        reviews: raw review strings
        maxIters: number of full sweeps over all word occurrences
        saveAs, saveOverride: forwarded to processReviews (count-matrix cache)
        Returns: the DataFrame produced by _initialize_. It holds the initial
        sentiment draws per document and topic and is not updated by the
        sweeps; the sampled state lives in the n_* count arrays.

        In the loops below, i is the position of a word occurrence within
        its document and v is that word's column index in wordOccuranceMatrix.
        """
        topic_sentiments = self._initialize_(reviews, saveAs, saveOverride)
        numDocs = self.wordOccuranceMatrix.shape[0]

        for iteration in range(maxIters):
            print("Starting iteration %d of %d" % (iteration + 1, maxIters))
            for d in range(numDocs):
                for i, v in enumerate(word_indices(self.wordOccuranceMatrix[d, :])):
                    t = self.topics[(d, i)]
                    s = self.sentiments[(d, i)]

                    # Remove the current assignment from all counts before
                    # computing the conditional for this occurrence.
                    self.n_dt[d, t] -= 1
                    self.n_d[d] -= 1
                    self.n_dts[d, t, s] -= 1
                    self.n_vts[v, t, s] -= 1
                    self.n_ts[t, s] -= 1

                    probabilities_ts = self.conditionalDistribution(d, v)

                    if v in self.priorSentiment:
                        # Words with a lexicon prior keep that sentiment;
                        # only the topic is resampled.
                        s = self.priorSentiment[v]
                        t = sampleFromCategorical(probabilities_ts[:, s])
                    else:
                        # Sample jointly over the flattened (topic, sentiment)
                        # grid, then map the flat index back to the pair.
                        ind = sampleFromCategorical(probabilities_ts.flatten())
                        t, s = np.unravel_index(ind, probabilities_ts.shape)

                    self.topics[(d, i)] = t
                    self.sentiments[(d, i)] = s
                    self.n_dt[d, t] += 1
                    self.n_d[d] += 1
                    self.n_dts[d, t, s] += 1
                    self.n_vts[v, t, s] += 1
                    self.n_ts[t, s] += 1

        return topic_sentiments
    

### Import Conference Data from Json

In [15]:
import json

# Upper bound on how many conference transcripts are printed and collected.
NUM_REVIEWS = 100

with open('data_1000_erste_final.json') as json_file:
    get_data = json.load(json_file)

# Keep a shallow copy of the loaded entries under both names, as before.
get_datas = list(get_data)
get_data = get_datas

# Collect the 'general' text of the first NUM_REVIEWS entries. Slicing
# (instead of indexing a fixed range) avoids an IndexError when the file
# holds fewer entries than NUM_REVIEWS.
reviews = []
for x, entry in enumerate(get_data[:NUM_REVIEWS]):
    print("----------------------------------------------------  Konferenz {}  ----------------------------------------------------------\n".format(x+1))
    print(entry['general'])
    reviews.append(entry['general'])



----------------------------------------------------  Konferenz 1  ----------------------------------------------------------

**Operator**

Good morning, ladies and gentlemen, and welcome to the NuPathe's third quarter
2012 earnings conference call. [Operator instructions.] As a reminder, today's
call is being recorded. It is now my pleasure to turn the conference over to
turn the call over to Mr. John Woolford. Please go ahead sir.

**John Woolford**

Thank you, operator, and good morning, everyone. With me on today’s call are
Armando Anido, chief executive officer; and Keith Goldan, vice president and
chief financial officer. Terri Sebree, president; and Jerry McLaughlin, chief
commercial officer, will also be joining us for the Q&A portion of the call.

We issued a press release detailing third quarter 2012 financial results this
morning. For those of you who may not have seen the release, it is available
on our website at www.nupathe.com, in the Investor Relations section.

The fo

### Class SentimentLDAGibbsSampler laden mit Anzahl von Topics (z.B. 10)

In [16]:
''' Erster Wert ist Anzahl von Topics hier 5'''
# The positional arguments are (numTopics, alpha, beta, gamma): this builds a
# model with 10 topics, alpha=5, beta=0.1, gamma=0.3 and the default of
# 4 sentiments.
# NOTE(review): the string above says "5 topics", but the first argument is
# 10 and the 5 is alpha. Confirm which was intended.

sampler = SentimentLDAGibbsSampler(10, 5, 0.1, 0.3)
sampler

<__main__.SentimentLDAGibbsSampler at 0x21e098ea0f0>

### SentimentLDAGibbsSampler ausführen 

In [17]:
'''
    Hier Dokument  und Anzahl von Iterationen eingeben (z.ß: 20 Iterationen)
'''
# Pass the documents and the number of Gibbs iterations (here 20).
# The third positional parameter of run() is saveAs, a cache file path.
# The previous call passed True there, which open() would interpret as file
# descriptor 1 rather than a path. No cache file is wanted here, so the
# argument is left out; pass saveAs="<path>" to enable caching.
topic_sentiments = sampler.run(reviews, maxIters=20)
# DataFrame with the sentiment share per document and topic
topic_sentiments

Starting iteration 1 of 20
Starting iteration 2 of 20
Starting iteration 3 of 20
Starting iteration 4 of 20
Starting iteration 5 of 20
Starting iteration 6 of 20
Starting iteration 7 of 20
Starting iteration 8 of 20
Starting iteration 9 of 20
Starting iteration 10 of 20
Starting iteration 11 of 20
Starting iteration 12 of 20
Starting iteration 13 of 20
Starting iteration 14 of 20
Starting iteration 15 of 20
Starting iteration 16 of 20
Starting iteration 17 of 20
Starting iteration 18 of 20
Starting iteration 19 of 20
Starting iteration 20 of 20


Unnamed: 0,topic_10_com,topic_10_neg,topic_10_pos,topic_10_unc,topic_1_com,topic_1_neg,topic_1_pos,topic_1_unc,topic_2_com,topic_2_neg,...,topic_7_pos,topic_7_unc,topic_8_com,topic_8_neg,topic_8_pos,topic_8_unc,topic_9_com,topic_9_neg,topic_9_pos,topic_9_unc
0,0.553521,89.144266,9.225386,1.076827,1.470997,0.406783,97.479552,0.642668,4.888526e+00,8.575560,...,1.802900,3.562812,4.610500e-02,2.260756,21.395005,76.298134,28.230417,42.412628,0.009423,29.347532
1,1.018552,14.790590,83.732781,0.458077,0.423775,61.179558,1.527416,36.869251,6.985517e+01,7.334417,...,0.466286,7.325449,8.862813e+01,0.307084,11.054021,0.010763,10.723965,0.655197,22.533090,66.087749
2,0.083328,0.045246,99.146010,0.725416,1.927944,18.682759,60.358181,19.031115,2.316119e+01,3.649115,...,50.616928,29.468442,1.697723e+00,0.115344,25.698904,72.488029,20.110604,52.187438,19.071211,8.630747
3,7.129725,3.004089,65.858495,24.007691,27.360623,36.387099,13.058128,23.194150,3.410606e-03,31.479886,...,8.404194,28.768068,5.592272e-01,5.340321,24.013979,70.086473,45.696030,35.356147,18.338424,0.609399
4,57.385852,15.799796,0.000068,26.814284,3.678149,0.000031,78.975137,17.346684,4.388176e+01,0.716905,...,11.249373,1.523493,7.889038e+00,27.091384,3.515394,61.504184,0.015033,0.016934,4.568571,95.399462
5,85.156707,2.963456,0.061656,11.818181,71.176999,0.276499,27.336256,1.210245,1.692046e+01,16.503711,...,2.612494,52.986120,3.259091e+01,9.928630,7.489851,49.990606,9.815447,15.927715,0.266297,73.990540
6,88.967571,9.618226,0.285867,1.128337,18.021672,73.985583,7.413003,0.579742,4.174594e+01,0.485613,...,1.984807,5.731792,2.076007e+01,0.357978,73.314654,5.567293,7.714383,71.333059,2.881697,18.070862
7,4.132926,0.464743,95.400144,0.002186,10.351239,21.130233,56.297626,12.220902,6.256478e+00,67.780347,...,60.491958,30.799354,5.928107e-02,0.403973,1.135561,98.401184,52.780127,0.249815,46.196591,0.773467
8,0.013501,51.134069,1.154990,47.697440,63.024135,1.507552,31.732860,3.735453,3.785938e+00,11.445939,...,0.435125,82.893996,1.943999e+01,3.477022,16.985205,60.097779,16.864858,74.886783,0.001817,8.246543
9,48.025036,0.922279,41.629804,9.422882,19.809889,70.886568,0.001574,9.301969,6.045189e+01,39.434581,...,1.663730,7.748651,1.310055e+01,45.951176,0.501019,40.447255,0.335639,0.116930,12.562235,86.985197


### TopWords und Score von TopWords

In [19]:
''' 
    Hier Anzhal von Topwords eingeben
    Anzhal von Topwords ( z.ß: 6) 
'''
# The argument is the number of top words to list per (topic, sentiment)
# pair (here 6). getTopKWords prints its results and has no return
# statement, so TopWord is None after this call.

TopWord = sampler.getTopKWords(6)

-------------------------  score of TopWord for Topic 1  -----------------------
Topic 1  words | neg ['patient', 'treatment', 'week', 'potential', 'event', 'common']
Topic 1  words | pos ['feraheme', 'launch', 'gaap', 'segment', 'protocol', 'client']
Topic 1  words | inc ['water', 'authority', 'cayman', 'plant', 'government', 'retail']
Topic 1  words | com ['mobile', 'user', 'nq', 'nationsky', 'internet', 'overseas']



-------------------------  score of TopWord for Topic 2  -----------------------
Topic 2  words | neg ['september', 'ended', 'month', 'plan', 'company', 'boe']
Topic 2  words | pos ['asset', 'credit', 'portfolio', 'security', 'revenue', 'strong']
Topic 2  words | inc ['develops', 'message', 'patricia', 'quickening', 'qiao', 'aluminum']
Topic 2  words | com ['accounted', 'cargo', 'certificate', 'briefly', 'language', 'dsos']



-------------------------  score of TopWord for Topic 3  -----------------------
Topic 3  words | neg ['core', 'lead', 'america', 'capacity', 't