In [36]:
import pandas as pd
import numpy as np
import time
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

In [37]:
# preprocessing doc

def preprocessing(path='NSF_awardtopics.txt'):
    """Load the corpus and encode each document as a list of integer word ids.

    The file at ``path`` is parsed with ``pd.read_csv``; rows whose field
    count differs from the first row (for example titles containing ',') are
    skipped with a warning. Each remaining row is tokenised with gensim's
    ``simple_preprocess``; tokens shorter than 3 characters and gensim
    stopwords are discarded. Word ids are assigned in order of first
    appearance, starting at 0.

    Parameters
    ----------
    path : str, optional
        Location of the corpus file. Defaults to ``'NSF_awardtopics.txt'``.

    Returns
    -------
    docs : list of list of int
        One list of word ids per parsed row (possibly empty).
    word2id : dict
        Maps each kept token to its integer id.
    id2word : dict
        Inverse mapping of ``word2id``.
    """
    # read data
    # TODO: in the script that generates non-phrases, remove ',' in numbers
    # NOTE(review): read_csv infers a header by default, so the first line of
    # the file is consumed as a column name and is not part of `documents` --
    # confirm that the file really starts with a header line.
    try:
        # pandas >= 1.3: skip malformed rows but still report them.
        frame = pd.read_csv(path, on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`; use the legacy keyword
        # (which was removed in pandas 2.0).
        frame = pd.read_csv(path, error_bad_lines=False)
    frame.columns = ['NSF Award Title Non-phrases']
    documents = frame['NSF Award Title Non-phrases']

    word2id = {}
    id2word = {}
    docs = []

    for document in documents:
        # A cell that pandas parsed as a non-string (e.g. NaN) cannot be
        # tokenised; keep it as an empty document so row order is preserved.
        if isinstance(document, str):
            tokens = gensim.utils.simple_preprocess(document)
        else:
            tokens = []

        encoded = []
        for word in tokens:
            if len(word) < 3 or word in gensim.parsing.preprocessing.STOPWORDS:
                continue
            if word not in word2id:
                # first time we see this token: give it the next free id
                newId = len(word2id)
                word2id[word] = newId
                id2word[newId] = word
            encoded.append(word2id[word])
        docs.append(encoded)

    return docs, word2id, id2word


In [38]:
# Go through each document & randomly assign each word in the document to a topic z
# gives you the topic representation of all documents & word distributions of all topics, though not good ones

def randomInitialize():
    """Draw an initial topic for every word of every document.

    Works on the module-level state: reads ``docs`` and, for each word id,
    samples one topic index with probability proportional to
    ``ndz[d, :] * nzw[:, w] / nz``. The chosen topic is added to the counts
    in ``ndz``, ``nzw`` and ``nz`` right away, and the per-document list of
    sampled topics is appended to ``Z``.
    """
    for docIdx, wordIds in enumerate(docs):
        topicsForDoc = []  # sampled topic of each word in this document
        for wordId in wordIds:
            # unnormalised weight of each topic for this (document, word) pair
            weights = ndz[docIdx] * nzw[:, wordId] / nz

            # a single multinomial trial yields a one-hot vector; the position
            # of the 1 is the sampled topic index
            oneHot = np.random.multinomial(1, weights / weights.sum())
            topic = oneHot.argmax()

            # account for the new assignment in all three count arrays
            ndz[docIdx, topic] += 1
            nzw[topic, wordId] += 1
            nz[topic] += 1

            topicsForDoc.append(topic)

        Z.append(topicsForDoc)

In [39]:
# Improve the assignments by using Gibbs sampling:
# for each doc d,
#    for each word w,
#         for each topic z, compute two things: 1. p(topic z | doc d) 2. p(word w | topic z)
#         we reassign w a new topic z with probability p(topic z | doc d) * p(word w | topic z) (the probability that topic z generated w)

def gibbsSampling():
    """Run one sweep that resamples the topic of every word.

    Works on the module-level state. For each word position the current
    assignment stored in ``Z`` is removed from ``ndz``, ``nzw`` and ``nz``,
    a new topic is drawn with probability proportional to
    ``ndz[d, :] * nzw[:, w] / nz``, and the new assignment is written back
    to ``Z`` and added to the three count arrays.
    """
    for docIdx, wordIds in enumerate(docs):
        for pos, wordId in enumerate(wordIds):
            # take the word's current assignment out of the counts
            oldTopic = Z[docIdx][pos]
            ndz[docIdx, oldTopic] -= 1
            nzw[oldTopic, wordId] -= 1
            nz[oldTopic] -= 1

            # topic weights computed from the counts without this word
            weights = ndz[docIdx] * nzw[:, wordId] / nz

            # draw the replacement topic from the normalised weights
            oneHot = np.random.multinomial(1, weights / weights.sum())
            newTopic = oneHot.argmax()

            # record the new assignment and put it back into the counts
            Z[docIdx][pos] = newTopic
            ndz[docIdx, newTopic] += 1
            nzw[newTopic, wordId] += 1
            nz[newTopic] += 1

In [40]:
# define hyperparameters
alpha = 5    # pseudo-count added to every document-topic count in ndz
beta = 0.1   # pseudo-count added to every topic-word count in nzw

# define iteration number
iterationNum = 20

# define how many topics we want to generate
K = 10  # number of topics

# how many top words to show per topic
maxTopicWordsNum = 10

# fix the RNG seed so that Restart & Run All reproduces the same topics
seed = 0
np.random.seed(seed)

# preprocess
docs, word2id, id2word = preprocessing()

# variables
Z = []  # list of lists. Z[i][j] is the topic index of the jth word of the ith document
N = len(docs)     # total number of documents
M = len(word2id)  # size of the vocabulary

ndz = np.zeros([N, K]) + alpha  # ndz[i,z]: number of words in document i assigned to topic z (plus alpha)
nzw = np.zeros([K, M]) + beta   # nzw[z,w]: number of times word w is assigned to topic z (plus beta)
nz = np.zeros([K]) + M * beta   # nz[z]: total number of words assigned to topic z (plus M * beta)

# initialize
randomInitialize()

# realizing the pgm by gibbs sampling
for i in range(0, iterationNum):
    gibbsSampling()
    print("Iteration {} completed.".format(i+1))

# show results
topicwords = []

for z in range(0, K):  # for each topic
    # argsort is ascending, so reverse it and keep only the highest-count
    # word ids instead of materialising the whole vocabulary per topic
    ids = nzw[z, :].argsort()[::-1][:maxTopicWordsNum]
    topicwords.append([id2word[j] for j in ids])

print(topicwords)

b'Skipping line 946: expected 1 fields, saw 2\nSkipping line 1957: expected 1 fields, saw 2\nSkipping line 2826: expected 1 fields, saw 2\nSkipping line 4376: expected 1 fields, saw 2\nSkipping line 5810: expected 1 fields, saw 3\nSkipping line 5811: expected 1 fields, saw 3\n'


Iteration 1 completed.
Iteration 2 completed.
Iteration 3 completed.
Iteration 4 completed.
Iteration 5 completed.
Iteration 6 completed.
Iteration 7 completed.
Iteration 8 completed.
Iteration 9 completed.
Iteration 10 completed.
Iteration 11 completed.
Iteration 12 completed.
Iteration 13 completed.
Iteration 14 completed.
Iteration 15 completed.
Iteration 16 completed.
Iteration 17 completed.
Iteration 18 completed.
Iteration 19 completed.
Iteration 20 completed.
[['research', 'data', 'systems', 'students', 'applications', 'science', 'analysis', 'techniques', 'technology', 'project'], ['information', 'new', 'human', 'learning', 'project', 'algorithms', 'models', 'work', 'design', 'time']]
