# Libraries

In [1]:
from bs4 import BeautifulSoup
from requests import get
from nltk import word_tokenize
from nltk.corpus import stopwords
import collections
import pandas as pd

# LDA Generative Model

Priors: 
- Distribution over vocabulary for topic k in {1..K}: beta\[k\] ~ Dirichlet(V, eta)
- Per-document distribution over topics (latent variable): theta ~ Dirichlet(K, alpha)

For each document:
- Choose number of words: N ~ Poisson(ξ)

For each of the N words w:
- Choose a topic: z ~ Cat(K, theta)
- Choose a word: w ~ Cat(V, beta\[z\])

Note: This model follows the ‘bag of words’ assumption, such that given the topic proportions,
each word drawn is independent of any other words in the document. 

![Graphical representation](images/LDA_PGM_representation.PNG)

### Variational Inference

To use variational inference, the edges between θ (theta), z and w are removed to make inference on the LDA model tractable. 

![Variational Inference](images/Variational_Distribution_representation.PNG)

# Global Variables

In [2]:
# Page fetched by getFacultyNames() to collect the faculty names.
faculty_url = 'https://www.eecs.mit.edu/people/faculty-advisors'

# Template layout:
# arxiv.org/find/(subject)/1/au:+(lastname)_(initial)/0/1/0/all/0/1
arXiv_format = 'arxiv.org/find/{}/1/au:+{}_{}/0/1/0/all/0/1'

# Author-search template; scrapeArXiV() fills the placeholder with the
# name after replacing spaces by '+'.
search_url_format = 'https://arxiv.org/search/?query="{}"&searchtype=author'

# NOTE(review): not referenced by the cells visible in this notebook;
# presumably short subject label -> full arXiv subject name - confirm.
subjects = {
    'Computer Science': 'Computer Science',
    'Electrical Engineering': 'Electrical Engineering and Systems Science',
    'Physics': 'Physics',
}

# NOTE(review): column labels, presumably for a papers DataFrame; unused in
# the visible cells.
all_papers_columns = ['Name', 'Abstract']

# Web Scraping

1. Get Faculty Names

In [3]:
def getFacultyNames():
    """Download the page at `faculty_url` and return the listed names.

    Returns:
        A list holding, for every anchor matched by the CSS selector
        'div.views-field-title span.card-title a', the first child node of
        that anchor (in document order).
    """
    response = get(faculty_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    names = []
    for link in soup.select('div.views-field-title span.card-title a'):
        # Only the first child of the anchor is kept, as the name.
        names.append(link.contents[0])

    return names

In [4]:
# Performs one HTTP request to `faculty_url`; re-running this cell re-downloads the page.
names = getFacultyNames()

2. Scrape Papers

In [5]:
def scrapeArXiV(names):
    """Collect arXiv abstracts for every author name in `names`.

    For each name, one author search is requested through
    `search_url_format` (spaces in the name are replaced by '+'). Every
    `li.arxiv-result` entry of the returned page contributes the first child
    node of its `span.abstract-full` element.

    Results that lack an abstract span, or whose span is empty, are skipped
    instead of raising AttributeError / IndexError.

    Args:
        names: iterable of author-name strings.

    Returns:
        A flat list of abstract texts, in the order the names were given and
        the results appeared on each page.
    """
    papers = []
    for name in names:
        search_url = search_url_format.format(name.replace(' ', '+'))
        response = get(search_url)
        page = BeautifulSoup(response.content, 'html.parser')

        body = page.body
        if body is None:
            # No <body> in the response: nothing to extract for this author.
            continue

        for result in body.find_all("li", class_="arxiv-result"):
            abstract = result.find("span", class_="abstract-full")
            if abstract is None:
                continue

            # Replace the span's first <a> element (if any) by its own
            # children, so the span never starts with an anchor tag.
            if abstract.a is not None:
                abstract.a.unwrap()

            if abstract.contents:
                papers.append(abstract.contents[0])

    return papers

In [6]:
# Issues one arXiv search request per entry in `names`, so this cell is network-bound.
papers = scrapeArXiV(names)

# Text Preprocessing

In [7]:
def word_cleaning_and_count(s):
    """Count the non-stopword, purely alphabetic tokens of a text.

    The text is lower-cased and tokenized with `word_tokenize`; tokens for
    which `str.isalpha()` is false and tokens found in NLTK's English
    stopword list are discarded.

    Args:
        s: the text to process.

    Returns:
        A dict mapping each remaining token to its number of occurrences,
        with keys ordered by first appearance in the text.
    """
    lowered = s.lower()
    ignored = set(stopwords.words('english'))

    kept = [
        token
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in ignored
    ]
    return dict(collections.Counter(kept))

In [None]:
# One {token: count} dict per paper, in the same order as `papers`.
papers_word_dict = [word_cleaning_and_count(paper) for paper in papers]

# Flatten every paper's tokens in a single pass (the previous repeated
# `list + list` concatenation copied the growing list on every iteration).
dup_keys = [token for word_dict in papers_word_dict for token in word_dict]

# Distinct tokens, ordered by first appearance across the papers.
vocab = list(dict.fromkeys(dup_keys))
# token -> position of that token in `vocab`.
lookup_table = {token: index for index, token in enumerate(vocab)}

### Save data

In [19]:
import json
import os

# Create the output directory on first use; without it every open() below
# raises FileNotFoundError.
os.makedirs('data', exist_ok=True)

# File name under data/ -> object serialized into it as JSON.
artifacts = [
    ('names', names),
    ('papers', papers),
    ('papers_word_dict', papers_word_dict),
    ('vocab', vocab),
    ('lookup_table', lookup_table),
]
for filename, payload in artifacts:
    with open('data/' + filename, 'w') as fout:
        json.dump(payload, fout)

# LDA

In [3]:
# Number of topics; passed to both the sklearn and the SVILDA model below.
no_topics = 5

### Load data

In [4]:
import json


def _read_json(path):
    """Return the object decoded from the JSON file at `path`."""
    with open(path, 'r') as json_file:
        return json.load(json_file)


# Restore the artifacts written by the "Save data" cell.
names = _read_json('data/names')
papers = _read_json('data/papers')
papers_word_dict = _read_json('data/papers_word_dict')
vocab = _read_json('data/vocab')
lookup_table = _read_json('data/lookup_table')

vocab_size = len(vocab)

### Using sklearn

In [5]:
def _dense_row(word_counts):
    """Return a length-`vocab_size` count list for one {token: count} dict."""
    row = [0] * vocab_size
    for token, occurs in word_counts.items():
        row[lookup_table[token]] = occurs
    return row


# Dense document-term matrix: one row per paper, one column per vocab entry.
doc_vecs = [_dense_row(word_counts) for word_counts in papers_word_dict]

In [6]:
from sklearn.decomposition import LatentDirichletAllocation

# Run the LDA
# The topic count is passed as `n_components`: the former `n_topics` keyword
# was deprecated and later removed, so it raises TypeError on current
# scikit-learn.
lda = LatentDirichletAllocation(n_components=no_topics, learning_method='online').fit(doc_vecs)

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of each topic of a fitted model.

    Args:
        model: fitted model exposing `components_`, iterated one row per
            topic; each row must support `argsort()`.
        feature_names: sequence mapping a column index to its word.
        no_top_words: how many words to print per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        print('Topic %d:' % (topic_idx))
        # argsort is ascending, so walk it backwards from the end to get the
        # `no_top_words` largest weights first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(' '.join([feature_names[i] for i in top_indices]))

no_top_words = 10
# The words come from `vocab`; `doc_vecs` holds count rows, not feature names.
display_topics(lda, vocab, no_top_words)



Topic 0:
information algorithms model results problem show paper network time based
Topic 1:
data model learning network neural networks using models performance training
Topic 2:
channel capacity csi coding also transmitter receiver scheme number length
Topic 3:
planning motion policy robot policies magnetic spin emission observations density
Topic 4:
n problem algorithm graph show time linear algorithms k results


### End-to-end Code (SVILDA algorithm) 

In [5]:
# Sparse representation: per paper, a pair (vocab indices, matching counts),
# both lists following the key order of that paper's {token: count} dict.
doc_vecs = [
    (
        [lookup_table[token] for token in word_counts],
        list(word_counts.values()),
    )
    for word_counts in papers_word_dict
]

In [6]:
# SVILDA comes from a local `svilda` module whose source is not part of this notebook.
from svilda import SVILDA
# Iteration budget handed to SVILDA as its last positional argument.
iterations = 10000
# Rebinds `lda`, replacing the sklearn model fitted earlier in the notebook.
# NOTE(review): 0.1, 0.01, 1 and 0.75 are positional hyperparameters whose
# meaning is defined by SVILDA's signature - presumably alpha, eta, tau and
# kappa; confirm against svilda.py.
lda = SVILDA(vocab, no_topics, len(doc_vecs), 0.1, 0.01, 1, 0.75, iterations)
# `doc_vecs` is expected to be the list of (wordslist, countslist) pairs
# built in the cell above, not the dense rows of the sklearn section.
lda.runSVI(doc_vecs)

ITERATION 0  running document number  161
ITERATION 100  running document number  787
ITERATION 200  running document number  369
ITERATION 300  running document number  1031
ITERATION 400  running document number  61
ITERATION 500  running document number  884
ITERATION 600  running document number  630
ITERATION 700  running document number  1142
ITERATION 800  running document number  982
ITERATION 900  running document number  176
ITERATION 1000  running document number  1493
ITERATION 1100  running document number  826
ITERATION 1200  running document number  495
ITERATION 1300  running document number  1658
ITERATION 1400  running document number  1016
ITERATION 1500  running document number  708
ITERATION 1600  running document number  591
ITERATION 1700  running document number  438
ITERATION 1800  running document number  1660
ITERATION 1900  running document number  808
ITERATION 2000  running document number  1217
ITERATION 2100  running document number  1461
ITERATION 2200 

In [7]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of each topic of an SVILDA model.

    Args:
        model: object exposing `_lambda`, iterated one row per topic; each
            row must support `argsort()` (presumably a NumPy array - confirm
            against the svilda module).
        feature_names: sequence mapping a column index to its word.
        no_top_words: how many words to print per topic.
    """
    for topic_idx, topic in enumerate(model._lambda):
        print('Topic %d:' % (topic_idx))
        # argsort is ascending, so walk it backwards from the end to get the
        # `no_top_words` largest weights first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(' '.join([feature_names[i] for i in top_indices]))

no_top_words = 10
# The words come from `vocab`; `doc_vecs` holds (indices, counts) pairs.
display_topics(lda, vocab, no_top_words)

Topic 0:
paper number two problems information methods use systems provide large
Topic 1:
problem show n data also optimal networks performance propose channel
Topic 2:
model time network given one applications demonstrate function linear many
Topic 3:
algorithm algorithms new present approach based set work consider k
Topic 4:
results using method graph models learning system study result distribution
