# Libraries

In [79]:
from bs4 import BeautifulSoup
from requests import get
from nltk import word_tokenize
from nltk.corpus import stopwords
import collections

# Global Variables

In [12]:
# Page listing the MIT EECS faculty advisors; scraped for faculty names.
faculty_url = 'https://www.eecs.mit.edu/people/faculty-advisors'

# Per-author listing URL template, filled in as
# arxiv.org/find/(subject)/1/au:+(lastname)_(initial)/0/1/0/all/0/1
arXiv_format = 'arxiv.org/find/{}/1/au:+{}_{}/0/1/0/all/0/1'

# Author search URL template; the placeholder is the (quoted) author name.
search_url_format = 'https://arxiv.org/search/?query="{}"&searchtype=author'

# Subject labels keyed by a short subject name.
subjects = {
    'Computer Science': 'Computer Science',
    'Electrical Engineering': 'Electrical Engineering and Systems Science',
    'Physics': 'Physics',
}

# Column names for a table of papers.
all_papers_columns = [
    'Name',
    'Abstract',
]

# Web Scraping

1. Get Faculty Names

In [3]:
def getFacultyNames():
    """Download the faculty page and return the listed names.

    The page at ``faculty_url`` is fetched and parsed with the built-in
    ``html.parser``. For every anchor matching
    ``div.views-field-title span.card-title a`` the first child node of the
    anchor is taken as the name.

    Returns:
        A list with one entry per matching anchor, in document order.
    """
    response = get(faculty_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    names = []
    for anchor in soup.select('div.views-field-title span.card-title a'):
        # The first child of the anchor holds the displayed name.
        names.append(anchor.contents[0])
    return names

In [None]:
# Scrape the faculty page once and keep the names for the arXiv search below.
names = getFacultyNames()

2. Scrape Papers

In [69]:
def scrapeArXiV(names):
    """Collect paper abstracts from the arXiv author search for each name.

    For every name the search page built from ``search_url_format`` is
    downloaded, and for each ``li.arxiv-result`` entry the first child node
    of its ``span.abstract-full`` element is collected.

    Entries that cannot be parsed (no ``<body>`` in the response, no
    ``abstract-full`` span, or an empty span) are skipped instead of raising,
    so one malformed result does not discard everything scraped so far.

    Args:
        names: Iterable of author names; spaces are replaced by ``+`` when
            the name is inserted into the search URL.

    Returns:
        A flat list of abstract nodes for all authors, in the order the
        authors were given and the results appeared on each page.
    """
    papers = []
    for name in names:
        search_url = search_url_format.format(name.replace(' ', '+'))
        response = get(search_url)
        page_body = BeautifulSoup(response.content, 'html.parser').body
        if page_body is None:
            # html.parser does not synthesize a <body>; nothing to search.
            continue

        for result in page_body.find_all("li", class_="arxiv-result"):
            abstract = result.find("span", class_="abstract-full")
            if abstract is None:
                continue

            # Replace the first link inside the span by its own children so
            # it no longer appears as a tag among the span's contents.
            if abstract.a is not None:
                abstract.a.unwrap()

            if abstract.contents:
                papers.append(abstract.contents[0])

    return papers

In [None]:
# Abstracts of all search results for the scraped faculty names.
papers = scrapeArXiV(names)

# Text Preprocessing

In [130]:
def word_cleaning_and_count(s):
    """Count the alphabetic, non-stopword tokens of a text.

    The text is lower-cased and split with NLTK's ``word_tokenize``. Tokens
    that are not purely alphabetic are dropped, and so are the words in
    NLTK's English stopword list.

    Args:
        s: The text to analyse.

    Returns:
        A dict mapping each remaining token to its number of occurrences,
        ordered by first appearance in the text.
    """
    stop_words = set(stopwords.words('english'))
    counts = collections.Counter(
        token for token in word_tokenize(s.lower()) if token.isalpha()
    )
    # Filtering here preserves the first-seen order of the surviving words.
    return {word: count for word, count in counts.items()
            if word not in stop_words}

In [134]:
# One {token: count} dict per scraped abstract.
papers_word_dict = [word_cleaning_and_count(paper) for paper in papers]

# Every token of every paper, duplicates across papers included. Built in a
# single pass; repeated list concatenation would re-copy the list per paper.
dup_keys = [token for word_dict in papers_word_dict for token in word_dict]

# Unique tokens in first-seen order, and each token's index in that order.
vocab = list(dict.fromkeys(dup_keys))
vocab_size = len(vocab)
lookup_table = {token: index for index, token in enumerate(vocab)}

# LDA

In [145]:
# Number of topics requested from the LDA model fitted below.
no_topics = 5

### With sklearn

In [150]:
from sklearn.decomposition import LatentDirichletAllocation

# Build one count vector per paper: position lookup_table[token] holds the
# number of times that token occurs in the paper, all other positions are 0.
doc_vecs = []
for paper in papers_word_dict:
    doc_vec = [0] * vocab_size
    for token, occurs in paper.items():
        doc_vec[lookup_table[token]] = occurs
    doc_vecs.append(doc_vec)

# Run the LDA. The number of topics is passed as `n_components`; the older
# `n_topics` keyword is rejected by current scikit-learn releases.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=200,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(doc_vecs)

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic of a fitted model.

    Args:
        model: Fitted model exposing ``components_``, iterated as one weight
            array per topic.
        feature_names: Sequence mapping a feature index to its word; it must
            use the same indexing as the vectors the model was fitted on.
        no_top_words: Number of words to print per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        print('Topic %d:' % (topic_idx))
        # argsort is ascending, so the reversed tail slice yields the
        # indices of the `no_top_words` largest weights, largest first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(' '.join([feature_names[i] for i in top_indices]))

no_top_words = 10
# Pass the vocabulary (index -> word), not the document vectors, as the
# feature names.
display_topics(lda, vocab, no_top_words)

Topic 0:
motion mri time temporal model series robust markov deformations volumetric
Topic 1:
refer vector seven proposed maps matching objective algorithm initial suggest
Topic 2:
problem matching pursuit unknown algorithms sparsity three squares least msso
Topic 3:
substantial second examining commonly registration two fetal template advantage often
Topic 4:
motion temporal time mri series images model registration assumption small


