# Topic Extraction with respect to author 

This notebook takes much of its work from this repo: https://github.com/braemy/mentor-mentee-recommender-system. Except for the number of topics (increased to 15), all other parameters and the methodology have not been altered.

In [25]:
import nltk
import pandas as pd
import numpy as np
import pprint
import utils as utl
from time import time
from gensim import corpora, models, utils
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import EnglishStemmer
from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm
import csv
from ast import literal_eval

### Inputting Publication Title and Text

In [2]:
# Read columns 0 and 2 of the pipe-separated file (skipping its first row)
# and expose them as `publicationTitle` and `authorName`.
df_auth_titl = pd.read_csv(
    'data/01titleAuthorVenue.txt',
    sep='|',
    encoding='utf-8',
    skiprows=1,
    usecols=(0, 2),
    names=['publicationTitle', 'authorName'],
    dtype={0: str, 1: str},
)

Concatenating the titles 

In [3]:
# Register `progress_apply` on pandas objects so the group-wise join below
# reports its progress.
tqdm.pandas(desc='Progress: ')
# One string per author: all of that author's titles, separated by spaces.
df_auth_subj = (
    df_auth_titl
    .groupby('authorName')['publicationTitle']
    .progress_apply(lambda titles: ' '.join(titles.astype(str)))
)


Progress: 100%|█████████▉| 2055660/2055661 [02:46<00:00, 12363.86it/s]


In [4]:
#Uncomment this cell if you don't have the data on your computer
# nltk.download("stopwords")
# nltk.download("wordnet")

### Text Pre-processing

Adding more stop words

In [5]:
# Generic English stop words that supplement NLTK's built-in English list.
english_stop_words = ["a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount",  "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as",  "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", 
"some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the",'like', 'think', 'know', 'want', 'sure', 'thing', 'send', 'sent', 'speech', 'print', 'time','want', 'said', 'maybe', 'today', 'tomorrow', 'thank', 'thanks']
# Extra words excluded for this corpus.
# NOTE(review): pre_processing filters tokens *after* stemming, so entries
# here are compared against stems — confirm the lists hold the intended forms.
specific_stop_words = ['base', 'use', 'model', 'process', 'network']
# Combined stop-word list: NLTK's English list plus the two lists above.
sw =stopwords.words('english') + english_stop_words + specific_stop_words

Applying the NLTK stemmer

In [6]:
# Snowball English stemmer from NLTK; pre_processing uses it to stem tokens.
stemmer = EnglishStemmer()

In [7]:
# Turn the per-author Series into a DataFrame and move `authorName` out of
# the index into a regular column, leaving a fresh positional index.
df_auth_subj = df_auth_subj.to_frame().reset_index()

In [8]:
# Write the author names to disk; with to_csv's defaults the row index is
# written alongside each name.
df_auth_subj[['authorName']].to_csv('tm_data/id_auth.txt')

In [34]:
# Map each row index label (used as the author id from here on) to that
# author's concatenated publication titles.
df_aid_subj = df_auth_subj['publicationTitle'].to_dict()

In [35]:
def pre_processing(titles):
    """Tokenise, stem and stop-word-filter one author's concatenated titles.

    Args:
        titles: a single string holding the text to process.

    Returns:
        list of stemmed tokens, in their original order, with every token
        found in the global stop-word collection ``sw`` removed.
    """
    # `sw` is a plain list; testing each token against it is a linear scan.
    # Building a set once per call makes every membership test O(1) while
    # always reflecting the current contents of `sw`.
    stop_words = frozenset(sw)
    stems = (stemmer.stem(token) for token in utils.simple_preprocess(titles))
    # NOTE(review): filtering happens after stemming, so a stop word whose
    # stem differs from its listed form is not removed — confirm intended.
    return [stem for stem in stems if stem not in stop_words]

In [36]:
# Pre-process every author's concatenated titles, keyed by author id.
authorID_to_titles_stem = {
    author_id: pre_processing(joined_titles)
    for author_id, joined_titles in tqdm(df_aid_subj.items())
}



[A[A




In [37]:
# Persist the pre-processed tokens through the project's pickle helper so
# the expensive pre-processing step need not be repeated.
utl.pickle_data(authorID_to_titles_stem, "tm_data/authorID_to_titles_stem.p")    

In [26]:
# df_load = pd.read_csv('tm_data/auth_titl_clean.csv')

In [39]:
# Alias: each author's stemmed title tokens serve as that author's
# "document" for the topic model built below.
authorID_to_document = authorID_to_titles_stem

### Building the topic model using the gensim LDA approach

In [40]:
# Build the token <-> id mapping from all author documents. The documents
# are streamed straight into the Dictionary; copying them into a temporary
# list first is unnecessary because they are only iterated once.
dictionary = corpora.Dictionary(tqdm(authorID_to_document.values()))
# Convert every document to its bag-of-words form: a list of
# (token_id, count) pairs.
corpus = [dictionary.doc2bow(doc) for doc in tqdm(authorID_to_document.values())]

In [41]:
# Parameters for LDA training and for printing the resulting topics.
# NOTE(review): the notebook introduction mentions 15 topics while this cell
# sets 50 — confirm the intended value.
num_topics = 50 # number of topics the LDA model is asked to find
passes = 1 # number of passes over the corpus during LDA training
num_words = 5 # number of most important words printed per topic

In [42]:
# Keep a handle on the full corpus; the following cell filters `tmp`.
tmp = corpus
# Draw a random 1/1000 sample (with replacement, np.random.choice's default).
# Sample *indices* rather than the documents themselves: the corpus is a
# ragged list of bag-of-words lists, which NumPy cannot reliably convert to
# the 1-D array that np.random.choice requires.
sample_size = len(tmp) // 1000
sampled_positions = np.random.choice(len(tmp), size=sample_size)
corpus = [tmp[position] for position in sampled_positions]
len(corpus)

2055

In [43]:
# Keep only the documents whose bag-of-words has more than 100 entries;
# the LDA model is trained on this subset.
c = [bow for bow in tqdm(tmp) if len(bow) > 100]
len(c)

89728

Building the model

In [None]:
# Train the LDA model on the filtered corpus `c`, report the wall-clock
# training time, print the top words of every topic, then store the model
# both in gensim's own format and through the project's pickle helper.
start = time()
pp = pprint.PrettyPrinter(depth=2)
lda = models.LdaModel(
    corpus=c,
    id2word=dictionary,
    num_topics=num_topics,
    passes=passes,
)
elapsed_minutes = round((time() - start) / 60, 2)
print("Training time:", elapsed_minutes, "[min]")
topic_summaries = lda.print_topics(lda.num_topics, num_words=num_words)
pp.pprint(topic_summaries)
lda.save('lda.model')
utl.pickle_data(lda, "tm_data/lda_model__20_100.p")

Computing topic scores for each author over publication title 'documents'

In [None]:
def compute_score(titles):
    """Accumulate LDA topic weights over a collection of tokenised documents.

    Args:
        titles: iterable of token lists. Each one is converted to a
            bag-of-words with the global ``dictionary`` and scored with the
            global ``lda`` model.

    Returns:
        1-D numpy array with one entry per topic of ``lda``; entry ``k`` is
        the sum of topic ``k``'s weight over all documents in ``titles``.
    """
    # Size the vector from the model that produces the topic ids, so it stays
    # correct even if the notebook-level `num_topics` no longer matches the
    # model in memory (e.g. cells run out of order or a reloaded model).
    total_score = np.zeros(lda.num_topics)
    for title in titles:
        # lda[bow] yields (topic_id, weight) pairs; per gensim's documentation
        # topics below the model's minimum probability are omitted, so not
        # every topic id appears for every document.
        for topic_id, weight in lda[dictionary.doc2bow(title)]:
            total_score[topic_id] += weight
    return total_score

In [None]:
# Topic-score vector for every author, in the iteration order of
# `authorID_to_document`; each author's document is scored on its own.
score_by_author_by_document = [
    compute_score([document])
    for document in tqdm(authorID_to_document.values())
]
utl.pickle_data(score_by_author_by_document, "tm_data/score_by_author_by_document.p")