In [None]:
#*** Data
# Corpus files that have been used with this notebook:
#   50000_WoS.txt
#   50000_MedLine.txt
# Every line of the chosen file is treated as one document.
with open('../datasets/50000_WoS.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# `docs` is a separate list object, so `lines` keeps the raw text untouched.
docs = lines[:]

# Sanity check: corpus size and the first 500 characters of the first document.
print(len(docs))
print(docs[0][:500])

In [None]:
#***Pre-process and vectorize the documents
def remove_stopword():
    """Load the stop-word list from ``../wordList/stopword_list.txt``.

    Despite its name, this function does not remove anything: it only reads
    the UTF-8 file and returns its lines.

    Returns:
        list[str]: one entry per line of the file, in file order, with
        newline characters removed (blank lines become empty strings).
    """
    with open('../wordList/stopword_list.txt','r',encoding = 'utf-8') as f:
        return [entry.replace('\n','') for entry in f.readlines()]

def remove_academic_word():
    """Load the common-academic-word list from
    ``../wordList/academic_word_list-2980.txt``.

    Despite its name, this function does not remove anything: it only reads
    the UTF-8 file and returns its lines.

    Returns:
        list[str]: one entry per line of the file, in file order, with
        newline characters removed (blank lines become empty strings).
    """
    with open('../wordList/academic_word_list-2980.txt','r',encoding = 'utf-8') as f:
        return [entry.replace('\n','') for entry in f.readlines()]

In [None]:
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

# Per-kernel cache of everything NLP_Prpcessing needs that does not depend on
# the input text. It is filled on the first call; re-running this cell empties
# it again, which forces the word-list files to be re-read.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Build (once) and return the tokenizer, lemmatizer and word sets."""
    if not _NLP_RESOURCES:
        # All values are computed before update() runs, so a failure while
        # reading a word list leaves the cache empty and the next call retries.
        _NLP_RESOURCES.update(
            tokenizer = RegexpTokenizer(r'\w+'),
            lemmatizer = WordNetLemmatizer(),
            # frozenset gives O(1) membership tests instead of scanning a list
            stopword = frozenset(remove_stopword()),
            academic_word = frozenset(remove_academic_word()),
        )
    return _NLP_RESOURCES


def NLP_Prpcessing(text):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. Coerce the input to ``str`` and replace every character that is not
         an ASCII letter or an apostrophe with a space.
      2. Lower-case and tokenize with the ``\\w+`` pattern. Apostrophes
         survive step 1 but are not matched by ``\\w``, so they act as token
         separators here.
      3. Drop stop words, purely numeric tokens and one-character tokens.
      4. Lemmatize each token as an adjective, then as a noun.
      5. Drop tokens found in the academic word list (checked after
         lemmatization).

    The word lists, tokenizer and lemmatizer are created on the first call
    and reused afterwards, instead of re-reading both word-list files for
    every document.

    Args:
        text: The raw document; any object is accepted and passed to ``str``.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    resources = _get_nlp_resources()
    stopword = resources['stopword']
    academic_word = resources['academic_word']
    lemmatizer = resources['lemmatizer']

    # Remove non-English characters, then normalise case. Runs of spaces need
    # no separate clean-up because the tokenizer only extracts \w+ matches.
    text = re.sub(r'[^a-zA-Z\']', " ", str(text)).lower()
    tokens = resources['tokenizer'].tokenize(text)

    # Remove stop words, numeric tokens and one-character tokens in one pass
    tokens = [
        token for token in tokens
        if token not in stopword and not token.isnumeric() and len(token) > 1
    ]

    # Lemmatize: adjective form first, then noun form.
    # Verb lemmatization (pos = 'v') is intentionally left disabled.
    tokens = [
        lemmatizer.lemmatize(lemmatizer.lemmatize(token, pos = 'a'), pos = 'n')
        for token in tokens
    ]

    # Remove common academic words
    return [token for token in tokens if token not in academic_word]

In [None]:
from tqdm import  tqdm
# Replace each raw document string with its list of cleaned tokens, in place.
# NOTE: not idempotent. Running this cell a second time would feed the token
# lists back through NLP_Prpcessing (which calls str() on its input), so
# reload `docs` before re-running.
for position, raw_doc in enumerate(tqdm(docs)):
    docs[position] = NLP_Prpcessing(raw_doc)
print(docs[0])

In [None]:
#* Preserve phrases. Adjacent word pairs that gensim's Phrases model detects
#  are merged into a single token, e.g. machine_learning. `min_count = 20`
#  is a threshold on the total number of occurrences collected over the whole
#  corpus (see the gensim Phrases documentation), not on the number of documents.
from gensim.models import Phrases
bigram = Phrases(docs, min_count = 20)
for position, tokens in enumerate(tqdm(docs)):
    docs[position] = bigram[tokens]


#* Remove low frequency terms (occurring in fewer than 2 documents) and high
#  frequency terms (occurring in more than 50% of documents).
#  NOTE(review): filter_extremes also applies its default `keep_n` cap on the
#  vocabulary size; confirm that cap is intended.
from gensim import corpora
from gensim.corpora import Dictionary
dictionary = Dictionary(docs)
dictionary.filter_extremes(no_below = 2, no_above = 0.5)
#print(dictionary)

In [None]:
#* Convert every tokenized document into its bag-of-words vector
corpus = list(map(dictionary.doc2bow, docs))
#print(corpus[0])

# Report the vocabulary size and the corpus size
for summary_format, size in (('Number of unique tokens: %d', len(dictionary)),
                             ('Number of documents: %d', len(corpus))):
    print(summary_format % size)

In [None]:
# Disabled snippet: the text below is a plain string literal, so none of it is
# executed. If enabled, it would write every token of `dictionary` to
# unique_word_list.txt, one token per line.
# NOTE(review): as the only expression in this cell, the string itself is the
# cell's value; consider deleting the cell or converting it to `#` comments.
'''
unique_word_list = []
for i in range(len(dictionary)):
    unique_word_list.append(dictionary[i])
f = open("unique_word_list.txt","w")
for line in unique_word_list:
    f.write(line+'\n')
f.close()
'''

In [None]:
#***Training
from gensim.models import LdaModel
# Look up one entry of the dictionary before training.
# NOTE(review): presumably kept from the gensim LDA tutorial, where this access
# is used to force the id -> token mapping to be built; confirm it is still needed.
temp = dictionary[0]
id2word = dictionary
num_topics = 10
# Fixed seed: LDA training is stochastic, so without it the topics (and every
# clustering score computed from them) change from one run to the next.
LDA_RANDOM_STATE = 1
print('Number of topics: %d' % num_topics)
lda_model = LdaModel(corpus = corpus, id2word = id2word, alpha = 'auto', eta = 'auto',
                     iterations = 6000, num_topics = num_topics, chunksize = 5000, passes = 40,
                     random_state = LDA_RANDOM_STATE)

In [None]:
# Ask the trained model for the topic distribution of every document, then
# materialize the result as a plain list with one entry per document.
# Downstream code reads each entry as a sequence of (topic id, probability) pairs.
document_distribution = lda_model.get_document_topics(bow = corpus)
LDAlist = list(document_distribution)

In [None]:
import numpy as ny
# For every document, pick the topic with the highest probability as its label.
#   prob[d]           -> probabilities of the topics reported for document d
#   prob_max_index[d] -> position (within that document's entry) of the largest one
#   doc_labels[d]     -> topic id stored at that position
# The position is mapped back to the stored topic id rather than used directly,
# because it is an index into the document's entry, not necessarily a topic id.
prob = []
prob_max_index = []
doc_labels = []
for topic_pairs in tqdm(LDAlist):
    weights = [pair[1] for pair in topic_pairs]
    best_position = weights.index(max(weights))   # first maximum wins on ties
    prob.append(weights)
    prob_max_index.append(best_position)
    doc_labels.append(topic_pairs[best_position][0])
doc_labels[:20]

In [None]:
def get_ground_truth_label():
    """Read the reference class labels from ``../datasets/50000_WoS_Lable.txt``.

    The UTF-8 file is read line by line and every line is converted to ``int``.
    Label files that have been used with this notebook:
        50000_WoS_WC.txt
        50000_MedLine_Label.txt

    Returns:
        list[int]: one label per line of the file, in file order.

    Raises:
        ValueError: if a line (for example a blank one) is not a valid integer.
    """
    with open('../datasets/50000_WoS_Lable.txt','r',encoding = 'utf-8') as f:
        return [int(row.replace('\n','')) for row in f.readlines()]
# Load the reference labels once; the evaluation cell compares them with doc_labels.
ground_truth_label = get_ground_truth_label()

In [None]:
from sklearn import metrics
# Compare the LDA-derived labels with the reference labels using three
# clustering-agreement scores, printed in this order:
#   adjusted Rand index, Fowlkes-Mallows score, adjusted mutual information.
# NOTE(review): sklearn's convention is (labels_true, labels_pred); the
# arguments are passed the other way round here. Confirm that these scores
# do not depend on argument order before relying on it.
for score_fn in (metrics.adjusted_rand_score,
                 metrics.fowlkes_mallows_score,
                 metrics.adjusted_mutual_info_score):
    print(score_fn(doc_labels, ground_truth_label))