In [7]:
import re
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [8]:
def pre_processing(sentence):
    """Run the whole clean-up pipeline on a single sentence.

    Punctuation is stripped first, the result is tokenized and lemmatized,
    and finally the stop words are filtered out.

    Returns the list produced by `remove_stopwords` (lower-cased words).
    """
    without_punctuation = remove_punctuation(sentence)
    lemmas = tokenize_sentence(without_punctuation)
    return remove_stopwords(lemmas)

# Remove punctuation from a sentence (a plain string, not yet tokenized)
def remove_punctuation(sentence):
    """Return `sentence` without any character that is neither a word
    character (letters, digits, underscore) nor whitespace.

    Characters are deleted, not replaced by a space, so "didn't" becomes
    "didnt".
    """
    not_word_nor_space = re.compile(r'[^\w\s]')
    return not_word_nor_space.sub('', sentence)

# Remove stopwords from a list of words
# Stop-word sets already read from disk, keyed by file path, so the file is
# parsed once instead of once per call (this function runs for every corpus line).
_STOPWORDS_CACHE = {}


def _load_stopwords(stopwords_path):
    """Return the stop words stored in `stopwords_path` (one per line) as a set."""
    if stopwords_path not in _STOPWORDS_CACHE:
        # `with` guarantees the handle is closed even if reading fails.
        with open(stopwords_path, "r", encoding="utf-8") as stopwords_file:
            # strip() drops the newline and any stray surrounding whitespace
            # that would otherwise prevent an entry from ever matching.
            _STOPWORDS_CACHE[stopwords_path] = {line.strip() for line in stopwords_file}
    return _STOPWORDS_CACHE[stopwords_path]


def remove_stopwords(words_list, stopwords_path="stop_words_FULL.txt"):
    """Lower-case every word and drop the ones listed in the stop-word file.

    Parameters
    ----------
    words_list : iterable of str
        Words to filter.
    stopwords_path : str, optional
        Text file with one stop word per line, read as UTF-8. The parsed
        file is cached per path, so later edits to the file are only picked
        up after this cell is re-run.

    Returns
    -------
    list of str
        The lower-cased words, in their original order, that are not stop
        words. The comparison uses the lower-cased word against the file
        entries exactly as written.

    Raises
    ------
    OSError
        If the stop-word file cannot be opened.
    """
    stopwords = _load_stopwords(stopwords_path)
    lowered_words = (word.lower() for word in words_list)
    return [word for word in lowered_words if word not in stopwords]

# Tokenize the input sentence and also lemmatize its words
def tokenize_sentence(sentence):
    """Split `sentence` into tokens, POS-tag them and return their lemmas.

    Only tokens whose tag starts with "NN", "VB", "RB" or "JJ" are kept;
    each one is lemmatized with the matching WordNet part of speech. Tokens
    carrying any other tag are dropped.
    """
    # First two characters of the POS tag -> WordNet part of speech.
    wordnet_pos_by_prefix = {
        "NN": wn.NOUN,
        "VB": wn.VERB,
        "RB": wn.ADV,
        "JJ": wn.ADJ,
    }
    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token, pos_tag in nltk.pos_tag(word_tokenize(sentence)):
        wordnet_pos = wordnet_pos_by_prefix.get(pos_tag[:2])
        if wordnet_pos is None:
            continue  # not a noun, verb, adverb or adjective
        lemmas.append(lemmatizer.lemmatize(token, pos=wordnet_pos))
    return lemmas

# Signature of a sense: lemmas of its WordNet definition followed by the lemmas of its examples.
def get_signature(sense):
    """Return the lemmas of `sense.definition()` followed by the lemmas of
    every entry of `sense.examples()`, in order.

    The result is a list, so repeated words are kept. Only
    `tokenize_sentence` is applied: stop words are not removed here.
    """
    signature = list(tokenize_sentence(sense.definition()))
    for example in sense.examples():
        signature.extend(tokenize_sentence(example))
    return signature

Read the corpus file. It returns a list of documents, where each document is the list of its pre-processed words.

In [9]:
def read_corpus(txt_file):
    """Parse a corpus file into one list of pre-processed words per document.

    A document starts on a line containing ``<doc`` and ends on the next
    line containing ``</doc>``. Both marker lines are skipped, as is every
    line outside a document. Every other line has its paragraph tags removed
    and is passed through `pre_processing`.

    If the file ends before a closing ``</doc>`` is found, the words
    collected so far are kept as the last document. The earlier
    ``readline()`` loop never terminated in that case, because ``readline()``
    keeps returning an empty string at end of file.

    Parameters
    ----------
    txt_file : str
        Path of the corpus file, read as UTF-8.

    Returns
    -------
    list of list of str
        For each document, in file order, its pre-processed words.
    """
    documents_words = []
    # Words of the document currently being read; None while outside a document.
    document_words = None

    with open(txt_file, encoding='utf-8') as file:
        for line in file:
            if document_words is None:
                if "<doc" in line:  # tag for new doc
                    document_words = []
                continue

            if "</doc>" in line:  # end of the current doc
                documents_words.append(document_words)
                document_words = None
                continue

            # remove unuseful tags
            line_proc = line.replace("<p> ", "").replace("</p>\n", "").replace("/p", "")
            # pre-processing steps
            document_words.extend(pre_processing(line_proc))

    if document_words is not None:
        # Unterminated final document: keep what was read.
        documents_words.append(document_words)

    print("Documents number: ", len(documents_words))

    return documents_words

Topic modelling using gensim

In [10]:
from gensim import corpora, models

# Number of topics the LDA model is asked to find.
topic_num = 10
# Number of top words shown for each topic.
topic_words_num = 5

def topic_modelling(documents_words, num_topics=None, passes=3, random_state=42):
    """Fit a gensim LDA model on already tokenized documents.

    Parameters
    ----------
    documents_words : list of list of str
        One list of words per document.
    num_topics : int, optional
        Number of topics. When omitted, the module-level `topic_num` is read
        at call time.
    passes : int, optional
        Number of passes over the corpus during training.
    random_state : int or None, optional
        Seed forwarded to `LdaModel`. LDA training is stochastic, so a fixed
        seed makes the topics reproducible across "Restart & Run All"; pass
        None to get an unseeded run.

    Returns
    -------
    tuple
        `(lda_model, corpus_idbow_freq)`: the trained model and the
        bag-of-words corpus it was trained on, one entry per input document.
    """
    if num_topics is None:
        num_topics = topic_num

    # Create a dict with integer keys for all words
    dictionary_LDA = corpora.Dictionary(documents_words)

    # delete all terms that do NOT appear in at least 3 documents.
    # delete all terms that appear in more than 60% of documents (see filter_extremes official doc).
    dictionary_LDA.filter_extremes(no_below=3, no_above=0.6)

    # Converts each document into a list of BoW (list of (id_term, term_frequency) for each term in doc)
    corpus_idbow_freq = [dictionary_LDA.doc2bow(document_words) for document_words in documents_words]

    # https://radimrehurek.com/gensim/models/ldamodel.html
    # Small symmetric priors: one alpha value per topic, one eta value per dictionary term.
    lda_model = models.LdaModel(
        corpus_idbow_freq,
        num_topics=num_topics,
        id2word=dictionary_LDA,
        passes=passes,
        alpha=[0.01] * num_topics,
        eta=[0.01] * len(dictionary_LDA.keys()),
        random_state=random_state,
    )

    return lda_model, corpus_idbow_freq

Show topics.

In [11]:
# Load the corpus and fit the topic model on it.
documents_words = read_corpus("travelling.txt")
model, corpus_idbow_freq = topic_modelling(documents_words)

# For each topic, keep its top words with scores rounded to 3 decimals.
topics = {}
for topic_id in range(model.num_topics):
    top_terms = model.show_topic(topic_id, topn=topic_words_num)
    topics['Topic ' + str(topic_id)] = [(token, round(score, 3)) for token, score in top_terms]

for key, value in topics.items():
    print(key, ":", value)

Documents number:  100
Topic 0 : [('students', 0.024), ('good', 0.013), ('stay', 0.013), ('language', 0.012), ('exam', 0.012)]
Topic 1 : [('clause', 0.023), ('word', 0.02), ('example', 0.015), ('result', 0.015), ('learn', 0.014)]
Topic 2 : [('clause', 0.048), ('condition', 0.019), ('conditionals', 0.018), ('main', 0.018), ('happen', 0.018)]
Topic 3 : [('students', 0.066), ('exam', 0.034), ('speak', 0.026), ('book', 0.026), ('sb', 0.026)]
Topic 4 : [('book', 0.016), ('grammar', 0.014), ('hotel', 0.012), ('beach', 0.01), ('word', 0.009)]
Topic 5 : [('clause', 0.02), ('happen', 0.019), ('study', 0.014), ('didnt', 0.012), ('help', 0.012)]
Topic 6 : [('level', 0.018), ('clause', 0.017), ('esl', 0.016), ('teach', 0.016), ('student', 0.015)]
Topic 7 : [('travel', 0.016), ('holiday', 0.016), ('clause', 0.009), ('place', 0.009), ('visit', 0.007)]
Topic 8 : [('travel', 0.037), ('trip', 0.014), ('place', 0.013), ('money', 0.013), ('journey', 0.013)]
Topic 9 : [('lesson', 0.051), ('audio', 0.012),

Show topics for each document.

In [12]:
# Print, for every document, the topic distribution the model assigns to it.
print("Documents topic list")
for i, document_bow in enumerate(corpus_idbow_freq):
    print("Doc", i, ":", model[document_bow])

Documents topic list
Doc 0 : [(8, 0.99850225)]
Doc 1 : [(3, 0.12565248), (7, 0.87409115)]
Doc 2 : [(2, 0.11221407), (9, 0.8859294)]
Doc 3 : [(3, 0.13379017), (4, 0.7342464), (9, 0.13156591)]
Doc 4 : [(9, 0.99982005)]
Doc 5 : [(1, 0.94413257), (3, 0.055813007)]
Doc 6 : [(7, 0.99901205)]
Doc 7 : [(1, 0.9993387)]
Doc 8 : [(7, 0.13536003), (8, 0.86456543)]
Doc 9 : [(4, 0.022431323), (7, 0.8822507), (8, 0.069938906), (9, 0.025176316)]
Doc 10 : [(1, 0.8725095), (2, 0.1184589)]
Doc 11 : [(5, 0.048307106), (6, 0.85210794), (8, 0.09937094)]
Doc 12 : [(4, 0.9991354)]
Doc 13 : [(6, 0.9996692)]
Doc 14 : [(3, 0.019953169), (7, 0.95620304), (9, 0.023457186)]
Doc 15 : [(3, 0.023912033), (4, 0.26207802), (6, 0.71389556)]
Doc 16 : [(1, 0.9992862)]
Doc 17 : [(7, 0.99975)]
Doc 18 : [(2, 0.7755761), (5, 0.16119501), (6, 0.06296783)]
Doc 19 : [(1, 0.8710723), (5, 0.1286116)]
Doc 20 : [(1, 0.3518342), (5, 0.64800864)]
Doc 21 : [(1, 0.031995986), (2, 0.9676928)]
Doc 22 : [(1, 0.9996566)]
Doc 23 : [(5, 0.9996