In [1]:
import re
import nltk
import string
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
def pre_processing(sentence):
    return remove_stopwords(tokenize_sentence(remove_punctuation(sentence)))

# Remove punctuation from a list of words
def remove_punctuation(sentence):
    return re.sub(r'[^\w\s]', '', sentence)

# Remove stopwords from a list of words
def remove_stopwords(words_list):
    stopwords = open("stop_words_FULL.txt", "r")
    stopwords_list = []
    for word in stopwords:
        stopwords_list.append(word.replace('\n', ''))
    stopwords.close()
    return [value.lower() for value in words_list if value.lower() not in stopwords_list]

# Tokenize the input sentence and also lemmatize its words
def tokenize_sentence(sentence):
    words_list = []
    lmtzr = WordNetLemmatizer()
    for tag in nltk.pos_tag(word_tokenize(sentence)):
        if (tag[1][:2] == "NN"):
            words_list.append(lmtzr.lemmatize(tag[0], pos=wn.NOUN))
        elif (tag[1][:2] == "VB"):
            words_list.append(lmtzr.lemmatize(tag[0], pos=wn.VERB))
        elif (tag[1][:2] == "RB"):
            words_list.append(lmtzr.lemmatize(tag[0], pos=wn.ADV))
        elif (tag[1][:2] == "JJ"):
            words_list.append(lmtzr.lemmatize(tag[0], pos=wn.ADJ))
    return words_list

def get_signature(sense):
    signature = []
    for word in tokenize_sentence(sense.definition()):  # definition tokenization
        signature.append(word)
    for example in sense.examples():  # example tokenization
        for word in tokenize_sentence(example):
            # Merge definition and examples
            signature.append(word)
    return signature

In [3]:
# Topic modelling using gensim
from gensim import corpora, models

FILE = 'italian_cuisine_2.txt'  #file contains a set of documents divided by paragraphs
TOPIC_NUMBER = 10
WORDS_FOR_TOPIC = 5

with open(FILE, encoding='utf-8') as file:
    # for each doc create list of pre-processed words in that doc (list of lists)
    list_document_words = []

    for line in file:
        if "<doc" in line:  # tag for new doc
            document_words = []  # list of words that will be part of the document
            while True:
                next_line = file.readline()  # read next line

                #remove unuseful tags
                new_next_line = next_line.replace("<p> ", "").replace("</p>\n", "").replace("/p", "")

                if "</doc>" in next_line:
                    break
                #pre-processing steps
                sentence_words = pre_processing(new_next_line)
                document_words.extend(sentence_words)
            list_document_words.append(document_words)
    file.close()

    print("Numero di documenti: ", len(list_document_words))

    # Create a dict with integer keys for all words
    dictionary_LDA = corpora.Dictionary(list_document_words)

    # delete all terms that do NOT appear in at least 3 documents.
    #delete all terms that appear in more than 50% of documents (see filter_extremes official doc).
    dictionary_LDA.filter_extremes(no_below=3)

    # Converts each document into a list of BoW (list of (id_term, term_frequency) for each term in doc)
    corpus = [dictionary_LDA.doc2bow(document_words) for document_words in list_document_words]

    lda_model = models.LdaModel(corpus, num_topics=TOPIC_NUMBER, \
                                id2word=dictionary_LDA, \
                                passes=4, alpha=[0.01] * TOPIC_NUMBER, \
                                eta=[0.01] * len(dictionary_LDA.keys()))

    for i, topic in lda_model.show_topics(formatted=True, num_topics=TOPIC_NUMBER, num_words=WORDS_FOR_TOPIC):
        print(str(i) + ": " + topic)
        print()

    # probability distribution of topics in the document 5
    print(lda_model[corpus[5]])  # corpus[0] means the first document.

Numero di documenti:  310
0: 0.040*"knife" + 0.014*"blade" + 0.010*"steel" + 0.010*"chef" + 0.009*"cast"

1: 0.019*"wedding" + 0.014*"italian" + 0.012*"bride" + 0.010*"chef" + 0.010*"christmas"

2: 0.021*"sausage" + 0.017*"pan" + 0.013*"italian" + 0.011*"dough" + 0.010*"hair"

3: 0.018*"chef" + 0.018*"blog" + 0.018*"italian" + 0.017*"chicken" + 0.007*"pan"

4: 0.013*"meat" + 0.011*"pork" + 0.009*"italian" + 0.009*"chop" + 0.009*"onion"

5: 0.016*"subscribe" + 0.014*"email" + 0.012*"newsletter" + 0.012*"steak" + 0.010*"feature"

6: 0.022*"italian" + 0.018*"cake" + 0.008*"dish" + 0.008*"love" + 0.007*"chef"

7: 0.020*"sauce" + 0.011*"sushi" + 0.008*"tomato" + 0.008*"italian" + 0.008*"roll"

8: 0.017*"cuisine" + 0.014*"dish" + 0.013*"italian" + 0.009*"include" + 0.007*"italy"

9: 0.044*"chop" + 0.042*"pork" + 0.013*"pan" + 0.009*"side" + 0.009*"oil"

[(1, 0.37192294), (9, 0.6272611)]
