In [1]:
# Based on code from https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

import nltk
import spacy
import gensim
import random
import pickle
import pandas as pd
from gensim import corpora
from spacy.lang.en import English
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

# Language configuration. The notebook is currently set up for Spanish.
# To process another language, load 'en_core_web_sm', 'fr_core_news_sm' or
# 'pt_core_news_sm' here and pass 'english', 'french' or 'portuguese' to
# stopwords.words() below.
spacy.load('es_core_news_sm')  # the returned pipeline is not kept

# NLTK resources: WordNet is queried by the lemma helpers, and the stop-word
# corpus feeds stop_list.
nltk.download('wordnet')
nltk.download('stopwords')

# Stop words dropped during preprocessing: the NLTK list for the language plus
# the pandemic terms that would otherwise dominate every topic.
stop_list = set(nltk.corpus.stopwords.words('spanish')) | {
    'coronavirus',
    'covid19',
    'corona',
    'covid-19',
}

# Tokenizer used by tokenize().
# NOTE(review): this is the English tokenizer although the stop words are
# Spanish - confirm that this pairing is intended.
parser = English()
def tokenize(text):
    """Split ``text`` with the module-level ``parser`` and normalise the tokens.

    Whitespace-only tokens are dropped. Of the remaining tokens, URL-like ones
    become the literal 'URL', tokens starting with '@' become the empty
    string, and everything else is replaced by its lowercase form.

    Returns:
        list of str, in the order the tokens appear in ``text``.
    """
    def _normalise(token):
        # URL detection takes precedence over the '@' check.
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return ''
        return token.lower_

    return [
        _normalise(token)
        for token in parser(text)
        if not token.orth_.isspace()
    ]

def get_lemma(word):
    """Return the base form WordNet's ``morphy`` finds for ``word``.

    Falls back to ``word`` itself when ``morphy`` returns None.
    """
    base_form = wn.morphy(word)
    return base_form if base_form is not None else word
    
def get_lemma2(word):
    """Lemmatise ``word`` with a freshly created NLTK ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

def prepare_text_for_lda(text):
    """Turn raw ``text`` into the list of tokens fed to the LDA model.

    The text is tokenised with ``tokenize``; tokens of four characters or
    fewer and tokens found in ``stop_list`` are discarded, and each surviving
    token is passed through ``get_lemma``.

    Returns:
        list of str, in the original token order.
    """
    prepared = []
    for token in tokenize(text):
        # Length is checked first, then stop-word membership.
        if len(token) <= 4 or token in stop_list:
            continue
        prepared.append(get_lemma(token))
    return prepared

[nltk_data] Downloading package wordnet to /Users/fv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/fv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def _sample_documents(texts, sampled, rng):
    """Tokenise a random ~1% of ``texts`` into ``sampled``; return the number of texts seen."""
    seen = 0
    for text in texts:
        seen += 1
        # Exactly one draw per document, taken *before* tokenising: the kept
        # documents are the same as when every document is tokenised first,
        # but the ~99% that are discarded never reach the tokenizer.
        if rng.random() > .99:
            sampled.append(prepare_text_for_lda(text))
    return seen


def get_topics(name, num_topics, seed=None):
    """Fit an LDA topic model on a random ~1% sample of the texts stored for ``name``.

    Texts are read from the 'text' column of two files:
    'data/qcri/<name>_geo.json' (JSON lines) and 'data/out/<name>.csv'
    (tab-separated). Each text is kept when a uniform random draw exceeds
    0.99 and is then preprocessed with ``prepare_text_for_lda``.

    Side effects:
        * prints the total number of texts read, then one line per topic
          (15 words each);
        * writes 'models/corpus.pkl' (pickled bag-of-words corpus),
          'models/dictionary.gensim' and 'models/model<num_topics>.gensim'.

    Args:
        name: stem used to build the two input file names.
        num_topics: number of LDA topics to fit.
        seed: optional int. When given, sampling uses its own
            ``random.Random(seed)`` and the same value is passed to the LDA
            model as ``random_state``, making the run repeatable. When None
            (default), the module-level ``random`` generator is used and
            ``random_state`` is left at None.

    Returns:
        None.
    """
    rng = random if seed is None else random.Random(seed)
    text_data = []
    count = 0

    tweets = pd.read_json('data/qcri/' + name + '_geo.json', lines=True)
    count += _sample_documents(tweets['text'], text_data, rng)

    # Tab-separated file. The separator is passed by keyword only: a stray
    # positional 'rb' is taken by pandas as `sep`, which conflicts with an
    # explicit `delimiter` (they are aliases of one another).
    tweets = pd.read_csv('data/out/' + name + '.csv', sep='\t')
    count += _sample_documents(tweets['text'], text_data, rng)

    print(count)
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]

    # Context manager so the pickle file is flushed and closed deterministically.
    with open('models/corpus.pkl', 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)
    dictionary.save('models/dictionary.gensim')

    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15,
        random_state=seed,
    )
    ldamodel.save('models/model' + str(num_topics) + '.gensim')
    for topic in ldamodel.print_topics(num_words=15):
        print(topic)

In [3]:
# Fit a 5-topic LDA model on the 'Sanidadgob' data files; get_topics prints the
# number of texts read followed by the fitted topics.
get_topics('Sanidadgob', 5)

63765
(0, '0.007*"aclara" + 0.007*"covidー19" + 0.006*"recomendaciones" + 0.006*"ministerio" + 0.005*"sanidad" + 0.005*"medidas" + 0.005*"contacto" + 0.005*"frente" + 0.005*"capacidad" + 0.005*"respecto" + 0.005*"madrid" + 0.004*"siempre" + 0.004*"casos" + 0.004*"menos" + 0.004*"mascarillas"')
(1, '0.009*"estevirusloparamosunidos" + 0.007*"españa" + 0.007*"riesgo" + 0.006*"recomendaciones" + 0.006*"información" + 0.005*"salud" + 0.005*"covidー19" + 0.005*"persona" + 0.004*"yomequedoencasa" + 0.004*"personal" + 0.004*"sanitarios" + 0.004*"población" + 0.004*"covid_19" + 0.003*"casos" + 0.003*"pacientes"')
(2, '0.007*"sanidad" + 0.007*"ministro" + 0.007*"estevirusloparamosunidos" + 0.006*"medidas" + 0.006*"enfermos" + 0.006*"fernando" + 0.006*"síntomas" + 0.006*"simón" + 0.005*"hospital" + 0.005*"epidemia" + 0.005*"pandemia" + 0.004*"deben" + 0.004*"muerto" + 0.004*"realidad" + 0.004*"alguien"')
(3, '0.030*"fallecidos" + 0.030*"datos" + 0.027*"curados" + 0.025*"estevirusloparamosunidos" + 