# Mining Topics

## Text Processing 

In [1]:
# Import the libraries needed
from nlp_functions import text_processing as tp
from nlp_functions import word_association as wa
from nlp_functions import article_segmentation as arse
from nlp_functions import mining_topics as mt
import numpy as np

## Mining Articles and Titles

In [2]:
# Both loaders read from the same corpus directory, so spell it out once.
corpus_dir = './../EXCELSIOR_100_files/'

# NOTE(review): the trailing 0 is forwarded unchanged to both helpers; its
# meaning is defined in article_segmentation — confirm there.
titles = arse.get_titles(corpus_dir, './titles/', 0)
articles = arse.get_articles(corpus_dir, 0)

### Normalize articles

In [3]:
# Apply the sentence tokenizer to every article, keeping one result per article.
tokenize_articles = list(map(tp.sentence_tokenize, articles))

In [4]:
# The same stop-word file is passed for every article.
stopwords_path = './nlp_functions/stopwords_and_lemmas/stopwords_es.txt'
clean_articles = [
    tp.delete_stop_words_sents(sentences, stopwords_path)
    for sentences in tokenize_articles
]

In [5]:
# Lemmatize each stop-word-free article using the same lemma file.
# Written as a comprehension to match the tokenize/clean steps and to avoid
# leaving a stray loop variable in the notebook namespace.
lemmatize_articles = [
    tp.lemmatize_sents(article, './nlp_functions/stopwords_and_lemmas/generate.txt')
    for article in clean_articles
]

## Mining Topics

## Distribution of Topics in Documents

In [6]:
# Build the vocabulary from the lemmatized articles.
vocabulary = arse.get_vocabulary_from_articles(lemmatize_articles)
# Background word probabilities are computed from all articles; the topic
# probabilities computed here are the first argument passed to mt.em below.
probs_word_background = mt.get_probs_word_background(lemmatize_articles, vocabulary)
probs_word_topic = mt.get_probs_word_topic(vocabulary)
# Only the article at index 1 is analysed in this cell.
article_2 = lemmatize_articles[1]
counts_article_2 = mt.get_counts_article(article_2, vocabulary)
len_voc = len(vocabulary)
# Rebinds probs_word_topic to the value returned by mt.em. The recorded output
# shows one log-likelihood line per iteration (presumably printed by mt.em — confirm).
probs_word_topic = mt.em(probs_word_topic, probs_word_background, counts_article_2, len_voc)

Iteration 0 :
	Log Maximum Likelihood: -2028.225242802328
Iteration 1 :
	Log Maximum Likelihood: -1581.086917581526
Iteration 2 :
	Log Maximum Likelihood: -1562.0612377887032
Iteration 3 :
	Log Maximum Likelihood: -1559.9733431122038
Iteration 4 :
	Log Maximum Likelihood: -1559.4204121037308
Iteration 5 :
	Log Maximum Likelihood: -1559.252306241006
Iteration 6 :
	Log Maximum Likelihood: -1559.186444088226
Iteration 7 :
	Log Maximum Likelihood: -1559.1531260448373
Iteration 8 :
	Log Maximum Likelihood: -1559.1330691694293
Iteration 9 :
	Log Maximum Likelihood: -1559.1196783898877
Iteration 10 :
	Log Maximum Likelihood: -1559.1102086097587
Iteration 11 :
	Log Maximum Likelihood: -1559.1032840492987
Iteration 12 :
	Log Maximum Likelihood: -1559.098100454588
Iteration 13 :
	Log Maximum Likelihood: -1559.0941411763363
Iteration 14 :
	Log Maximum Likelihood: -1559.0910591090274
Iteration 15 :
	Log Maximum Likelihood: -1559.0886163830605
Iteration 16 :
	Log Maximum Likelihood: -1559.086648027

In [16]:
# Pair each vocabulary word with its background probability, sort the result,
# and preview the first ten entries.
dist_background = mt.sort_dict(mt.create_dict(vocabulary, probs_word_background))
dist_background[:10]

[('poder', 0.028153564899451554),
 ('año', 0.022851919561243144),
 ('hacer', 0.02230347349177331),
 ('millón', 0.02120658135283364),
 ('decir', 0.01809872029250457),
 ('empresa', 0.01736745886654479),
 ('méxico', 0.015539305301645339),
 ('país', 0.015539305301645339),
 ('gobierno', 0.015173674588665448),
 ('abril', 0.014990859232175503)]

In [8]:
# Pair each vocabulary word with its topic probability and sort the result.
dist_topic = mt.sort_dict(mt.create_dict(vocabulary, probs_word_topic))

In [9]:
# Preview the first ten entries of the sorted topic distribution.
dist_topic[:10]

[('mussolini', 0.029176898398818058),
 ('italia', 0.024314081999015046),
 ('duce', 0.01945126559921204),
 ('italiano', 0.014588449199409029),
 ('scalfaro', 0.014588449199409029),
 ('alemán', 0.01422281532580865),
 ('romo', 0.0140399983245969),
 ('julio', 0.013308729847111437),
 ('extraordinario', 0.00972563279960602),
 ('hitler', 0.00972563279960602)]

Implementación del mismo algoritmo usando la librería Gensim

In [10]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim import corpora

## Obtención de artículos

In [11]:
# Reload titles and articles for the Gensim section; these are the same two
# calls, with the same arguments, as the first load in this notebook.
source_dir = './../EXCELSIOR_100_files/'
titles = arse.get_titles(source_dir, './titles/', 0)
articles = arse.get_articles(source_dir, 0)

Normalización

In [12]:
# Word-level normalization: tokenize, drop stop words, then lemmatize.
# NOTE: this rebinds tokenize_articles, clean_articles and lemmatize_articles,
# which the first section of the notebook populated via the *_sents helpers.
stopwords_file = './nlp_functions/stopwords_and_lemmas/stopwords_es.txt'
lemmas_file = './nlp_functions/stopwords_and_lemmas/generate.txt'

tokenize_articles = list(map(tp.word_tokenize, articles))
clean_articles = [
    tp.delete_stop_words(tokens, stopwords_file)
    for tokens in tokenize_articles
]
lemmatize_articles = [
    tp.lemmatize(tokens, lemmas_file)
    for tokens in clean_articles
]

## Mining topics


In [15]:
# Map every distinct token in the lemmatized articles to an integer id.
dictionary = corpora.Dictionary(lemmatize_articles)

# Bag-of-words corpus: one list of (token_id, count) pairs per article.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in lemmatize_articles]

# Alias kept as a notebook-level name.
Lda = gensim.models.ldamodel.LdaModel

# LDA training is stochastic. Fixing random_state makes the learned topics
# reproducible across kernel restarts; without it every run yields different
# topics and the recorded output cannot be regenerated.
ldamodel = Lda(
    doc_term_matrix,
    num_topics=4,
    id2word=dictionary,
    passes=500,
    random_state=42,
)

In [17]:
# Print the ten highest-weighted words for each of the four topics.
topic_summaries = ldamodel.print_topics(num_topics=4, num_words=10)
print(topic_summaries)

[(0, '0.009*"millón" + 0.008*"año" + 0.007*"empresa" + 0.006*"financiero" + 0.005*"poder" + 0.005*"méxico" + 0.005*"país" + 0.005*"dólar" + 0.004*"mil" + 0.004*"leche"'), (1, '0.006*"gobierno" + 0.006*"poder" + 0.005*"hacer" + 0.005*"peso" + 0.004*"decir" + 0.004*"internet" + 0.004*"empresa" + 0.004*"tierra" + 0.004*"trabajador" + 0.004*"hectárea"'), (2, '0.007*"poder" + 0.004*"decir" + 0.004*"hacer" + 0.004*"si" + 0.004*"justicia" + 0.004*"público" + 0.003*"país" + 0.003*"política" + 0.003*"año" + 0.003*"méxico"'), (3, '0.007*"internet" + 0.007*"edición" + 0.006*"excelsior" + 0.006*"hacer" + 0.004*"información" + 0.003*"red" + 0.003*"día" + 0.003*"medio" + 0.003*"dar" + 0.003*"siguiente"')]
