In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import wordnet
from sklearn.decomposition import NMF,LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models.ldamodel import LdaModel

In [125]:
# Load the selected author's test articles as lowercase, punctuation-free strings.
text = []
authors = 10  # used below as the number of the single author folder that is read
# (a loop over several authors is disabled here: for i in range(authors))
strip_punctuation = str.maketrans('', '', string.punctuation)  # same table for every article
for article_no in range(16, 21):
    article_path = "articles/test/author_" + str(authors) + "/" + str(article_no) + ".txt"
    with open(article_path, 'r', encoding="utf8") as handle:
        raw = handle.read()
    # Drop ASCII punctuation first, then lowercase, and keep one string per article.
    text.append(raw.translate(strip_punctuation).lower())

In [126]:
# Map an nltk POS tag onto the corresponding wordnet POS constant.
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Return the wordnet POS constant selected by the tag's leading letter.

    Tags starting with 'J', 'V', 'N' or 'R' map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; any other tag yields None.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            # Looked up only on a match, so wordnet is untouched for other tags.
            return getattr(wordnet, attr)
    return None

In [127]:
def tokenizer_lemmatizer(sentence):
    """Tokenize ``sentence`` and return its tokens as a list of lemmas.

    Each token is POS-tagged with nltk; the tag is converted with
    nltk_tag_to_wordnet_tag. Tokens whose tag converts to None are kept
    unchanged, all others are lemmatized with that wordnet tag.
    """
    wnl = WordNetLemmatizer()
    # (token, nltk POS tag) pairs for the whole input string
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    lemmas = []
    for token, pos in tagged_tokens:
        wn_pos = nltk_tag_to_wordnet_tag(pos)
        # No usable wordnet tag -> keep the surface form as is.
        lemmas.append(token if wn_pos is None else wnl.lemmatize(token, wn_pos))
    return lemmas

In [128]:
def _feature_names(vectorizer):
    """Return the fitted vectorizer's feature names as a plain list.

    Newer scikit-learn releases expose ``get_feature_names_out`` and no longer
    provide ``get_feature_names``; older releases only have the latter. Using
    whichever exists keeps this cell runnable on both.
    """
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


# NMF is able to use tf-idf weighted features.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# Stored as `tfidf`, the name the (currently disabled) NMF fit refers to.
tfidf = tfidf_vectorizer.fit_transform(text)
tfidf_feature_names = _feature_names(tfidf_vectorizer)

# LDA is a probabilistic graphical model, so it is fitted on raw term counts.
tf_vectorizer = CountVectorizer(stop_words='english')
tf = tf_vectorizer.fit_transform(text)
tf_feature_names = _feature_names(tf_vectorizer)

In [135]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Number of topics to extract.
no_topics = 5

# Run NMF
#nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
# `n_components` is the current name of the topic-count argument; the former
# `n_topics` spelling was deprecated and later removed from scikit-learn.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,  # fixed seed so the fit is repeatable
).fit(tf)




In [136]:
# One row per document, one column per topic, as returned by lda.transform.
df = pd.DataFrame(lda.transform(tf))
# Column-wise mean: the average weight of each topic across the documents.
print(df.mean(axis=0))

0    0.591542
1    0.005371
2    0.197554
3    0.005374
4    0.200160
dtype: float64


In [137]:
def display_topics(model, feature_names, no_top_words):
    """Print every topic's index followed by its highest-weighted feature names.

    model         -- object exposing ``components_``, iterated one row per topic
    feature_names -- sequence indexed by the column positions of each row
    no_top_words  -- how many of the largest-weight features to list per topic
    """
    for topic_idx, weights in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        # argsort is ascending, so reverse it and keep the leading entries.
        top_indices = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in top_indices]
        print (" ".join(top_terms))

# List the ten highest-weighted terms of every LDA topic.
no_top_words = 10
#display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(model=lda, feature_names=tf_feature_names, no_top_words=no_top_words)

Topic 0:
got soccer mom did school game tourney hung nt cool
Topic 1:
kid guys whatnot main dying doing checked dumb jena lot
Topic 2:
games cities ve hey g2g school past just weekend couple
Topic 3:
tourney got place tomorrow heat party played saturday wicked cuz
Topic 4:
summer just tongiht scared wore lot house keely loving missed


In [None]:
# Silence DeprecationWarning messages from this point of the run onward.
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
# Tokenize and lemmatize every loaded document, then show the resulting token lists.
cleaned_text = list(map(tokenizer_lemmatizer, text))
print(cleaned_text)

In [None]:
# Build the gensim dictionary (id <-> token mapping) from the lemmatized documents.
import gensim.corpora as corpora
id2word = corpora.Dictionary(cleaned_text)

# Bag-of-words form of each document, produced by the dictionary's doc2bow.
corpus = [id2word.doc2bow(tokens) for tokens in cleaned_text]

In [None]:
# Readable view of the first document's bag-of-words: ids replaced by their tokens.
[[(id2word[token_id], count) for token_id, count in doc_bow] for doc_bow in corpus[:1]]

In [None]:
from gensim.models.ldamodel import LdaModel
# Fit a 10-topic gensim LDA model on the bag-of-words corpus, with a fixed random_state.
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [None]:
# Print the keywords of the topics learned by lda_model
print(lda_model.print_topics())
# Apply the trained model to the corpus; the result is stored in doc_lda
# (it is not used again in the visible cells).
doc_lda = lda_model[corpus]