In [4]:
import pandas as pd
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# NOTE(review): wildcard import; no name from nltk.stem.porter appears to be
# used in the visible cells -- consider importing explicitly or removing.
from nltk.stem.porter import *
from pprint import pprint
import numpy as np
import nltk
# Fetch the WordNet corpus needed by WordNetLemmatizer; the recorded output
# shows this is a no-op when the package is already up to date.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

数据下载地址:https://www.kaggle.com/therohk/million-headlines/data

In [5]:
# Load the ABC news headlines. `on_bad_lines='warn'` (pandas >= 1.3) replaces
# the `error_bad_lines=False` flag, which was removed in pandas 2.0, and keeps
# its behaviour: malformed rows are reported and skipped instead of aborting.
data = pd.read_csv('./data/abcnews-date-text.csv', on_bad_lines='warn')

# Take an explicit copy so the column added below lands on an independent
# frame rather than on a slice of `data` (avoids SettingWithCopyWarning).
data_text = data[['headline_text']].copy()
data_text['index'] = data_text.index  # keep the row label as a document id
documents = data_text

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Report how many headlines were loaded, then preview the first five rows.
print(documents.shape[0])
print(documents.head(5))

1103663
                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


###  Lemmatize example

In [7]:
# Lemmatise an irregular past-tense form; pos='v' asks for the verb lemma.
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('went', pos='v'))

go


### Stemmer Example

In [8]:
# Demonstrate the English Snowball stemmer on a handful of inflected words.
stemmer = SnowballStemmer('english')
original_words = [
    'caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed',
    'owned', 'humbled', 'sized', 'meeting', 'stating', 'siezing',
    'itemization', 'sensational', 'traditional', 'reference', 'colonizer',
    'plotted',
]
singles = list(map(stemmer.stem, original_words))
# Side-by-side table of each word and its stem (displayed as the cell output).
pd.DataFrame({'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [9]:
stemmer = SnowballStemmer('english')
# Built once and shared: previously a new WordNetLemmatizer was constructed
# for every single token passed to `lemmatize_stemming`.
wordnet_lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Lemmatise a token as a verb, then reduce the lemma to its stem.

    Args:
        text: A single token (string).

    Returns:
        The Snowball stem of the verb lemma of `text`.
    """
    return stemmer.stem(wordnet_lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenise `text` and return the normalised tokens worth keeping.

    Tokens that are gensim stop words, or that are three characters long or
    shorter, are discarded; every remaining token is passed through
    `lemmatize_stemming`.

    Args:
        text: Raw document text.

    Returns:
        List of lemmatised and stemmed tokens, in their original order.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stop_words
    ]

In [10]:
# Pick the headline whose stored document id is 100.
doc_sample = documents.loc[documents['index'] == 100, 'headline_text'].iloc[0]

# Show the raw whitespace-split words next to the preprocessed tokens.
print('original document: ')
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['more', 'women', 'urged', 'to', 'become', 'councillors']


 tokenized and lemmatized document: 
['women', 'urg', 'councillor']


In [11]:
# Run the preprocessing pipeline over every headline.
processed_docs = documents['headline_text'].apply(preprocess)
processed_docs.head(10)

0            [decid, communiti, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

### Bag of words on the dataset

In [12]:
# Build the token <-> integer id mapping from the preprocessed headlines.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Show the tokens behind the first ten ids, then the vocabulary size.
for token_id in range(10):
    print(token_id, dictionary[token_id])
print('number of total words:', len(dictionary))

0 decid
1 communiti
2 licenc
3 broadcast
4 awar
5 defam
6 wit
7 call
8 summit
9 infrastructur
number of total words: 62240


In [13]:
# Print the `dictionary.dfs` entry (document frequency) for the first ten ids.
doc_freqs = dictionary.dfs
for token_id in range(10):
    print(token_id, doc_freqs[token_id])



0 1319
1 5640
2 1181
3 375
4 543
5 372
6 1705
7 9558
8 1041
9 1060


In [14]:
# Prune the vocabulary in place:
#   no_below=15   -> drop tokens that occur in fewer than 15 headlines
#   no_above=0.5  -> drop tokens that occur in more than 50% of headlines
#   keep_n=100000 -> afterwards keep at most the 100000 most frequent tokens
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# (A bare `len(dictionary)` expression used to sit here; it was never
# displayed because it was not the last statement of the cell.)
print('number of total words:', len(dictionary))

number of total words: 14142


In [15]:
# Convert every preprocessed headline into a sparse (token_id, count) list.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
# Inspect one arbitrary document as a sanity check.
bow_corpus[4310]

[(2963, 1), (3393, 1), (4956, 1), (10226, 1)]

In [16]:
# `bow_corpus` was already built in the previous cell from the same
# `dictionary` and `processed_docs`; rebuilding it here only repeated a full
# pass over every headline, so the existing corpus is reused.
print(processed_docs[0])
print(bow_corpus[0])

['decid', 'communiti', 'broadcast', 'licenc']
[(5196, 1), (5568, 1), (6334, 1), (8141, 1)]


In [17]:
# Spell out each (token_id, count) pair of the first document in words.
bow_doc_0 = bow_corpus[0]
for token_id, count in bow_doc_0:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], count))

Word 5196 ("decid") appears 1 time.
Word 5568 ("communiti") appears 1 time.
Word 6334 ("licenc") appears 1 time.
Word 8141 ("broadcast") appears 1 time.


### TF-IDF

In [18]:
# Fit TF-IDF weights on the bag-of-words corpus and wrap the corpus so that
# documents are re-weighted as they are read.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Preview only the first re-weighted document (nothing is printed if the
# corpus is empty).
first_doc = next(iter(corpus_tfidf), None)
if first_doc is not None:
    pprint(first_doc)

[(5196, 0.4964985175717023),
 (5568, 0.38929654337861147),
 (6334, 0.5046520327464028),
 (8141, 0.5892908867507543)]


### Running LDA using Bag of Words

In [19]:
# Train a 10-topic LDA model on raw term counts. `random_state` pins the
# random initialisation, which was previously unseeded, so reruns start from
# the same point.
# NOTE(review): with several workers the result may still differ slightly
# between runs -- confirm if exact reproducibility matters.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=42,
)
# -1 asks for every topic rather than a subset.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.016*"council" + 0.014*"plan" + 0.013*"tasmanian" + 0.012*"turnbul" + 0.011*"work" + 0.011*"servic" + 0.010*"say" + 0.009*"park" + 0.009*"govern" + 0.008*"green"
Topic: 1 
Words: 0.026*"year" + 0.019*"market" + 0.017*"live" + 0.017*"australian" + 0.015*"rise" + 0.013*"price" + 0.013*"share" + 0.013*"farmer" + 0.012*"busi" + 0.011*"bank"
Topic: 2 
Words: 0.022*"govern" + 0.017*"elect" + 0.017*"interview" + 0.014*"say" + 0.013*"labor" + 0.013*"school" + 0.012*"leagu" + 0.012*"fund" + 0.012*"feder" + 0.011*"budget"
Topic: 3 
Words: 0.029*"melbourn" + 0.011*"hold" + 0.011*"game" + 0.009*"turn" + 0.008*"go" + 0.008*"video" + 0.007*"care" + 0.007*"climat" + 0.007*"season" + 0.007*"star"
Topic: 4 
Words: 0.022*"nation" + 0.021*"report" + 0.016*"rural" + 0.013*"health" + 0.012*"concern" + 0.011*"chang" + 0.010*"senat" + 0.009*"find" + 0.009*"parti" + 0.009*"liber"
Topic: 5 
Words: 0.049*"australia" + 0.020*"south" + 0.019*"north" + 0.018*"coast" + 0.015*"final" + 0.014*"china

### Running LDA using TF-IDF

In [20]:
# Train a 10-topic LDA model on the TF-IDF weighted corpus. `random_state`
# pins the random initialisation, which was previously unseeded, so reruns
# start from the same point.
# NOTE(review): with several workers the result may still differ slightly
# between runs -- confirm if exact reproducibility matters.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=42,
)

# -1 asks for every topic rather than a subset.
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.010*"govern" + 0.008*"health" + 0.007*"rural" + 0.007*"fund" + 0.006*"drum" + 0.006*"council" + 0.006*"plan" + 0.005*"budget" + 0.005*"say" + 0.005*"chang"
Topic: 1 Word: 0.017*"polic" + 0.017*"charg" + 0.014*"murder" + 0.013*"crash" + 0.012*"woman" + 0.010*"court" + 0.010*"alleg" + 0.010*"jail" + 0.010*"death" + 0.009*"shoot"
Topic: 2 Word: 0.010*"leagu" + 0.008*"hill" + 0.008*"australia" + 0.007*"world" + 0.006*"rugbi" + 0.006*"octob" + 0.006*"novemb" + 0.006*"john" + 0.006*"peter" + 0.006*"tuesday"
Topic: 3 Word: 0.014*"interview" + 0.007*"final" + 0.007*"open" + 0.007*"septemb" + 0.007*"monday" + 0.006*"thursday" + 0.006*"climat" + 0.005*"miss" + 0.005*"april" + 0.005*"shark"
Topic: 4 Word: 0.007*"juli" + 0.007*"australia" + 0.006*"south" + 0.006*"korea" + 0.006*"june" + 0.005*"fiji" + 0.005*"china" + 0.004*"north" + 0.004*"russia" + 0.004*"protest"
Topic: 5 Word: 0.007*"asylum" + 0.006*"seeker" + 0.006*"island" + 0.006*"dairi" + 0.004*"firefight" + 0.004*"energi" 

### Performance evaluation by classifying sample document using LDA Bag of Words model

In [21]:
# Score one training headline against every topic of the bag-of-words model.
docId = 100
print(documents['headline_text'][docId])
print(processed_docs[docId])
print('-------------------------------------')
# Highest-scoring topic first.
ranked_topics = sorted(lda_model[bow_corpus[docId]], key=lambda pair: pair[1], reverse=True)
for topic_id, score in ranked_topics:
    print("\nScore: {}\t \nTopic{}: {}".format(score, topic_id, lda_model.print_topic(topic_id, 10)))

more women urged to become councillors
['women', 'urg', 'councillor']
-------------------------------------

Score: 0.7749839460818331	 
Topic0: 0.016*"council" + 0.014*"plan" + 0.013*"tasmanian" + 0.012*"turnbul" + 0.011*"work" + 0.011*"servic" + 0.010*"say" + 0.009*"park" + 0.009*"govern" + 0.008*"green"

Score: 0.025004759463502946	 
Topic6: 0.033*"queensland" + 0.027*"adelaid" + 0.024*"home" + 0.023*"open" + 0.018*"donald" + 0.018*"win" + 0.015*"break" + 0.014*"take" + 0.012*"lead" + 0.011*"hobart"

Score: 0.025003833795233864	 
Topic4: 0.022*"nation" + 0.021*"report" + 0.016*"rural" + 0.013*"health" + 0.012*"concern" + 0.011*"chang" + 0.010*"senat" + 0.009*"find" + 0.009*"parti" + 0.009*"liber"

Score: 0.02500318640073601	 
Topic7: 0.020*"test" + 0.019*"countri" + 0.016*"attack" + 0.014*"claim" + 0.013*"kill" + 0.013*"protest" + 0.012*"say" + 0.010*"island" + 0.010*"announc" + 0.010*"minist"

Score: 0.025001921876355325	 
Topic2: 0.022*"govern" + 0.017*"elect" + 0.017*"interview" 

### Performance evaluation by classifying sample document using LDA TF-IDF model

In [22]:
# Score the same headline (`docId` is set in the previous cell) against every
# topic of the TF-IDF model.
# NOTE(review): the model was trained on `corpus_tfidf`, but raw counts from
# `bow_corpus` are passed here -- confirm this is intended.
print(documents['headline_text'][docId])
print(processed_docs[docId])
print('-------------------------------------')
# Highest-scoring topic first.
ranked_topics = sorted(lda_model_tfidf[bow_corpus[docId]], key=lambda pair: pair[1], reverse=True)
for topic_id, score in ranked_topics:
    print("\nScore: {}\t \nTopic{}: {}".format(score, topic_id, lda_model_tfidf.print_topic(topic_id, 10)))

more women urged to become councillors
['women', 'urg', 'councillor']
-------------------------------------

Score: 0.7749672025110785	 
Topic0: 0.010*"govern" + 0.008*"health" + 0.007*"rural" + 0.007*"fund" + 0.006*"drum" + 0.006*"council" + 0.006*"plan" + 0.005*"budget" + 0.005*"say" + 0.005*"chang"

Score: 0.02501242505708388	 
Topic8: 0.024*"countri" + 0.022*"hour" + 0.011*"market" + 0.011*"turnbul" + 0.008*"weather" + 0.008*"share" + 0.007*"live" + 0.006*"price" + 0.005*"kid" + 0.005*"wall"

Score: 0.025005198952628434	 
Topic2: 0.010*"leagu" + 0.008*"hill" + 0.008*"australia" + 0.007*"world" + 0.006*"rugbi" + 0.006*"octob" + 0.006*"novemb" + 0.006*"john" + 0.006*"peter" + 0.006*"tuesday"

Score: 0.025002765044766295	 
Topic4: 0.007*"juli" + 0.007*"australia" + 0.006*"south" + 0.006*"korea" + 0.006*"june" + 0.005*"fiji" + 0.005*"china" + 0.004*"north" + 0.004*"russia" + 0.004*"protest"

Score: 0.02500241208928745	 
Topic7: 0.024*"trump" + 0.010*"grandstand" + 0.006*"histori" + 0.0

### Testing model on unseen document

In [23]:
# Classify a headline that was not part of the training data with both models.
unseen_document = "Five bystanders shot during police shootout in New Orleans"
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

# Topics assigned by the bag-of-words model, highest score first.
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
print('---------------------------------------------------------------------------------------------')
# Topics assigned by the TF-IDF model, highest score first. The description
# must be looked up in the same model that produced `index`; it was
# previously taken from `lda_model`, so each TF-IDF score was labelled with
# an unrelated bag-of-words topic.
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

Score: 0.3500005565346774	 Topic: 0.054*"polic" + 0.031*"sydney" + 0.020*"crash" + 0.020*"perth" + 0.018*"die"
Score: 0.1833332196787608	 Topic: 0.016*"council" + 0.014*"plan" + 0.013*"tasmanian" + 0.012*"turnbul" + 0.011*"work"
Score: 0.18333311398038923	 Topic: 0.033*"queensland" + 0.027*"adelaid" + 0.024*"home" + 0.023*"open" + 0.018*"donald"
Score: 0.18333310974357916	 Topic: 0.020*"test" + 0.019*"countri" + 0.016*"attack" + 0.014*"claim" + 0.013*"kill"
Score: 0.016666666678315654	 Topic: 0.029*"melbourn" + 0.011*"hold" + 0.011*"game" + 0.009*"turn" + 0.008*"go"
Score: 0.016666666677291653	 Topic: 0.033*"trump" + 0.031*"charg" + 0.030*"court" + 0.023*"murder" + 0.019*"face"
Score: 0.016666666677209844	 Topic: 0.026*"year" + 0.019*"market" + 0.017*"live" + 0.017*"australian" + 0.015*"rise"
Score: 0.01666666667708792	 Topic: 0.022*"govern" + 0.017*"elect" + 0.017*"interview" + 0.014*"say" + 0.013*"labor"
Score: 0.016666666676871875	 Topic: 0.022*"nation" + 0.021*"report" + 0.016*"rur