In [1]:
# download news group data set from sklearn
from sklearn.datasets import fetch_20newsgroups

In [3]:
#assign train and test data
ng_train = fetch_20newsgroups(subset = 'train',shuffle=True)
ng_test   = fetch_20newsgroups(subset = 'test',shuffle=True)

In [4]:
print(list(ng_train.target_names))

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [5]:
#sample news
ng_train.data[:3]

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [6]:
#shape of the subsets
print(ng_train.filenames.shape,ng_train.target.shape)

(11314,) (11314,)


In [9]:
#Data Preprocessing
#Tokenization
'''
Loading gensim and nltk libraries
'''
#!pip install gensim
import gensim

# Converts into tokens (Alternative to word_tokenize)
from gensim.utils import simple_preprocess

from gensim.parsing.preprocessing import STOPWORDS

from nltk.stem import WordNetLemmatizer, SnowballStemmer
#from nltk.stem import *
import numpy as np
np.random.seed(400)

In [10]:
#testing the lemmatize statment. It should convert past or future tense into present tense
WordNetLemmatizer().lemmatize('mules')

'mule'

In [11]:
#testing the stemming part before preprocessing. This should be changing any plural into singular word
import pandas as pd

stemmer = SnowballStemmer("english")

original_words=['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']

singles= [WordNetLemmatizer().lemmatize(plural,pos='v') for plural in original_words]

pd.DataFrame(data={'Original Words':original_words, 'Lemma':singles})

#Stemma is not performing well 
#singles1= [stemmer.stem(plural) for plural in original_words]
#singles2= [stemmer.stem(plural2) for plural2 in singles]
#stemma = pd.DataFrame(data={'Lemma':singles, 'Stemmed':singles2})

Unnamed: 0,Original Words,Lemma
0,caresses,caress
1,flies,fly
2,dies,die
3,mules,mules
4,denied,deny
5,died,die
6,agreed,agree
7,owned,own
8,humbled,humble
9,sized,size


In [12]:
# writing function for the entire dataset
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text,pos='v'))

#Tokenize and Lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token)>3:
            result.append(lemmatize_stemming(token))
            
    return result

In [13]:
#preview a document before preprocessing

#document_num = 50
doc_sample = 'This disk has failed many times. I would like to get it replaced.'

print("Original Document: ")
words=[]
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original Document: 
['This', 'disk', 'has', 'failed', 'many', 'times.', 'I', 'would', 'like', 'to', 'get', 'it', 'replaced.']


Tokenized and lemmatized document: 
['disk', 'fail', 'time', 'like', 'replac']


In [14]:
Processed_doc=[]
for doc in ng_train.data:
    Processed_doc.append(preprocess(doc))

In [15]:
len(Processed_doc)

11314

In [17]:
print(Processed_doc[11313])

['gun', 'caltech', 'kevin', 'gun', 'subject', 'steal', 'organ', 'california', 'institut', 'technolog', 'pasadena', 'line', 'distribut', 'nntp', 'post', 'host', 'alumni', 'caltech', 'summari', 'steal', 'pasadena', 'blue', 'white', 'honda', 'california', 'plate', 'serial', 'number', 'engin', 'number', 'turn', 'signal', 'mirror', 'light', 'tap', 'track', 'rider', 'session', 'willow', 'spring', 'tomorrow', 'guess', 'miss', 'help', 'babi']


In [20]:
print(Processed_doc[1133])

['tclock', 'orion', 'clock', 'subject', 'final', 'solut', 'gaza', 'nntp', 'post', 'host', 'orion', 'organ', 'univers', 'california', 'irvin', 'line', 'articl', 'center', 'polici', 'research', 'write', 'center', 'polici', 'research', 'subject', 'final', 'solut', 'gaza', 'isra', 'jew', 'fete', 'upris', 'warsaw', 'ghetto', 'fete', 'word', 'formal', 'common', 'refer', 'posit', 'joyous', 'event', 'misus', 'unsettl', 'repress', 'violent', 'mean', 'upris', 'gaza', 'ghetto', 'attempt', 'starv', 'gazan', 'certain', 'abhor', 'isra', 'polici', 'attitud', 'abus', 'palestinian', 'gazan', 'give', 'comparison', 'realiti', 'warsaw', 'ghetto', 'gaza', 'right', 'gazan', 'popul', 'resist', 'occup', 'recogn', 'intern', 'person', 'sens', 'justic', 'intern', 'recogn', 'right', 'occupi', 'entiti', 'maintain', 'order', 'especi', 'face', 'element', 'conscious', 'attempt', 'disrupt', 'civil', 'structur', 'iron', 'intern', 'recogn', 'focus', 'occupi', 'occupi', 'inher', 'conflict', 'israel', 'deni', 'gazan', 'op

In [18]:
#bag of words on data set
dictionary=gensim.corpora.Dictionary(Processed_doc)

In [19]:
print(dictionary)

Dictionary(61411 unique tokens: ['addit', 'bodi', 'bricklin', 'bring', 'bumper']...)


In [21]:
#Lets see if dictionary created sucessfully
count=0
for k,v in dictionary.iteritems():
    print(k,v)
    count+=1
    if count >10:
        break

0 addit
1 bodi
2 bricklin
3 bring
4 bumper
5 call
6 colleg
7 door
8 earli
9 engin
10 enlighten


In [23]:
#Remove rare and repeatative words
dictionary.filter_extremes(no_below=15,no_above=0.1,keep_n=100000)

In [24]:
print(dictionary)

Dictionary(6535 unique tokens: ['addit', 'bodi', 'bring', 'bumper', 'call']...)


In [27]:
#Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
#words and how many times those words appear. Save this to 'bow_corpus'
bow_corpus = [dictionary.doc2bow(doc) for doc in Processed_doc]

In [28]:
bow_corpus[4]

[(23, 1),
 (85, 2),
 (112, 1),
 (143, 2),
 (158, 1),
 (159, 1),
 (160, 1),
 (161, 1),
 (162, 2),
 (163, 1),
 (164, 1),
 (165, 1),
 (166, 1),
 (167, 1),
 (168, 1),
 (169, 1),
 (170, 1),
 (171, 1),
 (172, 5),
 (173, 1),
 (174, 1),
 (175, 1),
 (176, 1),
 (177, 2),
 (178, 1),
 (179, 2),
 (180, 1),
 (181, 1),
 (182, 1),
 (183, 1),
 (184, 1),
 (185, 1),
 (186, 1),
 (187, 1),
 (188, 1),
 (189, 1),
 (190, 1),
 (191, 1),
 (192, 1),
 (193, 1),
 (194, 3)]

In [35]:
#preview 
document_num = 4
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], 
                                                     dictionary[bow_doc_x[i][0]],
                                                     bow_doc_x[i][1]))

Word 23 ("wonder") appears 1 time.
Word 85 ("expect") appears 2 time.
Word 112 ("real") appears 1 time.
Word 143 ("jonathan") appears 2 time.
Word 158 ("aren") appears 1 time.
Word 159 ("astrophys") appears 1 time.
Word 160 ("baker") appears 1 time.
Word 161 ("basic") appears 1 time.
Word 162 ("bug") appears 2 time.
Word 163 ("cambridg") appears 1 time.
Word 164 ("caution") appears 1 time.
Word 165 ("check") appears 1 time.
Word 166 ("clear") appears 1 time.
Word 167 ("code") appears 1 time.
Word 168 ("condit") appears 1 time.
Word 169 ("crew") appears 1 time.
Word 170 ("curious") appears 1 time.
Word 171 ("dumb") appears 1 time.
Word 172 ("error") appears 5 time.
Word 173 ("harvard") appears 1 time.
Word 174 ("head") appears 1 time.
Word 175 ("ignor") appears 1 time.
Word 176 ("introduc") appears 1 time.
Word 177 ("launch") appears 2 time.
Word 178 ("mcdowel") appears 1 time.
Word 179 ("memori") appears 2 time.
Word 180 ("observatori") appears 1 time.
Word 181 ("pack") appears 1 time.

In [37]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 5, 
                                   id2word = dictionary,                                    
                                   passes = 20,
                                   workers = 2)



In [38]:
for idx, topic in lda_model.print_topics():
    print("Index: {} \nTopic: {}".format(idx, topic ))
    print("\n")

Index: 0 
Topic: 0.007*"govern" + 0.007*"encrypt" + 0.006*"secur" + 0.005*"chip" + 0.005*"presid" + 0.004*"clipper" + 0.004*"public" + 0.004*"israel" + 0.004*"isra" + 0.003*"clinton"


Index: 1 
Topic: 0.012*"game" + 0.010*"team" + 0.010*"space" + 0.008*"nasa" + 0.008*"play" + 0.006*"player" + 0.005*"hockey" + 0.004*"season" + 0.004*"toronto" + 0.004*"orbit"


Index: 2 
Topic: 0.010*"christian" + 0.007*"jesus" + 0.005*"exist" + 0.004*"moral" + 0.004*"word" + 0.004*"bibl" + 0.004*"life" + 0.004*"bike" + 0.004*"religion" + 0.004*"church"


Index: 3 
Topic: 0.010*"window" + 0.009*"file" + 0.007*"drive" + 0.006*"program" + 0.005*"card" + 0.004*"version" + 0.004*"softwar" + 0.004*"imag" + 0.004*"avail" + 0.004*"data"


Index: 4 
Topic: 0.006*"armenian" + 0.005*"govern" + 0.004*"turkish" + 0.004*"jew" + 0.003*"studi" + 0.003*"weapon" + 0.003*"crime" + 0.003*"nation" + 0.003*"human" + 0.003*"food"




In [39]:
num = 5
unseen_document = ng_test.data[num]
print(unseen_document)

From: banschbach@vms.ocom.okstate.edu
Subject: Re: Candida(yeast) Bloom, Fact or Fiction
Organization: OSU College of Osteopathic Medicine
Lines: 91
Nntp-Posting-Host: vms.ocom.okstate.edu

In article <1rp8p1$2d3@usenet.INS.CWRU.Edu>, esd3@po.CWRU.Edu (Elisabeth S. Davidson) writes:
> 
> In a previous article, banschbach@vms.ocom.okstate.edu () says:
>>least a few "enlightened" physicians practicing in the U.S.  It's really 
>>too bad that most U.S. medical schools don't cover nutrition because if 
>>they did, candida would not be viewed as a non-disease by so many in the 
>>medical profession.
> 
> Case Western Reserve Med School teaches nutrition in its own section as
> well as covering it in other sections as they apply (i.e. B12
> deficiency in neuro as a cause of neuropathy, B12 deficiency in
> hematology as a cause of megaloblastic anemia), yet I sill
> hold the viewpoint of mainstream medicine:  candida can cause
> mucocutaneous candidiasis, and, in already very sick patients
> 

In [40]:
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 15)))

Score: 0.11397389322519302	 Topic: 0.010*"christian" + 0.007*"jesus" + 0.005*"exist" + 0.004*"moral" + 0.004*"word" + 0.004*"bibl" + 0.004*"life" + 0.004*"bike" + 0.004*"religion" + 0.004*"church" + 0.003*"claim" + 0.003*"atheist" + 0.003*"true" + 0.003*"faith" + 0.003*"live"
Score: 0.05450720340013504	 Topic: 0.010*"window" + 0.009*"file" + 0.007*"drive" + 0.006*"program" + 0.005*"card" + 0.004*"version" + 0.004*"softwar" + 0.004*"imag" + 0.004*"avail" + 0.004*"data" + 0.004*"driver" + 0.003*"disk" + 0.003*"control" + 0.003*"graphic" + 0.003*"email"
Score: 0.8298696875572205	 Topic: 0.006*"armenian" + 0.005*"govern" + 0.004*"turkish" + 0.004*"jew" + 0.003*"studi" + 0.003*"weapon" + 0.003*"crime" + 0.003*"nation" + 0.003*"human" + 0.003*"food" + 0.003*"health" + 0.003*"live" + 0.003*"control" + 0.003*"muslim" + 0.003*"caus"
