In [1]:
from sklearn.datasets import fetch_20newsgroups
# Fetch the train split first, then the test split, both with shuffle=True.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split_name, shuffle=True)
    for split_name in ('train', 'test')
)

In [2]:
print(list(newsgroups_train.target_names))


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [3]:
newsgroups_train.data[:2]


["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [4]:
print(newsgroups_train.filenames.shape, newsgroups_train.target.shape)


(11314,) (11314,)


In [5]:
'''
Loading Gensim and nltk libraries
'''
# pip install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)

In [6]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SSG\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
print(WordNetLemmatizer().lemmatize('went', pos = 'v')) # past tense to present tense


go


In [8]:
import pandas as pd
# Demonstrate the English Snowball stemmer on a handful of inflected words.
stemmer = SnowballStemmer("english")
original_words = [
    'caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
    'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization', 'sensational',
    'traditional', 'reference', 'colonizer', 'plotted',
]
singles = list(map(stemmer.stem, original_words))

# Side-by-side table of each word and its stem (last expression of the cell).
pd.DataFrame(data={'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [9]:
'''
Write a function to perform the preprocessing steps on the entire dataset
'''
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the lemma.

    The stemming step uses the module-level ``stemmer`` object.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

# Tokenize and lemmatize
def preprocess(text):
    """Tokenize ``text`` and return the lemmatized, stemmed form of every kept token.

    Tokens come from gensim's ``simple_preprocess``. A token is dropped when it
    is in gensim's STOPWORDS set or when it has three characters or fewer;
    each remaining token is passed through ``lemmatize_stemming``.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in stop_words and len(token) > 3
    ]

In [10]:
'''
Preview a document after preprocessing
'''
# NOTE(review): document_num is assigned here but not read anywhere in this cell.
document_num = 50
doc_sample = 'This disk has failed many times. I would like to get it replaced.'

# Plain split on single spaces, shown for comparison with the preprocessed tokens.
words = doc_sample.split(' ')
print("Original document: ")
print(words)

print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original document: 
['This', 'disk', 'has', 'failed', 'many', 'times.', 'I', 'would', 'like', 'to', 'get', 'it', 'replaced.']


Tokenized and lemmatized document: 
['disk', 'fail', 'time', 'like', 'replac']


In [11]:
# Run every raw training document through preprocess(), keeping the original order.
processed_docs = [preprocess(raw_doc) for raw_doc in newsgroups_train.data]

In [12]:
'''
Preview 'processed_docs'
'''
print(processed_docs[:2])

[['lerxst', 'thing', 'subject', 'nntp', 'post', 'host', 'organ', 'univers', 'maryland', 'colleg', 'park', 'line', 'wonder', 'enlighten', 'door', 'sport', 'look', 'late', 'earli', 'call', 'bricklin', 'door', 'small', 'addit', 'bumper', 'separ', 'rest', 'bodi', 'know', 'tellm', 'model', 'engin', 'spec', 'year', 'product', 'histori', 'info', 'funki', 'look', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst'], ['guykuo', 'carson', 'washington', 'subject', 'clock', 'poll', 'final', 'summari', 'final', 'clock', 'report', 'keyword', 'acceler', 'clock', 'upgrad', 'articl', 'shelley', 'qvfo', 'innc', 'organ', 'univers', 'washington', 'line', 'nntp', 'post', 'host', 'carson', 'washington', 'fair', 'number', 'brave', 'soul', 'upgrad', 'clock', 'oscil', 'share', 'experi', 'poll', 'send', 'brief', 'messag', 'detail', 'experi', 'procedur', 'speed', 'attain', 'rat', 'speed', 'card', 'adapt', 'heat', 'sink', 'hour', 'usag', 'floppi', 'disk', 'function', 'floppi', 'especi', 'request', 'summar', 'day',

In [13]:
'''
Create a dictionary from 'processed_docs' containing the number of times a word appears 
in the training set using gensim.corpora.Dictionary and call it 'dictionary'
'''
# Build the gensim Dictionary from all preprocessed training documents.
dictionary = gensim.corpora.Dictionary(documents=processed_docs)

In [14]:
'''
Checking dictionary created
'''
# Print the first 11 (id, token) entries of the dictionary, then stop.
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    if position > 10:
        break
    print(token_id, token)

0 addit
1 bodi
2 bricklin
3 bring
4 bumper
5 call
6 colleg
7 door
8 earli
9 engin
10 enlighten


In [16]:
'''
Create the bag-of-words model: for each document, build a list of (token id, count) pairs recording which
dictionary words appear in it and how many times. Save this to 'bow_corpus'
'''
# One bag-of-words vector per preprocessed document, in the same order as processed_docs.
bow_corpus = [dictionary.doc2bow(tokens) for tokens in processed_docs]

In [17]:
'''
Preview BOW for our sample preprocessed document
'''
document_num = 20
bow_doc_x = bow_corpus[document_num]

# Each entry is a (token id, count) pair; the dictionary maps the id back to its token.
for token_id, occurrences in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], occurrences))

Word 13 ("host") appears 1 time.
Word 18 ("line") appears 1 time.
Word 24 ("nntp") appears 1 time.
Word 25 ("organ") appears 1 time.
Word 27 ("post") appears 1 time.
Word 29 ("rest") appears 1 time.
Word 34 ("subject") appears 1 time.
Word 37 ("thing") appears 5 time.
Word 115 ("give") appears 1 time.
Word 128 ("like") appears 1 time.
Word 138 ("peopl") appears 1 time.
Word 212 ("write") appears 1 time.
Word 221 ("clear") appears 1 time.
Word 338 ("say") appears 1 time.
Word 386 ("think") appears 1 time.
Word 437 ("refer") appears 1 time.
Word 452 ("true") appears 1 time.
Word 504 ("technolog") appears 1 time.
Word 564 ("christian") appears 1 time.
Word 582 ("exampl") appears 1 time.
Word 607 ("jew") appears 1 time.
Word 612 ("lead") appears 1 time.
Word 615 ("littl") appears 3 time.
Word 664 ("wors") appears 2 time.
Word 920 ("keith") appears 3 time.
Word 933 ("punish") appears 1 time.
Word 1016 ("california") appears 1 time.
Word 1083 ("institut") appears 1 time.
Word 1147 ("similar"

In [18]:
# Train the LDA topic model with gensim.models.LdaMulticore and keep it as 'lda_model'.
# (An earlier, commented-out variant of this cell used the single-process
# gensim.models.LdaModel with num_topics=10 and passes=50.)
#
# random_state is passed explicitly (same value as the np.random.seed(400) call in
# the imports cell) so that re-running this cell on its own, or after other cells
# have consumed NumPy's global RNG, starts from the same seed.
# NOTE(review): with several workers the result may still vary slightly between
# runs -- confirm against the gensim documentation if exact reproducibility matters.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=8,
    id2word=dictionary,
    passes=10,
    workers=2,
    random_state=400,
)

In [19]:
'''
For each topic, we will explore the words occurring in that topic and their relative weights
'''
# num_topics=-1 asks for every topic rather than a subset.
for topic_id, topic_terms in lda_model.print_topics(num_topics=-1):
    print("Topic: {} \nWords: {}".format(topic_id, topic_terms))
    print("\n")

Topic: 0 
Words: 0.012*"line" + 0.012*"subject" + 0.012*"organ" + 0.011*"write" + 0.011*"articl" + 0.009*"space" + 0.007*"post" + 0.006*"univers" + 0.005*"nasa" + 0.005*"host"


Topic: 1 
Words: 0.011*"game" + 0.010*"line" + 0.009*"team" + 0.009*"subject" + 0.009*"organ" + 0.008*"year" + 0.007*"write" + 0.007*"play" + 0.006*"player" + 0.006*"articl"


Topic: 2 
Words: 0.008*"christian" + 0.008*"write" + 0.008*"think" + 0.008*"peopl" + 0.007*"subject" + 0.007*"know" + 0.006*"line" + 0.006*"believ" + 0.006*"say" + 0.006*"organ"


Topic: 3 
Words: 0.015*"line" + 0.014*"subject" + 0.013*"organ" + 0.008*"post" + 0.008*"drive" + 0.008*"univers" + 0.008*"window" + 0.007*"host" + 0.007*"nntp" + 0.006*"write"


Topic: 4 
Words: 0.009*"line" + 0.008*"subject" + 0.007*"file" + 0.007*"window" + 0.006*"organ" + 0.006*"program" + 0.006*"imag" + 0.005*"write" + 0.004*"post" + 0.004*"wire"


Topic: 5 
Words: 0.010*"peopl" + 0.006*"say" + 0.006*"write" + 0.006*"state" + 0.005*"go" + 0.005*"right" + 0.0

In [26]:
# Index of the held-out test document to classify; the same index is used
# later in the notebook to look up the document's true target label.
num = 2000
unseen_document = newsgroups_test.data[num]
# Show the raw text (headers included) before any preprocessing.
print(unseen_document)

From: mrowley@pebbles.es.com (Michael Rowley)
Subject: Re: Command Loss Timer (Re: Galileo Update - 04/22/93)
Keywords: Galileo, JPL
Nntp-Posting-Host: 130.187.85.70
Organization: Design Systems Division, Evans & Sutherland, SLC, UT
Lines: 27

In article <1993Apr26.193924.1189@bnr.ca> jcobban@bnr.ca (Jim Cobban) writes:
>Having read in the past about the fail-safe mechanisms on spacecraft, I had
>assumed that the Command Loss Timer had that sort of function.  However I
>always find disturbing the oxymoron of a "NO-OP" command that does something.
>If the command changes the behavior or status of the spacecraft it is not
>a "NO-OP" command.
>
>Of course this terminology comes from a Jet Propulsion Laboratory which has
>nothing to do with jet propulsion.
>

	I don't know where you got this idea from, JPL's history dates back to 
	to the 1930s when a Caltech professor named Von Karman conducted  
	experiments in rocket PROPULSION with a group of graduate students
	on the present site of t

In [23]:
newsgroups_test.data[:2]


['From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)\nSubject: Need info on 88-89 Bonneville\nOrganization: University at Buffalo\nLines: 10\nNews-Software: VAX/VMS VNEWS 1.41\nNntp-Posting-Host: ubvmsd.cc.buffalo.edu\n\n\n I am a little confused on all of the models of the 88-89 bonnevilles.\nI have heard of the LE SE LSE SSE SSEI. Could someone tell me the\ndifferences are far as features or performance. I am also curious to\nknow what the book value is for prefereably the 89 model. And how much\nless than book value can you usually get them for. In other words how\nmuch are they in demand this time of year. I have heard that the mid-spring\nearly summer is the best time to buy.\n\n\t\t\tNeil Gandler\n',
 'From: Rick Miller <rick@ee.uwm.edu>\nSubject: X-Face?\nOrganization: Just me.\nLines: 17\nDistribution: world\nNNTP-Posting-Host: 129.89.2.33\nSummary: Go ahead... swamp me.  <EEP!>\n\nI\'m not familiar at all with the format of these "X-Face:" thingies, but\nafter seeing them 

In [27]:
# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

# Rank the topics inferred for this document from highest to lowest score.
topic_scores = sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True)
for index, score in topic_scores:
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.5073091983795166	 Topic: 0.012*"line" + 0.012*"subject" + 0.012*"organ" + 0.011*"write" + 0.011*"articl"
Score: 0.22603581845760345	 Topic: 0.010*"peopl" + 0.006*"say" + 0.006*"write" + 0.006*"state" + 0.005*"go"
Score: 0.17304913699626923	 Topic: 0.015*"line" + 0.014*"subject" + 0.013*"organ" + 0.008*"post" + 0.008*"drive"
Score: 0.08842587471008301	 Topic: 0.008*"christian" + 0.008*"write" + 0.008*"think" + 0.008*"peopl" + 0.007*"subject"


In [28]:
print(newsgroups_test.target[num])


14
