In [1]:
# Load the 20 Newsgroups corpus, keeping its predefined train/test split
from sklearn.datasets import fetch_20newsgroups
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, shuffle=True)
    for split in ('train', 'test')
)

In [2]:
# Show the newsgroup category names that the integer targets refer to
print(list(newsgroups_train.target_names))

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [3]:
# Peek at the first two raw posts (headers included) to see what needs cleaning
newsgroups_train.data[:2]

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [4]:
# Sanity check: the file list and the target array should have the same length
print(newsgroups_train.filenames.shape, newsgroups_train.target.shape)

(11314,) (11314,)


In [7]:
from functools import lru_cache

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np

In [8]:
# WordNetLemmatizer (used in lemmatize_stemming) needs the WordNet corpus on disk
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gauravtiwari\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Preprocessing

In [9]:
@lru_cache(maxsize=None)
def _stemming_tools():
    """Build the Snowball stemmer and WordNet lemmatizer once and reuse them."""
    return SnowballStemmer("english"), WordNetLemmatizer()


def lemmatize_stemming(text):
    """Reduce a single token to its stemmed verb lemma.

    The token is first lemmatized as a verb (pos='v') with WordNet, and the
    resulting lemma is then stemmed with the English Snowball stemmer.

    Parameters
    ----------
    text : str
        One token, as produced by the tokenizer in ``preprocess``.

    Returns
    -------
    str
        The stemmed lemma.
    """
    # This runs once per token of the corpus, so the stemmer and lemmatizer
    # are fetched from the cached helper instead of being rebuilt on each call.
    stemmer, lemmatizer = _stemming_tools()
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize ``text`` and return the normalized tokens worth keeping.

    Tokens that are gensim stopwords, or that have three characters or
    fewer, are discarded; every remaining token is passed through
    ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]

In [10]:
# Tokenize and normalize every training post up front
processed_docs = [preprocess(doc) for doc in newsgroups_train.data]

In [11]:
# Inspect the token lists produced for the first three posts
print(processed_docs[:3])

[['lerxst', 'thing', 'subject', 'nntp', 'post', 'host', 'organ', 'univers', 'maryland', 'colleg', 'park', 'line', 'wonder', 'enlighten', 'door', 'sport', 'look', 'late', 'earli', 'call', 'bricklin', 'door', 'small', 'addit', 'bumper', 'separ', 'rest', 'bodi', 'know', 'tellm', 'model', 'engin', 'spec', 'year', 'product', 'histori', 'info', 'funki', 'look', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst'], ['guykuo', 'carson', 'washington', 'subject', 'clock', 'poll', 'final', 'summari', 'final', 'clock', 'report', 'keyword', 'acceler', 'clock', 'upgrad', 'articl', 'shelley', 'qvfo', 'innc', 'organ', 'univers', 'washington', 'line', 'nntp', 'post', 'host', 'carson', 'washington', 'fair', 'number', 'brave', 'soul', 'upgrad', 'clock', 'oscil', 'share', 'experi', 'poll', 'send', 'brief', 'messag', 'detail', 'experi', 'procedur', 'speed', 'attain', 'rat', 'speed', 'card', 'adapt', 'heat', 'sink', 'hour', 'usag', 'floppi', 'disk', 'function', 'floppi', 'especi', 'request', 'summar', 'day',

### Bag of words in the dataset

In [12]:
# Build a gensim Dictionary from 'processed_docs': it assigns an integer id to every distinct token
# in the training set; those ids are what doc2bow emits when the documents are encoded
dictionary = gensim.corpora.Dictionary(processed_docs)

In [13]:
# Encode every tokenized document as a bag of words: a list of (token_id, token_count) 2-tuples,
# where token_id is the word's index in the dictionary
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [15]:
# Look at the (token_id, token_count) pairs of the first two documents
bow_corpus[:2]

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 2),
  (18, 1),
  (19, 2),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1)],
 [(13, 1),
  (18, 1),
  (24, 1),
  (25, 1),
  (27, 1),
  (34, 1),
  (36, 1),
  (38, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 2),
  (51, 5),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 2),
  (57, 1),
  (58, 2),
  (59, 2),
  (60, 1),
  (61, 2),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 3),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 2

In [16]:
document_num = 20
bow_doc_x = bow_corpus[document_num]

# Each bag-of-words entry is a (token_id, count) pair; map the id back to its word for display
for word_id, count in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(word_id, dictionary[word_id], count))

Word 13 ("host") appears 1 time.
Word 18 ("line") appears 1 time.
Word 24 ("nntp") appears 1 time.
Word 25 ("organ") appears 1 time.
Word 27 ("post") appears 1 time.
Word 29 ("rest") appears 1 time.
Word 34 ("subject") appears 1 time.
Word 37 ("thing") appears 5 time.
Word 115 ("give") appears 1 time.
Word 128 ("like") appears 1 time.
Word 138 ("peopl") appears 1 time.
Word 212 ("write") appears 1 time.
Word 221 ("clear") appears 1 time.
Word 338 ("say") appears 1 time.
Word 386 ("think") appears 1 time.
Word 437 ("refer") appears 1 time.
Word 452 ("true") appears 1 time.
Word 504 ("technolog") appears 1 time.
Word 564 ("christian") appears 1 time.
Word 582 ("exampl") appears 1 time.
Word 607 ("jew") appears 1 time.
Word 612 ("lead") appears 1 time.
Word 615 ("littl") appears 3 time.
Word 664 ("wors") appears 2 time.
Word 920 ("keith") appears 3 time.
Word 933 ("punish") appears 1 time.
Word 1016 ("california") appears 1 time.
Word 1083 ("institut") appears 1 time.
Word 1147 ("similar"

### Running LDA on the bag of words

In [17]:
# num_topics is the number of latent topics to extract from the training corpus.
# id2word is a mapping from word ids (integers) to words (strings).
# workers is the number of worker processes used for parallelization
# (when omitted, gensim picks a value based on the number of CPU cores).
# passes is the number of training passes through the corpus.
# random_state seeds the otherwise random initialization, so that re-running the notebook
# trains comparable topics. NOTE(review): with several workers, scheduling may still cause
# small run-to-run differences - confirm if exact reproducibility is required.
LDA_RANDOM_STATE = 42
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=8,
                                       id2word=dictionary,
                                       passes=10,
                                       workers=2,
                                       random_state=LDA_RANDOM_STATE)

In [18]:
# For every topic, show the words occurring in it together with their relative weights
for topic_id, topic_terms in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(topic_id, topic_terms))
    print("\n")

Topic: 0 
Words: 0.007*"encrypt" + 0.006*"line" + 0.006*"subject" + 0.005*"govern" + 0.005*"secur" + 0.005*"chip" + 0.005*"write" + 0.005*"israel" + 0.005*"organ" + 0.005*"isra"


Topic: 1 
Words: 0.012*"game" + 0.010*"team" + 0.007*"play" + 0.007*"line" + 0.007*"year" + 0.007*"subject" + 0.006*"organ" + 0.006*"player" + 0.005*"hockey" + 0.005*"think"


Topic: 2 
Words: 0.009*"write" + 0.007*"organ" + 0.007*"line" + 0.007*"articl" + 0.007*"subject" + 0.007*"peopl" + 0.005*"like" + 0.004*"post" + 0.004*"know" + 0.004*"think"


Topic: 3 
Words: 0.016*"line" + 0.015*"subject" + 0.015*"organ" + 0.012*"write" + 0.010*"articl" + 0.009*"post" + 0.008*"univers" + 0.006*"host" + 0.006*"nntp" + 0.006*"like"


Topic: 4 
Words: 0.014*"line" + 0.013*"organ" + 0.013*"subject" + 0.011*"write" + 0.010*"post" + 0.010*"articl" + 0.009*"host" + 0.008*"nntp" + 0.006*"univers" + 0.006*"like"


Topic: 5 
Words: 0.011*"space" + 0.009*"nasa" + 0.005*"work" + 0.004*"organ" + 0.004*"program" + 0.004*"line" + 0.

### Data processing on unseen document

In [19]:
# Pick one post from the held-out test split; 'num' is reused later to look up its true label
num = 100
unseen_document = newsgroups_test.data[num]
print(unseen_document)

Subject: help
From: C..Doelle@p26.f3333.n106.z1.fidonet.org (C. Doelle)
Lines: 13

Hello All!

    It is my understanding that all True-Type fonts in Windows are loaded in
prior to starting Windows - this makes getting into Windows quite slow if you
have hundreds of them as I do.  First off, am I correct in this thinking -
secondly, if that is the case - can you get Windows to ignore them on boot and
maybe make something like a PIF file to load them only when you enter the
applications that need fonts?  Any ideas?


Chris

 * Origin: chris.doelle.@f3333.n106.z1.fidonet.org (1:106/3333.26)



In [20]:
# Encode the unseen post with the same preprocessing and dictionary used for training
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

# Rank the inferred topics from most to least probable
topic_scores = sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True)
for index, score in topic_scores:
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.901461124420166	 Topic: 0.013*"line" + 0.012*"subject" + 0.010*"organ" + 0.010*"window" + 0.008*"file"
Score: 0.08107218891382217	 Topic: 0.010*"peopl" + 0.008*"think" + 0.007*"say" + 0.007*"know" + 0.006*"write"


In [21]:
# Look up the true newsgroup of the unseen post to compare against the inferred topics
print(newsgroups_test.target_names[newsgroups_test.target[num]])

comp.os.ms-windows.misc


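The model assigns the unseen document, with about 90% probability, to the topic whose top terms include "window" and "file". This is consistent with the document's true category, comp.os.ms-windows.misc. Note that LDA is unsupervised: it infers a topic mixture rather than predicting a newsgroup label, so the match is judged by comparing the topic's top words with the true category.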