In [16]:
import pandas as pd

data = pd.read_csv('nasdaq_labeled_news.csv', error_bad_lines=False)
data_text = data[['News']]
data_text['index'] = data_text.index
documents = data_text



  data = pd.read_csv('nasdaq_labeled_news.csv', error_bad_lines=False)


In [17]:
len(documents)

5000

In [18]:
documents[:5]

Unnamed: 0,News,index
0,A Canadian judge on Monday rejected United Sta...,0
1,"TSMC (2330.TW), the world largest contract chi...",1
2,"Taiwan UMC (2303.TW), the world second-largest...",2
3,Australian law firm Slater Gordon is looking ...,3
4,American Tower (AMT.N) is one of the three fir...,4


### Data Preprocessing

In [19]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [20]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dyush\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Lemmatize example

In [21]:
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


#### Stemmer Example

In [22]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [23]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [24]:
doc_sample = documents[documents['index'] == 4310].values[0][0]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['A', 'federal', 'judge', 'on', 'Monday', 'declined', 'to', 'approve', 'Citigroup', 'Inc', '(C.N)', '$75', 'million', 'settlement', 'with', 'the', 'U.S.', 'Securities', 'and', 'Exchange', 'Commission', 'of', 'charges', 'it', 'misled', 'investors', 'by', 'failing', 'to', 'disclose', 'roughly', '$40', 'billion', 'of', 'subprime', 'mortgages.', 'U.S.', 'District', 'Judge', 'Ellen', 'Huvelle', 'at', 'a', 'hearing', 'asked', 'both', 'sides', 'for', 'more', 'information', 'on', 'the', 'accord', 'before', 'approving', 'it.', 'Another', 'hearing', 'on', 'the', 'matter', 'is', 'expected', 'on', 'Sept.', '24.', '"It', 'a', 'home', 'run', 'for', 'the', 'good', 'guys,"', 'said', 'Richard', 'Greenfield,', 'a', 'lawyer', 'for', 'a', 'Citigroup', 'shareholder,', 'who', 'said', 'he', 'attended', 'the', 'hearing.', '"The', 'bottom', 'line', 'is', 'that', 'the', 'SEC', 'put', 'virtually', 'nothing', 'forward', 'to', 'justify', 'the', 'settlement."', 'SEC', 'spokesman', 'Kevin', 'Call

In [25]:
processed_docs = documents['News'].map(preprocess)

In [26]:
processed_docs[:10]

0    [canadian, judg, monday, reject, unit, state, ...
1    [tsmc, world, largest, contract, chip, maker, ...
2    [taiwan, world, second, largest, contract, chi...
3    [australian, firm, slater, gordon, look, possi...
4    [american, tower, firm, talk, telecom, tower, ...
5    [american, tower, firm, talk, telecom, tower, ...
6    [australia, propos, percent, mine, hurt, cater...
7    [telecom, media, tycoon, richard, privat, equi...
8    [collaps, commerci, real, estat, bubbl, crush,...
9    [follow, main, factor, expect, affect, swiss, ...
Name: News, dtype: object

### Bag of words on the dataset

In [27]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [28]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 affect
1 allan
2 alleg
3 approv
4 argu
5 author
6 base
7 benefit
8 blame
9 block
10 break


In [29]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [30]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(1, 1),
 (2, 3),
 (5, 1),
 (17, 1),
 (20, 2),
 (22, 1),
 (31, 2),
 (34, 1),
 (36, 1),
 (42, 5),
 (46, 1),
 (51, 2),
 (53, 1),
 (69, 1),
 (81, 1),
 (83, 1),
 (86, 1),
 (108, 2),
 (117, 1),
 (125, 1),
 (145, 1),
 (148, 2),
 (164, 1),
 (186, 2),
 (193, 1),
 (196, 2),
 (197, 1),
 (203, 3),
 (209, 3),
 (217, 3),
 (229, 4),
 (279, 1),
 (286, 1),
 (296, 1),
 (302, 1),
 (334, 1),
 (338, 1),
 (341, 4),
 (366, 1),
 (374, 1),
 (379, 1),
 (383, 1),
 (388, 1),
 (391, 2),
 (413, 1),
 (426, 2),
 (428, 1),
 (440, 1),
 (454, 1),
 (462, 1),
 (513, 3),
 (526, 1),
 (538, 1),
 (549, 9),
 (579, 1),
 (595, 1),
 (598, 1),
 (610, 1),
 (612, 1),
 (624, 1),
 (647, 1),
 (697, 1),
 (725, 1),
 (745, 1),
 (776, 1),
 (779, 1),
 (857, 1),
 (863, 1),
 (899, 1),
 (909, 1),
 (926, 2),
 (964, 1),
 (966, 1),
 (980, 1),
 (993, 1),
 (1044, 4),
 (1084, 1),
 (1112, 1),
 (1114, 1),
 (1208, 1),
 (1252, 1),
 (1257, 1),
 (1370, 1),
 (1404, 1),
 (1410, 1),
 (1431, 1),
 (1436, 1),
 (1441, 1),
 (1519, 1),
 (1542, 1),
 (1610, 1),
 (1

In [31]:
bow_doc_4310 = bow_corpus[4310]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))

Word 1 ("alleg") appears 1 time.
Word 2 ("approv") appears 3 time.
Word 5 ("base") appears 1 time.
Word 17 ("corp") appears 1 time.
Word 20 ("court") appears 2 time.
Word 22 ("decis") appears 1 time.
Word 31 ("feder") appears 2 time.
Word 34 ("gari") appears 1 time.
Word 36 ("govern") appears 1 time.
Word 42 ("judg") appears 5 time.
Word 46 ("lawyer") appears 1 time.
Word 51 ("monday") appears 2 time.
Word 53 ("nation") appears 1 time.
Word 69 ("reject") appears 1 time.
Word 81 ("sue") appears 1 time.
Word 83 ("takeov") appears 1 time.
Word 86 ("time") appears 1 time.
Word 108 ("chief") appears 2 time.
Word 117 ("financi") appears 1 time.
Word 125 ("jonathan") appears 1 time.
Word 145 ("second") appears 1 time.
Word 148 ("sharehold") appears 2 time.
Word 164 ("exchang") appears 1 time.
Word 186 ("billion") appears 2 time.
Word 193 ("debt") appears 1 time.
Word 196 ("exposur") appears 2 time.
Word 197 ("fail") appears 1 time.
Word 203 ("hear") appears 3 time.
Word 209 ("investor") appea

### TF-IDF

In [32]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [33]:
corpus_tfidf = tfidf[bow_corpus]

In [34]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.04740335979405965),
 (1, 0.05981785536616545),
 (2, 0.07933472162027505),
 (3, 0.06999493037504778),
 (4, 0.047814003040763386),
 (5, 0.02032364784655052),
 (6, 0.09704768233305229),
 (7, 0.08004216311084526),
 (8, 0.05548383532177425),
 (9, 0.10750941390799126),
 (10, 0.10209032644900137),
 (11, 0.2768492942605557),
 (12, 0.037153576412417455),
 (13, 0.09899735559954259),
 (14, 0.057175445056947494),
 (15, 0.08318161323828102),
 (16, 0.07363838083901733),
 (17, 0.017601985012046104),
 (18, 0.03187734923367338),
 (19, 0.0786892340082342),
 (20, 0.1388888150158044),
 (21, 0.053960478695264526),
 (22, 0.18248120320221692),
 (23, 0.03334333815919392),
 (24, 0.06522068739286731),
 (25, 0.044131916726854147),
 (26, 0.05951726211655632),
 (27, 0.06479483321574765),
 (28, 0.03898947143996813),
 (29, 0.050293143888028656),
 (30, 0.068587542260898),
 (31, 0.07917207121704016),
 (32, 0.14881005762315033),
 (33, 0.1464517282369123),
 (34, 0.05981785536616545),
 (35, 0.07845063724685906),
 

### Running LDA using Bag of Words

In [35]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [43]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.035*"govern" + 0.024*"open" + 0.018*"coast" + 0.017*"tasmanian" + 0.017*"gold" + 0.014*"australia" + 0.013*"beat" + 0.010*"win" + 0.010*"ahead" + 0.009*"shark"
Topic: 1 
Words: 0.023*"world" + 0.014*"final" + 0.013*"record" + 0.012*"break" + 0.011*"lose" + 0.011*"australian" + 0.011*"leagu" + 0.011*"test" + 0.010*"australia" + 0.010*"hill"
Topic: 2 
Words: 0.018*"rural" + 0.018*"council" + 0.015*"fund" + 0.014*"plan" + 0.013*"health" + 0.012*"chang" + 0.011*"nation" + 0.010*"price" + 0.010*"servic" + 0.009*"say"
Topic: 3 
Words: 0.025*"elect" + 0.022*"adelaid" + 0.012*"perth" + 0.011*"take" + 0.011*"say" + 0.010*"labor" + 0.010*"turnbul" + 0.009*"vote" + 0.009*"royal" + 0.009*"time"
Topic: 4 
Words: 0.032*"court" + 0.022*"face" + 0.020*"charg" + 0.020*"home" + 0.018*"tasmania" + 0.017*"murder" + 0.015*"trial" + 0.012*"accus" + 0.012*"abus" + 0.012*"child"
Topic: 5 
Words: 0.024*"countri" + 0.021*"hour" + 0.020*"australian" + 0.019*"warn" + 0.016*"live" + 0.013*"indig

Cool! Can you distinguish different topics using the words in each topic and their corresponding weights?

### Running LDA using TF-IDF

In [36]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [37]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.006*"share" + 0.006*"cent" + 0.005*"million" + 0.005*"quarter" + 0.004*"refineri" + 0.004*"revenu" + 0.003*"earn" + 0.003*"barrel" + 0.003*"expect" + 0.003*"gulf"
Topic: 1 Word: 0.005*"bank" + 0.003*"note" + 0.003*"billion" + 0.003*"sale" + 0.003*"share" + 0.003*"million" + 0.003*"sourc" + 0.003*"quarter" + 0.003*"gulf" + 0.003*"spill"
Topic: 2 Word: 0.004*"share" + 0.004*"ford" + 0.004*"bank" + 0.003*"quarter" + 0.003*"sale" + 0.003*"million" + 0.003*"cent" + 0.003*"billion" + 0.003*"fund" + 0.003*"profit"
Topic: 3 Word: 0.008*"cent" + 0.007*"share" + 0.007*"quarter" + 0.005*"million" + 0.005*"revenu" + 0.005*"earn" + 0.005*"profit" + 0.004*"analyst" + 0.004*"expect" + 0.004*"rise"
Topic: 4 Word: 0.006*"share" + 0.005*"quarter" + 0.005*"cent" + 0.005*"million" + 0.004*"earn" + 0.004*"revenu" + 0.004*"expect" + 0.003*"profit" + 0.003*"analyst" + 0.003*"billion"
Topic: 5 Word: 0.005*"share" + 0.004*"quarter" + 0.004*"cent" + 0.004*"million" + 0.003*"sale" + 0.003*"loss"

### Classification of the topics

### Performance evaluation by classifying sample document using LDA Bag of Words model

In [38]:
processed_docs[4310]

['feder',
 'judg',
 'monday',
 'declin',
 'approv',
 'citigroup',
 'million',
 'settlement',
 'secur',
 'exchang',
 'commiss',
 'charg',
 'mislead',
 'investor',
 'fail',
 'disclos',
 'rough',
 'billion',
 'subprim',
 'mortgag',
 'district',
 'judg',
 'ellen',
 'huvell',
 'hear',
 'ask',
 'side',
 'inform',
 'accord',
 'approv',
 'hear',
 'matter',
 'expect',
 'sept',
 'home',
 'good',
 'guy',
 'say',
 'richard',
 'greenfield',
 'lawyer',
 'citigroup',
 'sharehold',
 'say',
 'attend',
 'hear',
 'line',
 'virtual',
 'forward',
 'justifi',
 'settlement',
 'spokesman',
 'kevin',
 'callahan',
 'say',
 'agenc',
 'provid',
 'court',
 'inform',
 'request',
 'citigroup',
 'spokeswoman',
 'molli',
 'meiner',
 'say',
 'york',
 'base',
 'bank',
 'answer',
 'judg',
 'question',
 'concern',
 'matter',
 'announc',
 'propos',
 'settlement',
 'juli',
 'say',
 'citigroup',
 'underst',
 'exposur',
 'subprim',
 'mortgag',
 'billion',
 'nation',
 'hous',
 'crisi',
 'earli',
 'stag',
 'sue',
 'citigroup',


In [39]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9921139478683472	 
Topic: 0.012*"million" + 0.010*"insur" + 0.008*"file" + 0.008*"court" + 0.007*"york" + 0.006*"lilli" + 0.006*"financi" + 0.006*"accord" + 0.006*"group" + 0.005*"unit"


Our test document has the highest probability to be part of the topic on the top.

### Performance evaluation by classifying sample document using LDA TF-IDF model

In [40]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.72900390625	 
Topic: 0.005*"index" + 0.004*"point" + 0.004*"bank" + 0.003*"share" + 0.003*"sourc" + 0.003*"china" + 0.002*"billion" + 0.002*"goldman" + 0.002*"group" + 0.002*"european"

Score: 0.1399843394756317	 
Topic: 0.004*"share" + 0.004*"ford" + 0.004*"bank" + 0.003*"quarter" + 0.003*"sale" + 0.003*"million" + 0.003*"cent" + 0.003*"billion" + 0.003*"fund" + 0.003*"profit"

Score: 0.1264623999595642	 
Topic: 0.006*"bank" + 0.003*"million" + 0.003*"share" + 0.003*"billion" + 0.003*"manag" + 0.003*"quarter" + 0.003*"sale" + 0.003*"invest" + 0.003*"hire" + 0.002*"asset"


Our test document has the highest probability to be part of the topic on the top.

### Testing model on unseen document

In [41]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.5905829071998596	 Topic: 0.012*"billion" + 0.012*"bank" + 0.010*"million" + 0.009*"share" + 0.009*"group"
Score: 0.24930903315544128	 Topic: 0.010*"bank" + 0.008*"court" + 0.007*"airlin" + 0.006*"case" + 0.006*"expect"
Score: 0.020018169656395912	 Topic: 0.017*"bank" + 0.017*"billion" + 0.016*"million" + 0.010*"quarter" + 0.009*"share"
Score: 0.020015809684991837	 Topic: 0.006*"share" + 0.006*"drug" + 0.006*"state" + 0.006*"trade" + 0.005*"plan"
Score: 0.020013874396681786	 Topic: 0.010*"bank" + 0.009*"million" + 0.009*"billion" + 0.009*"group" + 0.009*"share"
Score: 0.020013414323329926	 Topic: 0.010*"billion" + 0.010*"share" + 0.008*"million" + 0.008*"plan" + 0.007*"stock"
Score: 0.020012380555272102	 Topic: 0.012*"million" + 0.010*"insur" + 0.008*"file" + 0.008*"court" + 0.007*"york"
Score: 0.020011989399790764	 Topic: 0.009*"gulf" + 0.008*"manag" + 0.008*"bank" + 0.008*"mexico" + 0.008*"oper"
Score: 0.02001143805682659	 Topic: 0.022*"share" + 0.017*"million" + 0.011*"quart