In [1]:
import pandas as pd

# Load the headlines CSV, skipping malformed rows instead of raising.
# `on_bad_lines="skip"` is the replacement for `error_bad_lines=False`, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
data = pd.read_csv("abcnews-date-text.csv", on_bad_lines="skip")

# Keep only the headline text of the first 300,000 rows. `.copy()` makes this an
# independent frame, so adding a column below cannot raise SettingWithCopyWarning.
data_text = data[:300000][['headline_text']].copy()

# Expose the row label as a regular column so documents can be looked up by id.
data_text['index'] = data_text.index
documents = data_text

In [2]:
# Display the headline frame, which now carries the extra 'index' column.
data_text

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4
...,...,...
299995,broughton hall audit reveals serious breaches,299995
299996,broughton hall fails key standards,299996
299997,broughton hall safe for residents govt says,299997
299998,burn off at conservation park aims to prevent,299998


In [3]:
# Load gensim and NLTK libraries

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random generator so NumPy-driven randomness is repeatable.
np.random.seed(400)

In [4]:
import nltk
# Add a directory, given relative to the working directory, to NLTK's data search path.
# NOTE(review): presumably this is where the WordNet corpus lives on the author's
# machine — confirm; the path is machine-specific.
nltk.data.path.append('../../../../Python Libs/')

In [5]:
# Sanity-check the WordNet lemmatizer: pos='v' makes it treat the word as a verb.
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


In [7]:
# English Snowball stemmer; `lemmatize_string` reuses this module-level instance.
stemmer = SnowballStemmer("english")

# Sample of inflected words used to illustrate what stemming does.
# ('siezing' is misspelled in the sample list and is kept as-is.)
original_words = [
    'caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
    'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization', 'sensational',
    'traditional', 'reference', 'colonizer', 'plotted',
]

# Stem every word; the result stays aligned with `original_words`.
singles = list(map(stemmer.stem, original_words))

# Side-by-side table of each word and its stem.
pd.DataFrame({"original_words": original_words, "singles": singles})

Unnamed: 0,original_words,singles
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [9]:
def lemmatize_string(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with a
    ``WordNetLemmatizer``; the lemma is then passed through the module-level
    ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize ``text`` and return its normalized tokens as a list.

    Tokens are produced by ``simple_preprocess``. Tokens that appear in
    ``STOPWORDS`` or have three or fewer characters are dropped; every
    remaining token is passed through ``lemmatize_string``.
    """
    return [
        lemmatize_string(tok)
        for tok in simple_preprocess(text)
        if len(tok) > 3 and tok not in STOPWORDS
    ]


In [10]:
# Pick one headline by its id and show it before and after preprocessing.
document_num = 4310
doc_sample = documents[documents['index'] == document_num].values[0][0]

print("Original Document : ")
# Splitting on a single space already yields the list of raw words.
words = doc_sample.split(' ')
print(words)

print("\n\n Tokenized and Lemmatized document : ")
print(preprocess(text=doc_sample))

Original Document : 
['rain', 'helps', 'dampen', 'bushfires']


 Tokenized and Lemmatized document : 
['rain', 'help', 'dampen', 'bushfir']


In [16]:
# Show the raw headline whose 'index' value equals `document_num`
# (first row of the match, first column).
documents[documents['index'] == document_num].values[0][0]

'rain helps dampen bushfires'

In [17]:
# Run `preprocess` over every headline; the result is a Series of token lists.
preprocessed_docs = documents['headline_text'].map(preprocess)

In [18]:
# Inspect the token lists of the first ten headlines.
preprocessed_docs[:10]

0            [decid, communiti, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

##### Bag of Words

In [19]:
from gensim.corpora import Dictionary

In [20]:
# Build the gensim Dictionary (integer id <-> token mapping) from the token lists.
dictionary = Dictionary(preprocessed_docs)

In [21]:
# Peek at the leading (id, token) pairs of the dictionary.
# `count` is the 1-based position of the pair just printed; the loop stops
# once that position exceeds 10, i.e. after the eleventh pair.
count = 0
for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count > 10:
        break

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


In [32]:
# Prune the vocabulary in place (the call's return value is not used).
# Per gensim's documentation for these arguments: drop tokens that occur in fewer
# than 15 documents or in more than 10% of all documents, then keep at most the
# 100000 most frequent remaining tokens.
# NOTE(review): gensim documents that token ids are compacted after pruning, so ids
# printed before this cell may no longer match — confirm against the installed version.
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)

In [33]:
# Convert every token list to bag-of-words form: a list of (token id, count) pairs.
bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]

In [34]:
# Bag-of-words entry stored at position `document_num` in the corpus.
bow_corpus[document_num]

[(71, 1), (107, 1), (462, 1), (3530, 1)]

In [35]:
bow_doc_4310 = bow_corpus[document_num]

# Each entry is a (token id, count) pair; the id is resolved to its token
# through the dictionary for a readable printout.
for token_id, occurrences in bow_doc_4310:
    print("Word {} (\"{}\") appears {} times".format(token_id, dictionary[token_id], occurrences))

Word 71 ("bushfir") appears 1 times
Word 107 ("help") appears 1 times
Word 462 ("rain") appears 1 times
Word 3530 ("dampen") appears 1 times
