##### Exercise 12.1

In [2]:
import os
import nltk
import numpy as np
import scipy
import gensim
from scipy.spatial.distance import euclidean

In [3]:
#%% Getting a list of directory contents
def gettextlist(directory_path):
    """Split the immediate contents of a directory into three name lists.

    Parameters
    ----------
    directory_path : str
        Directory to list. Subdirectories are reported but not descended into.

    Returns
    -------
    tuple of (list, list, list)
        ``(directory_textfiles, directory_nontextfiles, directory_nonfiles)``.
        Each list holds bare entry names (not full paths), in the order
        returned by ``os.listdir``:

        * files whose name ends in ``.txt``
        * all other files
        * entries that are not regular files (e.g. subdirectories)
    """
    directory_textfiles = []
    directory_nontextfiles = []
    directory_nonfiles = []
    for contentitem in os.listdir(directory_path):
        temp_fullpath = os.path.join(directory_path, contentitem)
        if not os.path.isfile(temp_fullpath):
            # Non-files (e.g. subdirectories) are stored separately
            directory_nonfiles.append(contentitem)
        elif contentitem.endswith('.txt'):
            # Test the entry name's suffix only: searching the whole path for
            # '.txt' would also match a '.txt' inside the directory path or in
            # the middle of a file name (e.g. 'notes.txt.bak').
            directory_textfiles.append(contentitem)
        else:
            directory_nontextfiles.append(contentitem)
    return (directory_textfiles, directory_nontextfiles, directory_nonfiles)

In [4]:
#%% Basic file crawler
def basicfilecrawler(directory_path):
    """Read the contents of the non-``.txt`` files directly inside a directory.

    Only the second list returned by ``gettextlist`` (files not classified as
    ``.txt``) is read; ``.txt`` files and subdirectories are ignored.

    Parameters
    ----------
    directory_path : str
        Directory to crawl (not recursive).

    Returns
    -------
    tuple of (list, list)
        ``(crawled_filenames, mycrawled_texts)``: parallel lists with the name
        of each file read and its full text content.
    """
    crawled_filenames = []
    mycrawled_texts = []
    # gettextlist returns (text files, non-text files, non-files); this basic
    # crawler reads the non-text files and does not handle subdirectories.
    directory_nontextfiles = gettextlist(directory_path)[1]
    for contentitem in directory_nontextfiles:
        temp_fullpath = os.path.join(directory_path, contentitem)
        # The file is decoded with the platform default encoding (unchanged).
        # The context manager closes the handle even if read() raises.
        with open(temp_fullpath, 'r') as temp_file:
            temp_text = temp_file.read()
        # Store the read filename and content
        crawled_filenames.append(contentitem)
        mycrawled_texts.append(temp_text)
    return (crawled_filenames, mycrawled_texts)

In [5]:
# Crawl the '20news' directory (path relative to the working directory);
# the two results are parallel lists of file names and file contents.
crawled_filenames, mycrawled_texts = basicfilecrawler('20news')

In [7]:
# Peek at the first four crawled file names
crawled_filenames[0:4]

['100521', '101551', '101552', '101553']

In [8]:
# Show the full text of the crawled file at index 1
mycrawled_texts[1]

"Path: cantaloupe.srv.cs.cmu.edu!das-news.harvard.edu!ogicse!uwm.edu!wupost!uunet!brunix!cs.brown.edu!cs012055\nFrom: cs012055@cs.brown.edu (Hok-Chung Tsang)\nNewsgroups: rec.autos\nSubject: Re: Saturn's Pricing Policy\nMessage-ID: <1993Apr5.230808.581@cs.brown.edu>\nDate: 5 Apr 93 23:08:08 GMT\nArticle-I.D.: cs.1993Apr5.230808.581\nReferences: <C4oxwp.KKM@news.cso.uiuc.edu> <C4vIr5.L3r@shuksan.ds.boeing.com>\nSender: news@cs.brown.edu\nOrganization: Brown Computer Science Dept.\nLines: 51\n\nIn article <C4vIr5.L3r@shuksan.ds.boeing.com>, fredd@shuksan (Fred Dickey) writes:\n|> CarolinaFan@uiuc (cka52397@uxa.cso.uiuc.edu) wrote:\n|> : \tI have been active in defending Saturn lately on the net and would\n|> : like to state my full opinion on the subject, rather than just reply to others'\n|> : points.\n|> : \t\n|> : \tThe biggest problem some people seem to be having is that Saturn\n|> : Dealers make ~$2K on a car.  I think most will agree with me that the car is\n|> : comparably priced

In [None]:
# Exclude header lines from each message
excludedlinemarkers=['Xref:','Path:','From:','Newsgroups:','Subject:','Summary:',
                     'Keywords:','Message-ID:','Date:','Expires:','Followup-To:','Distribution:',
                     'Organization:','Approved:','Supersedes:','Lines:','NNTP-Posting-Host:',
                     'References:','Sender:','In-Reply-To:','Article-I.D.:','Reply-To:',
                     'Nntp-Posting-Host:', 'Newsgroup', 'document_id']

# str.startswith accepts a tuple of prefixes, so one call replaces the manual
# per-marker length check and slice comparison.
excluded_prefixes = tuple(excludedlinemarkers)

# Each text is replaced in place by a copy without the matching lines.
# Note that ANY line starting with a marker is dropped, not only lines in the
# header section at the top of the message.
for k in range(len(mycrawled_texts)):
    remaininglines = [line for line in mycrawled_texts[k].splitlines()
                      if not line.startswith(excluded_prefixes)]
    mycrawled_texts[k] = '\n'.join(remaininglines)

In [11]:
# Normalise whitespace in every crawled text: split() without arguments drops
# leading/trailing whitespace and treats each run of whitespace (including
# newlines) as one separator, so re-joining with ' ' yields single-spaced text.
downloaded_texts = [' '.join(crawled_text.split()) for crawled_text in mycrawled_texts]

In [13]:
# Number of cleaned documents
len(downloaded_texts)

4000

In [14]:
# Tokenize each cleaned document and wrap the token list in an nltk.Text
# (word_tokenize presumably needs the NLTK tokenizer models installed; they
# are not downloaded in this notebook)
mydownloaded_nltk_texts = [
    nltk.Text(nltk.word_tokenize(document_text))
    for document_text in downloaded_texts
]

In [15]:
# Make all downloaded texts lowercase
# Each document is stored as an nltk.Text, like the tokenized and lemmatized
# versions. Previously the nltk.Text was assigned to a misspelled variable and
# the plain token list was appended instead.
mydownloaded_lowercase_texts = []
for k in range(len(mydownloaded_nltk_texts)):
    temp_lowercase_text = [word.lower() for word in mydownloaded_nltk_texts[k]]
    mydownloaded_lowercase_texts.append(nltk.Text(temp_lowercase_text))

In [17]:
# Convert a POS tag for WordNet
def tagtowordnet(postag):
    """Translate a POS tag into the single-letter POS code passed to WordNet.

    Only the first character of ``postag`` is inspected:
    ``'N'`` -> ``'n'``, ``'V'`` -> ``'v'``, ``'J'`` -> ``'a'``, ``'R'`` -> ``'r'``.

    Parameters
    ----------
    postag : str
        Non-empty POS tag (e.g. ``'NN'``, ``'VBD'``).

    Returns
    -------
    str or int
        The matching one-letter code, or the sentinel ``-1`` when the first
        character is none of the four handled letters.
    """
    first_letter_to_wordnet = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}
    return first_letter_to_wordnet.get(postag[0], -1)

In [19]:
# POS tag and lemmatize the loaded texts
# Fetch the tagger and WordNet resources; nltk.download reports when a
# package is already up to date.
for nltk_resource in ('averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)

# Shared WordNet lemmatizer instance used by lemmatizetext below
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatizetext(nltktexttolemmatize):
    """POS-tag a token sequence and lemmatize every token.

    Each token is tagged with ``nltk.pos_tag``; the tag is converted with
    ``tagtowordnet`` and, when a WordNet code is available, the token is
    lemmatized with the module-level ``lemmatizer``. Tokens whose tag has no
    WordNet code (``tagtowordnet`` returned ``-1``) are kept unchanged.

    Parameters
    ----------
    nltktexttolemmatize : sequence of str
        Tokens of one document.

    Returns
    -------
    list of str
        One output token per input token, in the same order.
    """
    lemmatizedtext = []
    for word, postag in nltk.pos_tag(nltktexttolemmatize):
        wordnettag = tagtowordnet(postag)
        if wordnettag == -1:
            # No usable WordNet POS: keep the token as it is
            lemmatizedtext.append(word)
        else:
            lemmatizedtext.append(lemmatizer.lemmatize(word, wordnettag))
    return lemmatizedtext

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tunde\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tunde\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# Lemmatize every lowercased document and store the result as an nltk.Text
mydownloaded_lemmatizedtexts = [
    nltk.Text(lemmatizetext(lowercase_text))
    for lowercase_text in mydownloaded_lowercase_texts
]

In [21]:
# Build a vocabulary for each document.
# np.unique(..., return_inverse=True) returns the sorted unique tokens and,
# for every token position in the document, its index into that unique array.
myvocabularies = []
myindices_in_vocabularies = []
for document_tokens in mydownloaded_lemmatizedtexts:
    document_vocabulary, token_positions = np.unique(document_tokens, return_inverse=True)
    myvocabularies.append(document_vocabulary)
    myindices_in_vocabularies.append(token_positions)

In [22]:
# Unify the vocabularies.
# Concatenate the per-document vocabularies in document order ...
tempvocabulary = [word for vocabulary in myvocabularies for word in vocabulary]

# ... then take the unique words across all of them. wordindices maps every
# position in the concatenated list to its index in unifiedvocabulary.
unifiedvocabulary, wordindices = np.unique(tempvocabulary, return_inverse=True)

In [23]:
# Re-express each document's token indices in terms of the unified vocabulary.
# vocabularystart tracks where the current document's vocabulary begins inside
# the concatenated vocabulary that wordindices refers to.
vocabularystart = 0
myindices_in_unifiedvocabulary = []
for document_vocabulary, local_indices in zip(myvocabularies, myindices_in_vocabularies):
    # local index -> position in the concatenated list -> unified index
    concatenated_positions = np.array(local_indices) + vocabularystart
    myindices_in_unifiedvocabulary.append(wordindices[concatenated_positions])
    vocabularystart += len(document_vocabulary)

In [24]:
# total count of each unique word over all the downloaded documents
# Kept as a (vocabulary size, 1) float column, the shape later cells index into.
unifiedvocabulary_totaloccurrencecounts = np.zeros((len(unifiedvocabulary), 1))

# count occurrences
# np.add.at does an unbuffered in-place add, so an index that appears several
# times in one document is counted once per appearance. Accumulating straight
# into a 1-D view of the total avoids allocating a dense per-document count
# vector and looping over tokens in Python.
total_counts_view = unifiedvocabulary_totaloccurrencecounts[:, 0]
for document_word_indices in myindices_in_unifiedvocabulary:
    np.add.at(total_counts_view, document_word_indices, 1)

In [25]:
# Rank the unified vocabulary by total occurrence count, largest first
# (argsort of the negated counts), then show the 100 most frequent words
# followed by their counts.
highest_totaloccurrences_indices = np.argsort(-unifiedvocabulary_totaloccurrencecounts, axis=0)
top100_indices = highest_totaloccurrences_indices[0:100]
print(np.squeeze(unifiedvocabulary[top100_indices]))
print(np.squeeze(unifiedvocabulary_totaloccurrencecounts[top100_indices]))

['>' 'the' ',' '.' '--' 'be' 'a' 'to' 'i' ')' 'in' 'and' '(' 'of' ':' '@'
 'have' 'that' 'it' 'you' '!' 'do' 'for' '?' "'s" 'on' '|' "n't" '-'
 'this' 'with' 'but' 'he' "''" 'not' '0' 'they' '...' '1' 'as' '<' '``'
 'at' 'if' 'get' 'write' 'or' 'my' '2' 'article' 'game' 'go' 'would' '#'
 'can' 'about' 'one' 'will' 'all' 'there' 'an' 'what' 'from' 'out' 'so'
 '*' 'good' 'car' 'like' 'year' 'by' 'think' 'me' 'team' 'up' 'his' '%'
 'just' 'when' '3' 'make' 'more' 'no' 'your' 'know' 'any' 'say' 'who'
 'than' 'we' '4' 'time' 'some' 'play' 'well' 'see' 'how' 'player' "'m"
 'only']
[40640. 40004. 39497. 39447. 35859. 27283. 17906. 17356. 16024. 14883.
 14093. 14011. 13692. 13179. 11444.  9901.  9497.  9222.  8767.  7162.
  7079.  7035.  7012.  6686.  6181.  6043.  5925.  5222.  4878.  4748.
  4509.  4285.  4178.  4150.  4000.  3977.  3935.  3868.  3795.  3751.
  3648.  3629.  3567.  3525.  3266.  3261.  3238.  3217.  2837.  2745.
  2726.  2692.  2684.  2518.  2408.  2392.  2371.  2306.  2274.

In [26]:
#%% Vocabulary pruning
# pruningdecisions[k] == 1 marks word k of the unified vocabulary for removal.
nltkstopwords = nltk.corpus.stopwords.words('english')
pruningdecisions = np.zeros((len(unifiedvocabulary),1))

# Loop-invariant lookups, built once as sets so each membership test is O(1)
# instead of re-slicing and scanning an array for every vocabulary word.
stopword_set = set(nltkstopwords)
top_frequent_count = int(np.floor(len(unifiedvocabulary)*0.01))
top_frequent_indices = set(
    highest_totaloccurrences_indices[0:top_frequent_count].ravel().tolist())

for k in range(len(unifiedvocabulary)):
    # Rule 1: check the nltk stop word list
    if (unifiedvocabulary[k] in stopword_set):
        pruningdecisions[k] = 1
    # Rule 2: if the word is too short
    if (len(unifiedvocabulary[k]) < 3):
        pruningdecisions[k] = 1
    # Rule 3: if the word is too long
    if (len(unifiedvocabulary[k]) > 20):
        pruningdecisions[k] = 1
    # Rule 4: if the word is in the top 1% of frequent words
    if (k in top_frequent_indices):
        pruningdecisions[k] = 1
    # Rule 5: if the word occurs less than 4 times
    if(unifiedvocabulary_totaloccurrencecounts[k] < 4):
        pruningdecisions[k] = 1

In [27]:
print('Top-100 words after pruning the unified vocabulary:\n')
# Indices (into the unified vocabulary) of the words that survived pruning
kept_mask = (pruningdecisions == 0)
remaining_indices = np.squeeze(np.where(kept_mask)[0])
remaining_vocabulary = unifiedvocabulary[remaining_indices]
remainingvocabulary_totaloccurrencecounts = unifiedvocabulary_totaloccurrencecounts[remaining_indices]
# Rank the surviving words by total count, largest first, and show the top 100
# words followed by their counts
remaining_highest_totaloccurrences_indices = np.argsort(-remainingvocabulary_totaloccurrencecounts, axis=0)
remaining_top100_indices = remaining_highest_totaloccurrences_indices[0:100]
print(np.squeeze(remaining_vocabulary[remaining_top100_indices]))
print(np.squeeze(remainingvocabulary_totaloccurrencecounts[remaining_top100_indices]))

Top-100 words after pruning the unified vocabulary:

['ago' 'cost' 'large' 'city' 'mind' 'total' 'mention' 'brave' 'man' 'ford'
 'penalty' 'final' 'anyway' 'bill' 'coach' 'idea' 'saw' 'performance'
 'beat' 'version' 'rule' 'montreal' 'hitter' 'seat' 'group' 'friend' 'ice'
 'today' 'face' 'although' 'order' 'almost' 'comment' 'minute' 'stats'
 'bos' 'hold' 'det' 'follow' 'job' 'american' 'insurance' 'month' 'smith'
 'past' 'local' 'easy' 'honda' 'tie' 'cal' 'news' 'hell' 'x-newsreader'
 'wait' 'ticket' 'bring' 'jet' 'walk' 'hope' 'helmet' 'rider' 'joe'
 'stuff' 'morris' 'van' 'left' 'compare' 'note' 'word' 'york' 'experience'
 'center' 'flyer' 'information' 'add' 'break' 'defense' 'puck' 'set' 'pen'
 'rear' 'contact' 'netcom.com' 'others' 'e-mail' 'later' 'young' 'claim'
 'late' 'whether' 'tor' 'design' 'matter' 'pit' 'life' 'pull' 'decide'
 'instead' 'open' 'difference']
[247. 246. 246. 246. 245. 244. 244. 243. 242. 241. 240. 240. 236. 232.
 231. 230. 229. 229. 228. 225. 223. 223. 223.

In [28]:
#%% Map unified-vocabulary indices to pruned-vocabulary indices
# oldtopruned[k] is the position of word k in the pruned vocabulary, or -1 if
# word k was pruned. Surviving words are numbered consecutively from 0.
oldtopruned = []
next_pruned_index = 0
for k in range(len(unifiedvocabulary)):
    if pruningdecisions[k] != 0:
        oldtopruned.append(-1)
        continue
    oldtopruned.append(next_pruned_index)
    next_pruned_index += 1

In [29]:
#%% Create pruned texts
# For every document keep only the tokens whose word survived pruning, storing
# both the words themselves and their indices in the pruned vocabulary.
mycrawled_prunedtexts = []
myindices_in_prunedvocabulary = []
for document_unified_indices in myindices_in_unifiedvocabulary:
    pruned_indices = []
    pruned_words = []
    for old_index in document_unified_indices:
        new_index = oldtopruned[old_index]
        if new_index == -1:
            # This word was pruned: drop the token
            continue
        pruned_indices.append(new_index)
        pruned_words.append(unifiedvocabulary[old_index])
    mycrawled_prunedtexts.append(pruned_words)
    myindices_in_prunedvocabulary.append(pruned_indices)


In [30]:
# Tokens of the document at index 1 that remain after pruning
mycrawled_prunedtexts[1]

['fred',
 'dickey',
 'uiuc',
 'cka52397',
 'uxa.cso.uiuc.edu',
 'active',
 'defend',
 'saturn',
 'lately',
 'full',
 'subject',
 'reply',
 'others',
 'saturn',
 'competitor',
 'overprice',
 'compare',
 'class',
 'understand',
 'whether',
 'understand',
 'profit',
 'figure',
 'minimize',
 'profit',
 'minimize',
 'total',
 'out-of-pocket',
 'expense',
 'generally',
 'saturn',
 'january',
 "'92",
 'study',
 'decide',
 'comparable',
 'cheaply',
 'saturn',
 'saturn',
 'out-of-pocket',
 'expense',
 'important',
 'reduce',
 'profit',
 'reduce',
 'profit',
 'important',
 'experience',
 'reduce',
 'profit',
 'necessarily',
 'fred',
 'saturn',
 '13k',
 'profit',
 'profit',
 '1000',
 '12k',
 'moreover',
 'saturn',
 'reduce',
 'profit',
 'margin',
 '1000',
 'saturn',
 'already',
 '1000',
 'market',
 'class',
 'reduce',
 'profit',
 '2000',
 'market',
 'attract',
 'saturn',
 'force',
 'competitor',
 'lower',
 'survive',
 'saturn',
 'benefit',
 'profit',
 'buyer',
 '0.02',
 'doug']

In [32]:
# Wrap each pruned document in a TaggedDocument whose single tag is
# 'doc<position>', so its vector can later be looked up by that tag.
gensim_tagged_docs = [
    gensim.models.doc2vec.TaggedDocument(pruned_tokens, ['doc' + str(doc_number)])
    for doc_number, pruned_tokens in enumerate(mycrawled_prunedtexts)
]
# Create a dictionary from the documents
gensim_dictionary = gensim.corpora.Dictionary(mycrawled_prunedtexts)

In [33]:
# Number of tagged documents (one per pruned document)
len(gensim_tagged_docs)

4000

In [34]:
# Inspect the tagged document at index 1 (words plus its 'doc1' tag)
gensim_tagged_docs[1]

TaggedDocument(words=['fred', 'dickey', 'uiuc', 'cka52397', 'uxa.cso.uiuc.edu', 'active', 'defend', 'saturn', 'lately', 'full', 'subject', 'reply', 'others', 'saturn', 'competitor', 'overprice', 'compare', 'class', 'understand', 'whether', 'understand', 'profit', 'figure', 'minimize', 'profit', 'minimize', 'total', 'out-of-pocket', 'expense', 'generally', 'saturn', 'january', "'92", 'study', 'decide', 'comparable', 'cheaply', 'saturn', 'saturn', 'out-of-pocket', 'expense', 'important', 'reduce', 'profit', 'reduce', 'profit', 'important', 'experience', 'reduce', 'profit', 'necessarily', 'fred', 'saturn', '13k', 'profit', 'profit', '1000', '12k', 'moreover', 'saturn', 'reduce', 'profit', 'margin', '1000', 'saturn', 'already', '1000', 'market', 'class', 'reduce', 'profit', '2000', 'market', 'attract', 'saturn', 'force', 'competitor', 'lower', 'survive', 'saturn', 'benefit', 'profit', 'buyer', '0.02', 'doug'], tags=['doc1'])

In [35]:
# Train a Doc2Vec model on the tagged documents: 10-dimensional vectors,
# context window of 5, and min_count=1 so that no word is dropped for rarity.
# NOTE(review): with workers=4 the training is multi-threaded, so the learned
# vectors (and the distances computed below) presumably differ between runs;
# confirm against the gensim documentation if exact reproducibility matters.
doc2vecmodel = gensim.models.doc2vec.Doc2Vec(gensim_tagged_docs, vector_size=10, window=5, min_count=1, 
                                             workers=4, dm_concat=0)

In [37]:
# Learned vector for the document tagged 'doc1'
doc2vecmodel['doc1']

array([ 0.32201952, -0.07698996, -0.31582958,  0.10277903, -0.07306157,
       -0.16128775,  0.04651752,  0.6257165 , -0.06180112, -0.05683586],
      dtype=float32)

In [46]:
# File names of the four documents to compare.
# 103118 has a duplicate; only '103118 (2)' is part of rec.motorcycles.
doc_names = ['101551', '103118 (2)', '98657', '52550']

In [48]:
# Position of each selected file in crawled_filenames; this is also the number
# used in its 'doc<position>' tag. list.index raises ValueError if a name is
# missing from the crawl.
doc_indices = [crawled_filenames.index(doc_name) for doc_name in doc_names]

In [49]:
# Indices in crawled_filenames of the documents listed in doc_names, in the same order
doc_indices # corresponding document indices

[1, 681, 3998, 2997]

In [62]:
# distances between doc 101551 and other documents
# The Doc2Vec tags are derived from doc_indices (same order as doc_names:
# 101551, 103118 (2), 98657, 52550) instead of being hard-coded, so the right
# vectors are used even if the crawl order of the files changes.
vec_101551 = doc2vecmodel['doc' + str(doc_indices[0])]
vec_103118 = doc2vecmodel['doc' + str(doc_indices[1])]
vec_98657 = doc2vecmodel['doc' + str(doc_indices[2])]
vec_52550 = doc2vecmodel['doc' + str(doc_indices[3])]
dist_101551_103118 = euclidean(vec_101551, vec_103118)
dist_101551_98657 = euclidean(vec_101551, vec_98657)
dist_101551_52550 = euclidean(vec_101551, vec_52550)

In [63]:
# Euclidean distances from document 101551 to 103118, 98657 and 52550
dist_101551_103118, dist_101551_98657, dist_101551_52550

(0.8489734530448914, 0.9890953302383423, 0.8680312037467957)

In [64]:
# distances between 103118 and (98657 and 52550)
# Tags come from doc_indices (order: 101551, 103118 (2), 98657, 52550) rather
# than hard-coded numbers, so they follow the actual crawl order.
vec_103118 = doc2vecmodel['doc' + str(doc_indices[1])]
vec_98657 = doc2vecmodel['doc' + str(doc_indices[2])]
vec_52550 = doc2vecmodel['doc' + str(doc_indices[3])]
dist_103118_98657 = euclidean(vec_103118, vec_98657)
dist_103118_52550 = euclidean(vec_103118, vec_52550)

In [65]:
# Euclidean distances from document 103118 to 98657 and 52550
dist_103118_98657, dist_103118_52550

(1.2827506065368652, 1.0138620138168335)

In [66]:
# distance between 98657 and 52550
# Tags come from doc_indices (order: 101551, 103118 (2), 98657, 52550) rather
# than hard-coded numbers, so they follow the actual crawl order.
vec_98657 = doc2vecmodel['doc' + str(doc_indices[2])]
vec_52550 = doc2vecmodel['doc' + str(doc_indices[3])]
dist_98657_52550 = euclidean(vec_98657, vec_52550)

In [67]:
# Euclidean distance between documents 98657 and 52550
dist_98657_52550

0.8446446061134338

In [69]:
# Vectors of the documents tagged 'doc1' and 'doc681' (101551 and 103118 (2) per doc_indices)
doc2vecmodel['doc1'], doc2vecmodel['doc681']

(array([ 0.32201952, -0.07698996, -0.31582958,  0.10277903, -0.07306157,
        -0.16128775,  0.04651752,  0.6257165 , -0.06180112, -0.05683586],
       dtype=float32),
 array([-0.1675683 , -0.01746913,  0.10660884, -0.20813224,  0.04478887,
        -0.04763159, -0.01295135,  0.30668226, -0.13105063,  0.19904618],
       dtype=float32))

In [70]:
# Vectors of the documents tagged 'doc3998' and 'doc2997' (98657 and 52550 per doc_indices)
doc2vecmodel['doc3998'], doc2vecmodel['doc2997']

(array([ 0.61556876, -0.15115848,  0.19895129,  0.32541257,  0.51497835,
        -0.11751217,  0.38376427,  0.5241279 , -0.21683079, -0.33301568],
       dtype=float32),
 array([ 0.19670025, -0.19116794, -0.2614279 ,  0.04757068,  0.40910816,
        -0.14905551,  0.26861793,  0.3582254 , -0.64323956, -0.22158138],
       dtype=float32))

**The closest documents are from the same newsgroup**
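- **Documents 101551 and 103118 are each other's nearest neighbour: their distance (≈ 0.849) is smaller than the distance from either of them to 98657 or 52550**
- **Documents 98657 and 52550 are each other's nearest neighbour: their distance (≈ 0.845) is smaller than the distance from either of them to 101551 or 103118**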
- **The contents of the closest pair of documents seem to match that of the original documents**