# Advanced: Text Processing in Matrices

## Load Natural Language Toolkit for Parsing

In [2]:
! pip install nltk
import nltk

# Enter 'd' for Download, then 'punkt', and then 'q' for quit
nltk.download()


[33mYou are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

## Import text files into dictionary

As a "corpus" we fetched some data from Wikipedia, based on currently
trendy (2/18/2017) topics.  Each topic had multiple interpretations, some of which 
we suspected would "intersect" in interesting ways (e.g., Trump/Putin, Cloud/Google, 
Cloud/Climate).  Others had various interpretations (e.g., there are many types of 
Football).  See _Wikipedia.ipynb_ for the original download code.

Selected topics (for which the top-10 matches were returned by Wikipedia) were:

 * Pennsylvania
 * Trump
 * Apple
 * Google
 * Farm
 * Climate
 * Cloud
 * Football
 * Government
 * Putin

*docs* is a map from file --> text

In [3]:
import os

docs = {}

for filename in os.listdir('text'):
    file = open('text/' + filename)
    docs[filename] = file.read()
    print ('Loaded',filename)

print ("All files loaded")

Loaded Province of Pennsylvania.txt
Loaded Eric Trump.txt
Loaded Apple.txt
Loaded Putin khuilo!.txt
Loaded Cooking apple.txt
Loaded Apple TV.txt
Loaded Pennsylvania Historical and Museum Commission.txt
Loaded Football player.txt
Loaded Donald Trump.txt
Loaded Public image of Vladimir Putin.txt
Loaded Google Developers.txt
Loaded Alpine climate.txt
Loaded Desert climate.txt
Loaded Century Farm.txt
Loaded Apple Inc..txt
Loaded Animal Farm.txt
Loaded Google Books.txt
Loaded Google Account.txt
Loaded Oort cloud.txt
Loaded HP Cloud.txt
Loaded Farm Aid.txt
Loaded History of Pennsylvania.txt
Loaded E-government.txt
Loaded Trump University.txt
Loaded Outline of Pennsylvania.txt
Loaded Google Search.txt
Loaded Arrest of Vladimir Putin viral video.txt
Loaded AtGoogleTalks.txt
Loaded Cloud computing.txt
Loaded Government of Australia.txt
Loaded Government.txt
Loaded Family of Donald Trump.txt
Loaded Stratus cloud.txt
Loaded Brook Farm.txt
Loaded Google.txt
Loaded Wind farm.txt
Loaded Subarctic cl

## Other preliminaries to get you started.

The function *has_letter* should be used to filter words based on the presence of a letter.

The set *stopwords* includes words to ignore.

In [4]:
import nltk
from nltk.stem.porter import *
import re
import numpy as np

"""
# Returns True if the input (string) parameter has
# any sort of letter in it, else returns False.
"""
def has_letter(x):
    return re.match('.*[a-zA-Z].*',x) != None

# Stopwords are words we will ignore for search
# purposes, because they are too common to be useful
stopwords = set()

stop_file = open('stopwords.txt')
for line in stop_file:
    stopwords.add(line.strip())

# The NLTK parser breaks apostrophe-s into a separate "word"
# so we'll want to add it to the list... Though it's technically
# not a stop word in the traditional sense.
stopwords.add("'s")

# Use this as the maximum number of words we will index
MAX_WORDS = 18174

# Create the word stemmer
stemmer = PorterStemmer()

# Your Code Goes Here!

Note that you may want to read more about TF*IDF scoring at:

* http://nlp.stanford.edu/IR-book/html/htmledition/term-frequency-and-weighting-1.html
* https://en.wikipedia.org/wiki/Tf%E2%80%93idf

In [5]:
docs['Wind farm.txt'][0:20]

'A wind farm is a gro'

In [6]:
len(stopwords)

175

In [7]:
lexicon = {}
inverse_lexicon = []
word_count = 0
doc_vectors = np.zeros((len(docs), MAX_WORDS))

In [8]:
def doc_vector(content, vector, lexicon, inverse_lexicon, stopwords, word_count):
    tokens = nltk.word_tokenize(content)
    for token in tokens:
        if(has_letter(token) and token not in stopwords):
            stem_token = stemmer.stem(token)
            if (stem_token in lexicon):
                vector[lexicon[stem_token]] += 1
            else:
                if(word_count < MAX_WORDS):
                    lexicon[stem_token] = word_count
                    inverse_lexicon.append(stem_token)
                    vector[word_count] += 1
                    word_count += 1
    return word_count

In [9]:
# doc_vector(docs['Farm.txt'], doc_vectors[0], lexicon, inverse_lexicon, stopwords, 0)
i = 0
for doc in docs:
    word_count = doc_vector(docs[doc], doc_vectors[i], lexicon, inverse_lexicon, stopwords, word_count)
    i += 1

In [10]:
for i in range(len(doc_vectors)):
    print(len([x for x in doc_vectors[i] if x != 0]))

1034
2249
1272
219
360
508
1998
157
1135
586
188
154
1053
1553
220
284
1289
1132
882
526
667
819
548
305
1351
1483
194
304
490
529
1469
581
633
1535
408
114
323
1253
1216
1215
110
87
2509
400
704
642
587
1896
467
981
72
479
502
511
406
1393
633
827
37
399
1641
213
272
317
382
1991
1591
232
108
1040
509
1739
399
1992
359
594
1394
717
662
511
2347
349
531
573
961
213
1326
530
1276
1238
376
1585
297
393
959
317
314
302


In [11]:
doc_vectors[:,0]

array([  9.,  16.,  13.,   1.,   0.,   0.,  15.,   2.,  14.,   3.,   7.,
         0.,   9.,  19.,   1.,   1.,   6.,  13.,   4.,   0.,   2.,   5.,
         3.,   2.,   7.,  15.,   0.,   0.,   5.,   1.,  13.,   6.,   5.,
         4.,   6.,   0.,   0.,  10.,   3.,  11.,   1.,   2.,  12.,   3.,
        19.,   2.,   1.,   2.,   0.,   9.,   1.,   4.,   2.,   4.,   1.,
        14.,   3.,   7.,   1.,   2.,   8.,   4.,   2.,   0.,   7.,   9.,
        20.,   0.,   1.,  16.,   1.,  11.,   0.,   4.,   1.,   2.,  10.,
         4.,   6.,   1.,  15.,   1.,   3.,   1.,   1.,   0.,   8.,   1.,
        18.,  18.,   7.,   9.,   0.,   2.,   8.,   1.,   3.,   4.])

In [12]:
corpus_size = len(docs)

In [13]:
from math import log10
idfs = [log10(corpus_size / np.count_nonzero(doc_vectors[:,i])) for i in range(MAX_WORDS)]

In [14]:
def create_query_vector(query):
    query_vector = np.zeros(MAX_WORDS)
    global word_count
    word_count = doc_vector(query, query_vector, lexicon, inverse_lexicon, stopwords, word_count)
    return query_vector

In [71]:
import pandas as pd
def search(vectors, idf, query, num_results):
    query_idf = idf * query
    
    scores = np.array([(vectors[i] * idf) @ query_idf / (np.linalg.norm(vectors[i] * idf) * np.linalg.norm(query_idf))
                       for i in range(corpus_size)])
    dict = {"docid":np.arange(corpus_size), "docname":np.array(list(docs.keys())), "score":pd.to_numeric(scores, errors='coerce')}
    result = pd.DataFrame.from_dict(dict)
    result = result.sort_values("score", ascending=False)
    return result[0:10]
#     return result

## Step 5

In [74]:
search(doc_vectors, idfs, create_query_vector("Apple Steve jobs"), 10)

Unnamed: 0,docid,docname,score
80,80,Apple Inc..txt,0.48885
31,31,Apple I.txt,0.433946
20,20,Apple III.txt,0.400616
60,60,Apple II series.txt,0.349057
84,84,Apple Store.txt,0.33698
37,37,Apple.txt,0.329733
7,7,Cooking apple.txt,0.303956
17,17,Apple TV.txt,0.301958
24,24,Apple Corps.txt,0.274117
48,48,Home Farm F.C..txt,0.019623


In [77]:
search(doc_vectors, idfs, create_query_vector("Trump Putin"), 5)

Unnamed: 0,docid,docname,score
42,42,Donald Trump.txt,0.660205
91,91,Legal affairs of Donald Trump.txt,0.635921
89,89,The Trump Organization.txt,0.627659
77,77,Trump University.txt,0.592956
69,69,Public image of Vladimir Putin.txt,0.590706
72,72,Family of Donald Trump.txt,0.583825
1,1,Vladimir Putin.txt,0.575655
74,74,Trump family.txt,0.523255
59,59,Eric Trump.txt,0.4856
65,65,Russia under Vladimir Putin.txt,0.461298


In [78]:
search(doc_vectors, idfs, create_query_vector("Google Cloud"), 10)

Unnamed: 0,docid,docname,score
47,47,Google.txt,0.654294
96,96,Google Developers.txt,0.549571
61,61,Google Account.txt,0.518488
82,82,Google Talk.txt,0.486282
2,2,Cloud computing.txt,0.484473
10,10,Arcus cloud.txt,0.454484
12,12,Google Books.txt,0.450616
25,25,Cloud.txt,0.439715
3,3,AtGoogleTalks.txt,0.431917
64,64,Stratus cloud.txt,0.424013
