In [1]:
# Toy corpus of three short sentences; later cells build their vocabulary and vectors from it.
mydoclist = ['Julie loves me more than Linda loves me',
'Jane likes me more than Julie loves me',
'He likes basketball more than baseball']

In [2]:
from collections import Counter

In [3]:
doc = mydoclist[1]
doc

'Jane likes me more than Julie loves me'

In [4]:
a = doc.split()
a

['Jane', 'likes', 'me', 'more', 'than', 'Julie', 'loves', 'me']

In [5]:
a.count('likes')

1

In [6]:
# Count word occurrences document by document. A fresh Counter is bound on
# every iteration, so once the loop ends `tf` holds the counts of the last
# document only.
for doc in mydoclist:
    tf = Counter(doc.split())
tf

Counter({'He': 1,
         'likes': 1,
         'basketball': 1,
         'more': 1,
         'than': 1,
         'baseball': 1})

In [7]:
tf.keys()

dict_keys(['He', 'likes', 'basketball', 'more', 'than', 'baseball'])

In [8]:
# NOTE(review): binding the name `dict` shadows the built-in dict type for the rest of the session.
dict = {'Name': 'Zara', 'Age': 7, 'Class': 'First'}
dict.keys()

dict_keys(['Name', 'Age', 'Class'])

In [9]:
dict.values()

dict_values(['Zara', 7, 'First'])

In [10]:
dict.items()

dict_items([('Name', 'Zara'), ('Age', 7), ('Class', 'First')])

In [11]:
import string

In [12]:
def build_lexicon(corpus):
    """Return the set of distinct whitespace-separated tokens found in ``corpus``.

    ``corpus`` is an iterable of strings. Tokens are taken verbatim from
    ``str.split()``, so matching is case-sensitive and punctuation is kept.
    """
    return {word for doc in corpus for word in doc.split()}

def tf(term, document):
    """Return the raw term frequency of ``term`` in ``document``.

    Thin wrapper: the counting itself is delegated to ``freq``.
    """
    occurrences = freq(term, document)
    return occurrences

def freq(term, document):
    """Count the whitespace-separated tokens of ``document`` that equal ``term``.

    The comparison is exact: case matters and punctuation is not stripped.
    """
    return sum(1 for token in document.split() if token == term)

In [13]:
# The vocabulary is a set; later cells iterate over it to lay out vector columns,
# so the column order is whatever order this set yields (it may differ in a fresh kernel).
vocabulary = build_lexicon(mydoclist)
vocabulary #set

{'He',
 'Jane',
 'Julie',
 'Linda',
 'baseball',
 'basketball',
 'likes',
 'loves',
 'me',
 'more',
 'than'}

In [14]:
b = list(vocabulary)
print(', '.join(b))
b

baseball, Julie, me, Linda, likes, Jane, more, basketball, loves, than, He


['baseball',
 'Julie',
 'me',
 'Linda',
 'likes',
 'Jane',
 'more',
 'basketball',
 'loves',
 'than',
 'He']

In [15]:
print('Our vocabulary vector is [' + ', '.join(list(vocabulary)) + ']')


Our vocabulary vector is [baseball, Julie, me, Linda, likes, Jane, more, basketball, loves, than, He]


In [16]:
# One term-frequency row per document; columns follow the iteration order of `vocabulary`.
doc_term_matrix = []
print('Our vocabulary vector is [' + ', '.join(vocabulary) + ']')
for index, doc in enumerate(mydoclist):
    print('The doc is "' + doc + '"')
    tf_vector = [tf(term, doc) for term in vocabulary]
    # format() without a spec renders each count the same way str() would
    tf_vector_string = ', '.join(map(format, tf_vector))
    print('The tf vector for Document %d is [%s]' % (index+1, tf_vector_string))
    doc_term_matrix.append(tf_vector)

print('All combined, here is our master document term matrix: ')
print(doc_term_matrix)

Our vocabulary vector is [baseball, Julie, me, Linda, likes, Jane, more, basketball, loves, than, He]
The doc is "Julie loves me more than Linda loves me"
The tf vector for Document 1 is [0, 1, 2, 1, 0, 0, 1, 0, 2, 1, 0]
The doc is "Jane likes me more than Julie loves me"
The tf vector for Document 2 is [0, 1, 2, 0, 1, 1, 1, 0, 1, 1, 0]
The doc is "He likes basketball more than baseball"
The tf vector for Document 3 is [1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1]
All combined, here is our master document term matrix: 
[[0, 1, 2, 1, 0, 0, 1, 0, 2, 1, 0], [0, 1, 2, 0, 1, 1, 1, 0, 1, 1, 0], [1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1]]


Normalizing vectors to L2 Norm = 1

In [17]:
import math
import numpy as np

def l2_normalizer(vector):
    """Scale ``vector`` so that its Euclidean (L2) norm equals 1.

    Parameters
    ----------
    vector : sequence of numbers
        The values to normalise (e.g. one row of a document-term matrix).

    Returns
    -------
    list
        ``element / ||vector||`` for every element, in the original order.
        A vector whose norm is zero (all zeros, e.g. a document that shares
        no term with the vocabulary) cannot be scaled to unit length; it is
        returned as a list of ``0.0`` instead of dividing by zero.
    """
    # Compute the norm once instead of re-evaluating sqrt for every element.
    norm = math.sqrt(np.sum([element**2 for element in vector]))
    if norm == 0:
        return [0.0 for _ in vector]
    return [element / norm for element in vector]

# Normalise every row of the raw count matrix to unit Euclidean length.
doc_term_matrix_l2 = [l2_normalizer(row) for row in doc_term_matrix]

# np.matrix is used only to get a readable, aligned printout.
print('A regular old documnet term matrix: ')
print(np.matrix(doc_term_matrix))
print('\nA document term matrix with row-wise L2 norms of 1:')
print(np.matrix(doc_term_matrix_l2))

A regular old documnet term matrix: 
[[0 1 2 1 0 0 1 0 2 1 0]
 [0 1 2 0 1 1 1 0 1 1 0]
 [1 0 0 0 1 0 1 1 0 1 1]]

A document term matrix with row-wise L2 norms of 1:
[[ 0.          0.28867513  0.57735027  0.28867513  0.          0.
   0.28867513  0.          0.57735027  0.28867513  0.        ]
 [ 0.          0.31622777  0.63245553  0.          0.31622777  0.31622777
   0.31622777  0.          0.31622777  0.31622777  0.        ]
 [ 0.40824829  0.          0.          0.          0.40824829  0.
   0.40824829  0.40824829  0.          0.40824829  0.40824829]]


In [18]:
print(doc_term_matrix)

[[0, 1, 2, 1, 0, 0, 1, 0, 2, 1, 0], [0, 1, 2, 0, 1, 1, 1, 0, 1, 1, 0], [1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1]]


In [19]:
from numpy import linalg as la
print(la.norm(doc_term_matrix[0]))
print(la.norm(doc_term_matrix_l2[0]))

3.46410161514
1.0


Inverse document frequency (IDF) weighting

In [20]:
def numDocsContaining(word, doclist):
    """Return how many documents in ``doclist`` contain ``word`` at least once.

    Presence is decided by ``freq``, i.e. by exact token match.
    """
    return sum(1 for doc in doclist if freq(word, doc) > 0)

In [21]:
def idf(word, doclist):
    """Return the inverse document frequency of ``word`` over ``doclist``.

    Computed as ``log(N / df + 1)`` (natural log), where ``N`` is the number
    of documents and ``df`` is the number of documents containing ``word``.

    Raises
    ------
    ZeroDivisionError
        If no document in ``doclist`` contains ``word`` (``df`` is 0).
    """
    total_docs = len(doclist)
    containing_docs = numDocsContaining(word, doclist)
    ratio = total_docs / containing_docs
    return np.log(ratio + 1)

# One idf weight per vocabulary term, in the same iteration order used for the tf vectors.
my_idf_vector = [idf(word, mydoclist) for word in vocabulary]

In [22]:
# Show the vocabulary terms, then their idf weights in fixed-point notation.
print('Our vocabulary vector is [' + ', '.join(vocabulary) + ']')
print('The inverse documnet frequency vector is ['
      + ', '.join(format(weight, 'f') for weight in my_idf_vector)
      + ']')

Our vocabulary vector is [baseball, Julie, me, Linda, likes, Jane, more, basketball, loves, than, He]
The inverse documnet frequency vector is [1.386294, 0.916291, 0.916291, 1.386294, 0.916291, 1.386294, 0.693147, 1.386294, 0.916291, 0.693147, 1.386294]


In [23]:
import numpy as np

def build_idf_matrix(idf_vector):
    """Return a square float matrix with ``idf_vector`` on its main diagonal.

    All off-diagonal entries are zero, so multiplying a term-frequency row by
    this matrix scales each entry by the matching idf weight.
    """
    # Cast to float first so the result is a float matrix even for integer input.
    weights = np.asarray(idf_vector, dtype=float)
    return np.diag(weights)

# Diagonal matrix of the idf weights; a later cell multiplies each tf row by it with np.dot.
my_idf_matrix = build_idf_matrix(my_idf_vector)
print (my_idf_matrix)

[[ 1.38629436  0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.        ]
 [ 0.          0.91629073  0.          0.          0.          0.          0.
   0.          0.          0.          0.        ]
 [ 0.          0.          0.91629073  0.          0.          0.          0.
   0.          0.          0.          0.        ]
 [ 0.          0.          0.          1.38629436  0.          0.          0.
   0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.91629073  0.          0.
   0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          1.38629436
   0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.69314718  0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.          0.
   1.386294

In [24]:
doc_term_matrix

[[0, 1, 2, 1, 0, 0, 1, 0, 2, 1, 0],
 [0, 1, 2, 0, 1, 1, 1, 0, 1, 1, 0],
 [1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1]]

In [25]:
# tf-idf: multiply every term-frequency row by the diagonal idf matrix,
# which scales each term count by that term's idf weight.
doc_term_matrix_tfidf = [np.dot(row, my_idf_matrix) for row in doc_term_matrix]

In [26]:
# Rescale each tf-idf row to unit L2 norm.
doc_term_matrix_tfidf_l2 = list(map(l2_normalizer, doc_term_matrix_tfidf))

In [27]:
print (vocabulary)
print (np.matrix(doc_term_matrix_tfidf_l2)) # np.matrix() just to make it easier to look at

{'baseball', 'Julie', 'me', 'Linda', 'likes', 'Jane', 'more', 'basketball', 'loves', 'than', 'He'}
[[ 0.          0.28359822  0.56719644  0.42906755  0.          0.
   0.21453377  0.          0.56719644  0.21453377  0.        ]
 [ 0.          0.30958879  0.61917759  0.          0.30958879  0.46838976
   0.23419488  0.          0.30958879  0.23419488  0.        ]
 [ 0.50399273  0.          0.          0.          0.33312107  0.
   0.25199636  0.50399273  0.          0.25199636  0.50399273]]


In [28]:
# Same pipeline with scikit-learn: raw counts first, then tf-idf weighting with L2 row normalisation.
from sklearn.feature_extraction.text import CountVectorizer

# The fitted vocabulary printed below is lowercase and has its own column indices,
# so these columns do not line up with the hand-built, case-sensitive `vocabulary`.
count_vectorizer = CountVectorizer(min_df=1)
term_freq_matrix = count_vectorizer.fit_transform(mydoclist)
print ("Vocabulary:", count_vectorizer.vocabulary_)
print (term_freq_matrix.todense())

from sklearn.feature_extraction.text import TfidfTransformer

# NOTE(review): the printed weights differ from the manual tf-idf matrix; presumably
# TfidfTransformer applies a different (smoothed) idf formula than idf() here — confirm in the docs.
tfidf = TfidfTransformer(norm="l2")
tf_idf_matrix = tfidf.fit_transform(term_freq_matrix)
print (tf_idf_matrix.todense())

Vocabulary: {'julie': 4, 'loves': 7, 'me': 8, 'more': 9, 'than': 10, 'linda': 6, 'jane': 3, 'likes': 5, 'he': 2, 'basketball': 1, 'baseball': 0}
[[0 0 0 0 1 0 1 2 2 1 1]
 [0 0 0 1 1 1 0 1 2 1 1]
 [1 1 1 0 0 1 0 0 0 1 1]]
[[ 0.          0.          0.          0.          0.28945906  0.
   0.38060387  0.57891811  0.57891811  0.22479078  0.22479078]
 [ 0.          0.          0.          0.41715759  0.3172591   0.3172591
   0.          0.3172591   0.6345182   0.24637999  0.24637999]
 [ 0.48359121  0.48359121  0.48359121  0.          0.          0.36778358
   0.          0.          0.          0.28561676  0.28561676]]


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer goes straight from raw text to tf-idf weights; the matrix printed
# below matches the one obtained from CountVectorizer followed by TfidfTransformer.
tfidf_vectorizer = TfidfVectorizer(min_df = 1)
tfidf_matrix = tfidf_vectorizer.fit_transform(mydoclist)

print (tfidf_matrix.todense())

[[ 0.          0.          0.          0.          0.28945906  0.
   0.38060387  0.57891811  0.57891811  0.22479078  0.22479078]
 [ 0.          0.          0.          0.41715759  0.3172591   0.3172591
   0.          0.3172591   0.6345182   0.24637999  0.24637999]
 [ 0.48359121  0.48359121  0.48359121  0.          0.          0.36778358
   0.          0.          0.          0.28561676  0.28561676]]
