In [157]:
# Two tiny example documents that make up the whole corpus for this walkthrough.
docA = "The cat sat on my face"
docB = "The dog sat on my bed"

In [158]:
# Tokenize each document into a bag of words. split() with no argument
# splits on any run of whitespace, so repeated/leading/trailing spaces do
# not produce empty-string tokens (which split(" ") would).
bowA = docA.split()
bowB = docB.split()

In [159]:
bowB

['The', 'dog', 'sat', 'on', 'my', 'bed']

In [160]:
# Vocabulary: every distinct token that occurs in either document.
wordSet = set(bowA) | set(bowB)

In [161]:
wordSet

{'The', 'bed', 'cat', 'dog', 'face', 'my', 'on', 'sat'}

In [162]:
# One zero-initialised counter per vocabulary word, separately for each document.
wordDictA = {word: 0 for word in wordSet}
wordDictB = {word: 0 for word in wordSet}

In [163]:
wordDictA

{'The': 0, 'bed': 0, 'cat': 0, 'dog': 0, 'face': 0, 'my': 0, 'on': 0, 'sat': 0}

In [164]:
wordDictB

{'The': 0, 'bed': 0, 'cat': 0, 'dog': 0, 'face': 0, 'my': 0, 'on': 0, 'sat': 0}

In [165]:
# Tally how many times each token occurs in document A ...
for word in bowA:
    wordDictA[word] = wordDictA[word] + 1

# ... and in document B.
for word in bowB:
    wordDictB[word] = wordDictB[word] + 1

In [166]:
wordDictA

{'The': 1, 'bed': 0, 'cat': 1, 'dog': 0, 'face': 1, 'my': 1, 'on': 1, 'sat': 1}

In [167]:
wordDictB

{'The': 1, 'bed': 1, 'cat': 0, 'dog': 1, 'face': 0, 'my': 1, 'on': 1, 'sat': 1}

In [168]:
import pandas as pd

pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,The,bed,cat,dog,face,my,on,sat
0,1,0,1,0,1,1,1,1
1,1,1,0,1,0,1,1,1


In [169]:
def computeTF(wordDict, bow):
    """
    Compute the Term Frequency (TF) score of every word for one document.

    tf = (number of times the word appears in the document)
         / (total number of words in that document)

    wordDict: mapping of word -> number of occurrences in the document.
    bow: the document's list of tokens; only its length is used.

    Returns a new dict mapping each key of wordDict to its TF score. For an
    empty document (len(bow) == 0) every score is 0.0 rather than raising
    ZeroDivisionError.
    """
    bowCount = len(bow)
    if bowCount == 0:
        # An empty document contains no occurrences of any word.
        return dict.fromkeys(wordDict, 0.0)

    total = float(bowCount)
    return {word: count / total for word, count in wordDict.items()}

In [170]:
# Term-frequency table for each document, computed from its counts and tokens.
tfBowA, tfBowB = (
    computeTF(counts, tokens)
    for counts, tokens in ((wordDictA, bowA), (wordDictB, bowB))
)

In [171]:
tfBowA

{'The': 0.16666666666666666,
 'bed': 0.0,
 'cat': 0.16666666666666666,
 'dog': 0.0,
 'face': 0.16666666666666666,
 'my': 0.16666666666666666,
 'on': 0.16666666666666666,
 'sat': 0.16666666666666666}

In [172]:
tfBowB

{'The': 0.16666666666666666,
 'bed': 0.16666666666666666,
 'cat': 0.0,
 'dog': 0.16666666666666666,
 'face': 0.0,
 'my': 0.16666666666666666,
 'on': 0.16666666666666666,
 'sat': 0.16666666666666666}

In [173]:
def computeIDF(docList):
    """
    Compute the Inverse Document Frequency (IDF) of every word in the corpus.

    Used to weight words that are rare across the documents more heavily:

        idf = log10(N / df)

    where N is the number of documents and df is the number of documents in
    which the word's count is greater than zero.

    docList: list of dicts, each mapping word -> occurrence count for one
    document.

    Returns a dict mapping word -> IDF. The vocabulary is the union of the
    keys of all documents (in first-seen order), so documents with differing
    key sets do not raise KeyError. A word that occurs in no document
    (df == 0) gets an IDF of 0.0 instead of raising ZeroDivisionError. An
    empty docList yields an empty dict.
    """
    import math

    N = len(docList)

    # Document frequency: in how many documents does each word occur?
    dfDict = {}
    for doc in docList:
        for word, val in doc.items():
            dfDict.setdefault(word, 0)
            if val > 0:
                dfDict[word] += 1

    idfDict = {}
    for word, df in dfDict.items():
        # df == 0 would divide by zero; such a word never contributes to
        # any TF-IDF score (its TF is 0 everywhere), so 0.0 is a safe weight.
        idfDict[word] = math.log10(N / float(df)) if df else 0.0
    return idfDict

In [174]:
# IDF weights computed once over the whole corpus (both documents' count tables).
idfs = computeIDF([wordDictA, wordDictB])

In [175]:
def computeTFIDF(tfBow, idfs):
    """
    Combine term frequencies with corpus-wide IDF weights.

    tfBow: dict mapping word -> TF score for one document.
    idfs: dict mapping word -> IDF weight; must contain every word of tfBow.

    Returns a new dict mapping each word of tfBow to tf * idf.
    """
    return {term: tf * idfs[term] for term, tf in tfBow.items()}

In [176]:
# Final TF-IDF table for each document, using the shared IDF weights.
tfidfBowA, tfidfBowB = (computeTFIDF(tf, idfs) for tf in (tfBowA, tfBowB))

In [177]:
import pandas as pd
pd.DataFrame([tfidfBowA, tfidfBowB])

Unnamed: 0,The,bed,cat,dog,face,my,on,sat
0,0.0,0.0,0.050172,0.0,0.050172,0.0,0.0,0.0
1,0.0,0.050172,0.0,0.050172,0.0,0.0,0.0,0.0
