# 简单TF-IDF示例

In [1]:
import numpy as np
import pandas as pd

## 1. 定义数据和预处理

In [2]:
# Two tiny sample documents; neither contains punctuation.
# NOTE: "kness" (sic) is kept as-is because it appears as a vocabulary key
# in the recorded outputs of this notebook.
docA = "The cat sat on my bed"
docB = "The dog sat on my kness"

# Tokenize each document on single spaces to obtain its bag of words.
bowA, bowB = (doc.split(" ") for doc in (docA, docB))

# The full vocabulary is the union of the tokens of both documents.
wordSet = set(bowA) | set(bowB)

## 2. 进行词数统计

In [3]:
# One counting dictionary per document, keyed by every vocabulary word and
# initialised to zero so that words absent from a document still get a column.
wordDictA = dict.fromkeys(wordSet, 0)
wordDictB = dict.fromkeys(wordSet, 0)

# Walk each document's tokens and bump the matching counter.
for counts, bow in ((wordDictA, bowA), (wordDictB, bowB)):
    for word in bow:
        counts[word] += 1

# One row per document, one column per vocabulary word.
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,cat,sat,dog,kness,The,my,on,bed
0,1,1,0,0,1,1,1,1
1,0,1,1,1,1,1,1,0


## 3. 计算TF

In [4]:
def computeTF(wordDict, bow):
    """Compute the term frequency (TF) of every word for one document.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bow: The document's token list (bag of words). Its length is the
            denominator of every frequency.

    Returns:
        A new dict mapping each key of ``wordDict`` to ``count / len(bow)``.
        When ``bow`` is empty, every frequency is ``0.0`` instead of the
        call failing with ``ZeroDivisionError``.
    """
    nbowCount = len(bow)

    # An empty document contains no terms, so define every frequency as 0
    # rather than dividing by zero.
    if nbowCount == 0:
        return dict.fromkeys(wordDict, 0.0)

    return {word: count / nbowCount for word, count in wordDict.items()}

# Term frequencies of each document, each normalised by its own token count.
tfA, tfB = computeTF(wordDictA, bowA), computeTF(wordDictB, bowB)

# Bare expression so the notebook displays document A's term frequencies.
tfA

{'cat': 0.16666666666666666,
 'sat': 0.16666666666666666,
 'dog': 0.0,
 'kness': 0.0,
 'The': 0.16666666666666666,
 'my': 0.16666666666666666,
 'on': 0.16666666666666666,
 'bed': 0.16666666666666666}

## 4. 计算逆文档频率

In [5]:
import math

def computeIDF(wordDictList):
    """Compute the add-one smoothed inverse document frequency of every word.

    Args:
        wordDictList: List with one dict per document, each mapping
            word -> number of occurrences in that document.

    Returns:
        A dict mapping every word that appears as a key in any of the input
        dicts to ``log10((N + 1) / (Ni + 1))``, where ``N`` is the number of
        documents and ``Ni`` is the number of documents in which the word's
        count is greater than zero. An empty ``wordDictList`` yields ``{}``.
    """
    N = len(wordDictList)

    # Document frequency Ni per word. Keys are collected from every document
    # (not only the first one), so dictionaries with differing vocabularies
    # no longer raise KeyError, and an empty corpus no longer raises
    # IndexError.
    docFreq = {}
    for wordDict in wordDictList:
        for word, count in wordDict.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if count > 0 else 0)

    # Turn each Ni into its IDF value using the smoothed formula.
    return {word: math.log10((N + 1) / (ni + 1)) for word, ni in docFreq.items()}

# Compute IDF weights across the count dictionaries of both documents.
idfs = computeIDF([wordDictA, wordDictB])
# Bare expression so the notebook displays the resulting dict.
idfs

{'cat': 0.17609125905568124,
 'sat': 0.0,
 'dog': 0.17609125905568124,
 'kness': 0.17609125905568124,
 'The': 0.0,
 'my': 0.0,
 'on': 0.0,
 'bed': 0.17609125905568124}

## 5. 计算TF-IDF

In [6]:
def computeTFIDF(tf, idfs):
    """Combine one document's term frequencies with IDF weights.

    Args:
        tf: Mapping of word -> term frequency for a single document.
        idfs: Mapping of word -> inverse document frequency. It must contain
            every key of ``tf``; a missing word raises ``KeyError``.

    Returns:
        A new dict mapping each word of ``tf`` to ``tf[word] * idfs[word]``.
    """
    return {word: tfval * idfs[word] for word, tfval in tf.items()}

# TF-IDF scores of each document, using the shared corpus-wide IDF weights.
tfidfA, tfidfB = (computeTFIDF(tf, idfs) for tf in (tfA, tfB))

# One row per document, one column per vocabulary word.
pd.DataFrame([tfidfA, tfidfB])

Unnamed: 0,cat,sat,dog,kness,The,my,on,bed
0,0.029349,0.0,0.0,0.0,0.0,0.0,0.0,0.029349
1,0.0,0.0,0.029349,0.029349,0.0,0.0,0.0,0.0
