# TF-IDF算法实例


### 0.引入依赖

In [2]:
import numpy as np
import pandas as pd

### 1.定义数据和预处理

In [4]:
# Two toy documents that make up the corpus.
docA, docB = "The cat sat on my bed", "The dog sat on my knees"

# Tokenize on single spaces to get each document's bag of words.
bowA, bowB = (doc.split(" ") for doc in (docA, docB))
print(bowA)

# Build the vocabulary: every distinct token seen in either document.
wordSet = set(bowA) | set(bowB)
print(wordSet)

['The', 'cat', 'sat', 'on', 'my', 'bed']
{'my', 'The', 'knees', 'on', 'dog', 'bed', 'cat', 'sat'}


### 2.次数统计

In [9]:
#统计字典保存词的出现次数
# Per-document count tables: one zero-initialised entry per vocabulary word.
wordDictA = {word: 0 for word in wordSet}
wordDictB = {word: 0 for word in wordSet}
wordDictA

{'my': 0,
 'The': 0,
 'knees': 0,
 'on': 0,
 'dog': 0,
 'bed': 0,
 'cat': 0,
 'sat': 0}

In [10]:
# Tally each token into the count table of the document it came from.
for bow, counts in ((bowA, wordDictA), (bowB, wordDictB)):
    for word in bow:
        counts[word] += 1

# One row per document, one column per vocabulary word.
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,The,bed,cat,dog,knees,my,on,sat
0,1,1,1,0,0,1,1,1
1,1,0,0,1,1,1,1,1


### 3.计算词频TF

In [11]:
def computeTF(wordDict, bow):
    """Compute the term frequency (TF) of every vocabulary word in one document.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bow: The document's token list; its length is used as the normalizer.

    Returns:
        A new dict mapping each word in ``wordDict`` to ``count / len(bow)``.
        When ``bow`` is empty, every word maps to ``0.0``.
    """
    nbowCount = len(bow)
    if nbowCount == 0:
        # An empty document would otherwise raise ZeroDivisionError below.
        return {word: 0.0 for word in wordDict}

    # Term frequency = occurrences of the word / number of tokens in the document.
    return {word: count / nbowCount for word, count in wordDict.items()}

# Smoke test: term frequencies for both documents.
tfA, tfB = (
    computeTF(counts, bow)
    for counts, bow in ((wordDictA, bowA), (wordDictB, bowB))
)
tfA

{'my': 0.16666666666666666,
 'The': 0.16666666666666666,
 'knees': 0.0,
 'on': 0.16666666666666666,
 'dog': 0.0,
 'bed': 0.16666666666666666,
 'cat': 0.16666666666666666,
 'sat': 0.16666666666666666}

In [15]:
### 4.逆文档频率IDF
def computeIDF(wordDictList):
    """Compute the smoothed inverse document frequency (IDF) of every word.

    Args:
        wordDictList: List of per-document dicts mapping word -> occurrence
            count. A word counts as present in a document when its count is
            greater than zero.

    Returns:
        A dict mapping every word found in any of the input dicts to
        ``log10((N + 1) / (ni + 1))``, where ``N`` is the number of documents
        and ``ni`` is the number of documents containing the word. An empty
        ``wordDictList`` yields an empty dict.
    """
    # Kept function-local, as in the original cell, so the function does not
    # rely on a file-level import of math.
    import math

    N = len(wordDictList)  # number of documents in the corpus

    # Document frequency per word. Keys are collected from every document,
    # not only the first, so a word missing from the first dict is still
    # counted instead of raising KeyError.
    docFreq = {}
    for wordDict in wordDictList:
        for word, count in wordDict.items():
            docFreq.setdefault(word, 0)
            if count > 0:
                docFreq[word] += 1

    # Turn each document frequency into its smoothed IDF value.
    return {word: math.log10((N + 1) / (ni + 1)) for word, ni in docFreq.items()}

# IDF over the whole corpus (the count tables of both documents).
idf = computeIDF(wordDictList=[wordDictA, wordDictB])
idf

{'my': 0.0,
 'The': 0.0,
 'knees': 0.17609125905568124,
 'on': 0.0,
 'dog': 0.17609125905568124,
 'bed': 0.17609125905568124,
 'cat': 0.17609125905568124,
 'sat': 0.0}

### 5.计算tf-idf

In [17]:
def computeTFIDF(tf, idf):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tf: Mapping of word -> term frequency for one document.
        idf: Mapping of word -> inverse document frequency; it must contain
            every word present in ``tf``.

    Returns:
        A new dict mapping each word in ``tf`` to ``tf[word] * idf[word]``.
    """
    return {word: tfval * idf[word] for word, tfval in tf.items()}

# Weight each document's term frequencies by the shared IDF table.
tfidfA, tfidfB = (computeTFIDF(tf, idf) for tf in (tfA, tfB))

# One row per document, one column per vocabulary word.
pd.DataFrame([tfidfA, tfidfB])

Unnamed: 0,The,bed,cat,dog,knees,my,on,sat
0,0.0,0.029349,0.029349,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.029349,0.029349,0.0,0.0,0.0
