# 简单TF-IDF示例

In [1]:
import numpy as np
import pandas as pd

## 1. 定义数据和预处理

In [2]:
# m没有使用标点符号。
docA = "The cat sat on my bed"
docB = "The dog sat on my kness"

# 先分词。生成词袋。
bowA = docA.split(" ")
bowB = docB.split(" ")

# 构建完整词库，取并集。
wordSet = set(bowA).union(set(bowB)) 

## 2. 进行词数统计

In [3]:
# 用统计字典来保存词出现的次数。
wordDictA = dict.fromkeys(wordSet, 0)
wordDictB = dict.fromkeys(wordSet, 0)

# 遍历文档，统计词数。
for word in bowA:
    wordDictA[word] += 1
for word in bowB:
    wordDictB[word] += 1

pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,bed,dog,sat,The,kness,on,my,cat
0,1,0,1,1,0,1,1,1
1,0,1,1,1,1,1,1,0


## 3. 计算TF

$$\text{词频TF}=\frac{\text{某个词在一篇文章中出现的次数}}{\text{一篇文章的总词数}}$$

In [4]:
def computeTF(wordDict, bow):
    # 用一个字典对象记录TF
    tfDict = {}
    nbowCount = len(bow)
    
    for word,count in wordDict.items():
        # print(word, count,)
        tfDict[word] = count / nbowCount
    
    return tfDict

tfA = computeTF(wordDictA, bowA)
tfB = computeTF(wordDictB, bowB)

tfA

{'bed': 0.16666666666666666,
 'dog': 0.0,
 'sat': 0.16666666666666666,
 'The': 0.16666666666666666,
 'kness': 0.0,
 'on': 0.16666666666666666,
 'my': 0.16666666666666666,
 'cat': 0.16666666666666666}

## 4. 计算逆文档频率

$$\text{逆文档频率IDF}=\log_{10}\frac{\text{语料库的文档总数}}{\text{包含该词的文档数}+1}$$
1. “语料库的文档总数”是固定值。
2. 只需要求“包含该词的文档数”即可。

In [5]:
import math

def computeIDF(wordDictList):
    # 用一个字典对象来保存idf结果，每个词作为key，初始值为0。
    idfDict = dict.fromkeys(wordDictList[0], 0)
    N = len(wordDictList)
    
    for wordDict in wordDictList:
        # 遍历字典中的每个词汇，统计Ni
        for word,count in wordDict.items():
            if count > 0 :
                # 先把Ni增加1，存入到idfDict
                idfDict[word] += 1
    
    print(idfDict)
    # 已经得到所有词汇i对应的Ni，现在更加公式把它替换为idf值。
    for word, ni in idfDict.items():
        idfDict[word] = math.log10((N+1)/(ni+1))
        print(N, ni)
    
    return idfDict

idfs = computeIDF([wordDictA, wordDictB])
idfs

{'bed': 1, 'dog': 1, 'sat': 2, 'The': 2, 'kness': 1, 'on': 2, 'my': 2, 'cat': 1}
2 1
2 1
2 2
2 2
2 1
2 2
2 2
2 1


{'bed': 0.17609125905568124,
 'dog': 0.17609125905568124,
 'sat': 0.0,
 'The': 0.0,
 'kness': 0.17609125905568124,
 'on': 0.0,
 'my': 0.0,
 'cat': 0.17609125905568124}

## 5. 计算TF-IDF

In [6]:
def computeTFIDF(tf, idfs):
    tfidf = {}
    for word, tfval in tf.items():
        tfidf[word] = tfval * idfs[word]
        
    return tfidf

tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)

pd.DataFrame([tfidfA, tfidfB])

Unnamed: 0,bed,dog,sat,The,kness,on,my,cat
0,0.029349,0.0,0.0,0.0,0.0,0.0,0.0,0.029349
1,0.0,0.029349,0.0,0.0,0.029349,0.0,0.0,0.0


# 一个相对复杂的例子

1. 几个文件PDF导出成txt的时候格式还都不一样。wtf

In [7]:
import re
import pandas as pd
import numpy as np

2. 对于第一个字符如果是特殊字符的行不进行处理

In [8]:
skipCharList = ['\n', '\r', '\t', ' ', '.', ':', 'a', 'b', 'c', 'd', 
                'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 
                'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

3. 设置总词库
   1. 列名是单词。
   2. 行名是不同的文章的名称或者对应的index。
   3. 目前是四篇paper，创建的词袋是4行。

In [9]:
WordBag = pd.DataFrame([[],[],[],[]])
# 设置行名称。
WordBag.index = pd.Series(['P1', 'P2', 'P3', 'P4'])
WordBag

P1
P2
P3
P4


4. 定义将词添加入总词袋中，并且对出现的次数进行记录的函数。

In [10]:
def AddWordToWordBag(paperName, word):
    # WordList = WordBag.columns.to_list()
    if word in skipCharList:
        return
    if word in WordBag.columns.to_list():
        WordBag.loc[paperName, word] += 1
    else:
        WordBag[word] = 0
        WordBag.loc[paperName, word] = 1


In [11]:
# 测试函数

# words = ['attention', 'NaN', 'all', 'is', 'you', 'need', 'NaN', 'NaN']

# for word in words:
#     AddWordToWordBag(1, word)
    
# print(WordBag)

5. 定义文本路径

In [12]:
# 4个文本的路径。
PaperPathList = ["../../data/TFIDF/1.txt", 
                 "../../data/TFIDF/2.txt", 
                 "../../data/TFIDF/3.txt", 
                 "../../data/TFIDF/4.txt"]

## 对第一个文档进行处理。

1. 在PDF导出为txt的时候就将所有的文字分为了逐个的单词，而且每个单词占一行。

In [13]:
# 对第一篇论文进行处理


# 逐行读取
with open(PaperPathList[0]) as f1:
    line = f1.readline()
    line = line
    
    while line:
        if line in skipCharList:
            line = f1.readline()  #读取一行文件，包括换行符
            line = line
            continue

        AddWordToWordBag('P1', re.sub(r'[^a-zA-Z]', '', line).casefold())
        
        # 读取下一行。
        line = f1.readline()  #读取一行文件，包括换行符
        line = line
        # print("-----------")
        # print(Paper1WordList)
        # print("-----------------------------------")
        
print(WordBag)
        

  


    neural  collaborative  filtering  xiangnan  helizi  liao  hanwang  zhang  \
P1      44             41          4         1       1     1        1      8   
P2       0              0          0         0       0     0        0      0   
P3       0              0          0         0       0     0        0      0   
P4       0              0          0         0       0     0        0      0   

    national  university  ...  shen  luan  discrete  start  identifying  \
P1         4           6  ...     1     2         1      1            1   
P2         0           0  ...     0     0         0      0            0   
P3         0           0  ...     0     0         0      0            0   
P4         0           0  ...     0     0         0      0            0   

    naming  attributes  tang  ding  zhou  
P1       1           1     1     1     1  
P2       0           0     0     0     0  
P3       0           0     0     0     0  
P4       0           0     0     0     0  

[4 rows

## 对第二个文本进行处理


In [14]:
with open(PaperPathList[1]) as f2:
    line = f2.readline()
    line = line
    while line:        
        # 如果存在“|”字符，那么使用“|”对字符串进行分词。不然就用"."对字符串进行分词。
        if "|" in line:
            lineWords = line.split("|")
        else:
            lineWords = line.split(".")
        # print(lineWords)
        
        # if line in skipCharList:
        #     line = f2.readline()  #读取一行文件，包括换行符
        #     line = line
        #     continue
        
        for word in lineWords:
            AddWordToWordBag('P2', re.sub(r'[^a-zA-Z]', '', word).casefold())

        line = f2.readline()  #读取一行文件，包括换行符
        line = line
        # print("--------------------------------------")

print(WordBag)

  


    neural  collaborative  filtering  xiangnan  helizi  liao  hanwang  zhang  \
P1      44             41          4         1       1     1        1      8   
P2       0              0          0         0       0     0        0      1   
P3       0              0          0         0       0     0        0      0   
P4       0              0          0         0       0     0        0      0   

    national  university  ...  forms  constructing  tegration  exten  \
P1         4           6  ...      0             0          0      0   
P2         1           2  ...      1             1          1      1   
P3         0           0  ...      0             0          0      0   
P4         0           0  ...      0             0          0      0   

    sibility  shed  light  possibly  acknowledgments  foun  
P1         0     0      0         0                0     0  
P2         1     1      1         1                1     1  
P3         0     0      0         0                0   

## 对第三个文本进行处理

In [15]:
with open(PaperPathList[2]) as f3:
    line = f3.readline()
    line = line
    while line: 
        # 用" "对字符串进行分词。
        lineWords = line.split(" ")
        
        # if line in skipCharList:
        #     line = f3.readline()  #读取一行文件，包括换行符
        #     line = line
        #     continue
            
        # print(lineWords)
        for word in lineWords:
            AddWordToWordBag('P3', re.sub(r'[^a-zA-Z]', '', word).casefold())

        line = f3.readline()  #读取一行文件，包括换行符
        line = line
        # line = line[:-1]
        # print("--------------------------------------")

print(WordBag)

  


    neural  collaborative  filtering  xiangnan  helizi  liao  hanwang  zhang  \
P1      44             41          4         1       1     1        1      8   
P2       0              0          0         0       0     0        0      1   
P3      27              0          0         0       0     0        0      3   
P4       0              0          0         0       0     0        0      0   

    national  university  ...  colors  viewed  color  \
P1         4           6  ...       0       0      0   
P2         1           2  ...       0       0      0   
P3         0           1  ...       1       1      1   
P4         0           0  ...       0       0      0   

    thelawwillneverbeperfectbutitsapplicationshouldbejustthisiswhatwearemissinginmyopinioneospad  \
P1                                                  0                                              
P2                                                  0                                              
P3                

In [16]:
# zz = Paper3WordList.columns
# print(type(zz))
# print(zz.to_list())

## 对第四个文本进行处理

In [17]:
with open(PaperPathList[3]) as f4:
    line = f4.readline()
    line = line
    while line: 
        # 用" "对字符串进行分词。
        lineWords = line.split(" ")
        
        # if line in skipCharList:
        #     line = f4.readline()  #读取一行文件，包括换行符
        #     line = line
        #     continue
            
        # print(lineWords)
        for word in lineWords:
            AddWordToWordBag('P4', re.sub(r'[^a-zA-Z]', '', word).casefold())

        line = f4.readline()  #读取一行文件，包括换行符
        line = line
        # line = line[:-1]
        # print("--------------------------------------")

print(WordBag)
# Paper4WordList.to_csv("../../data/TFIDF/4t.csv")

  


    neural  collaborative  filtering  xiangnan  helizi  liao  hanwang  zhang  \
P1      44             41          4         1       1     1        1      8   
P2       0              0          0         0       0     0        0      1   
P3      27              0          0         0       0     0        0      3   
P4       2              6          5         0       0     0        0      4   

    national  university  ...  locationcontentaware  nguyen  adapting  drift  \
P1         4           6  ...                     0       0         0      0   
P2         1           2  ...                     0       0         0      0   
P3         0           1  ...                     0       0         0      0   
P4         1           2  ...                     1       1         1      1   

    pp  shao  thalmann  jd  cy  sigspatial  
P1   0     0         0   0   0           0  
P2   0     0         0   0   0           0  
P3   0     0         0   0   0           0  
P4   1     1      

In [18]:
WordBag['TotalCount'] = WordBag.sum(axis=1)

In [19]:
# WordBag['max_value']=WordBag.max(axis=1)

In [20]:
WordBag.to_csv("../../data/TFIDF/WordBag.csv")

## 定义IFIDF存储结构。

1. 需要将WordBag的列名赋值给IFIDF

In [21]:
# WordBagColumnsNameList = WordBag.columns.to_list()
# print(WordBagColumnsNameList.remove('TotalCount'))
# IFIDFColumnsName =  WordBag.columns.to_list().remove('TotalCount')
# print(IFIDFColumnsName)
IFIDF = pd.DataFrame(index=['TFIDF1', 'TFIDF2', 'TFIDF3', 'TFIDF4'], 
                     columns=WordBag.columns.to_list())
IFIDF

Unnamed: 0,neural,collaborative,filtering,xiangnan,helizi,liao,hanwang,zhang,national,university,...,nguyen,adapting,drift,pp,shao,thalmann,jd,cy,sigspatial,TotalCount
TFIDF1,,,,,,,,,,,...,,,,,,,,,,
TFIDF2,,,,,,,,,,,...,,,,,,,,,,
TFIDF3,,,,,,,,,,,...,,,,,,,,,,
TFIDF4,,,,,,,,,,,...,,,,,,,,,,


## 计算TF

1. TF为单词出现的个数除以总的单词类别数。

In [22]:
import math
documentCount = 4

for columnName in WordBag.columns.to_list():
    # IFIDF.loc['IDF', columnName] = math.log10(documentCount/(WordBag[WordBag[columnName]>0].shape[0] + 1))
    IDF = math.log10(documentCount/(WordBag[WordBag[columnName]>0].shape[0] + 1))
    IFIDF.loc['TFIDF1', columnName] = (WordBag.loc['P1', columnName] / WordBag.loc['P1', 'TotalCount']) * IDF
    IFIDF.loc['TFIDF2', columnName] = (WordBag.loc['P2', columnName] / WordBag.loc['P2', 'TotalCount']) * IDF
    IFIDF.loc['TFIDF3', columnName] = (WordBag.loc['P3', columnName] / WordBag.loc['P3', 'TotalCount']) * IDF
    IFIDF.loc['TFIDF4', columnName] = (WordBag.loc['P4', columnName] / WordBag.loc['P4', 'TotalCount']) * IDF
    
print(IFIDF)


       neural collaborative filtering  xiangnan    helizi      liao   hanwang  \
TFIDF1    0.0      0.000615   0.00006  0.000036  0.000036  0.000036  0.000036   
TFIDF2    0.0           0.0       0.0       0.0       0.0       0.0       0.0   
TFIDF3    0.0           0.0       0.0       0.0       0.0       0.0       0.0   
TFIDF4    0.0      0.000078  0.000065       0.0       0.0       0.0       0.0   

           zhang national university  ...    nguyen  adapting     drift  \
TFIDF1 -0.000093      0.0   -0.00007  ...       0.0       0.0       0.0   
TFIDF2 -0.000018      0.0  -0.000035  ...       0.0       0.0       0.0   
TFIDF3 -0.000043      0.0  -0.000014  ...       0.0       0.0       0.0   
TFIDF4  -0.00004      0.0   -0.00002  ...  0.000031  0.000031  0.000031   

              pp      shao  thalmann        jd        cy sigspatial TotalCount  
TFIDF1       0.0       0.0       0.0       0.0       0.0        0.0   -0.09691  
TFIDF2       0.0       0.0       0.0       0.0       0.0

In [23]:
IFIDF.sort_values(by='TFIDF1', ascending=False, axis=1, inplace=True)
IFIDF.loc['TFIDF1'].T.to_csv("../../data/TFIDF/P1_IFIDF.csv")

In [24]:
IFIDF.sort_values(by='TFIDF2', ascending=False, axis=1, inplace=True)
IFIDF.loc['TFIDF2'].T.to_csv("../../data/TFIDF/P2_IFIDF.csv")

In [25]:
IFIDF.sort_values(by='TFIDF3', ascending=False, axis=1, inplace=True)
IFIDF.loc['TFIDF3'].T.to_csv("../../data/TFIDF/P3_IFIDF.csv")

In [26]:
IFIDF.sort_values(by='TFIDF4', ascending=False, axis=1, inplace=True)
IFIDF.loc['TFIDF4'].T.to_csv("../../data/TFIDF/P4_IFIDF.csv")