# 简单TF-IDF示例

In [1]:
import numpy as np
import pandas as pd

## 1. 定义数据和预处理

In [2]:
# m没有使用标点符号。
docA = "The cat sat on my bed"
docB = "The dog sat on my kness"

# 先分词。生成词袋。
bowA = docA.split(" ")
bowB = docB.split(" ")

# 构建完整词库，取并集。
wordSet = set(bowA).union(set(bowB)) 

## 2. 进行词数统计

In [3]:
# 用统计字典来保存词出现的次数。
wordDictA = dict.fromkeys(wordSet, 0)
wordDictB = dict.fromkeys(wordSet, 0)

# 遍历文档，统计词数。
for word in bowA:
    wordDictA[word] += 1
for word in bowB:
    wordDictB[word] += 1

pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,cat,sat,dog,kness,The,my,on,bed
0,1,1,0,0,1,1,1,1
1,0,1,1,1,1,1,1,0


## 3. 计算TF

In [4]:
def computeTF(wordDict, bow):
    # 用一个字典对象记录TF
    tfDict = {}
    nbowCount = len(bow)
    
    for word,count in wordDict.items():
        # print(word, count,)
        tfDict[word] = count / nbowCount
    
    return tfDict

tfA = computeTF(wordDictA, bowA)
tfB = computeTF(wordDictB, bowB)

tfA

{'cat': 0.16666666666666666,
 'sat': 0.16666666666666666,
 'dog': 0.0,
 'kness': 0.0,
 'The': 0.16666666666666666,
 'my': 0.16666666666666666,
 'on': 0.16666666666666666,
 'bed': 0.16666666666666666}

## 4. 计算逆文档频率

In [5]:
import math

def computeIDF(wordDictList):
    # 用一个字典对象来保存idf结果，每个词作为key，初始值为0。
    idfDict = dict.fromkeys(wordDictList[0], 0)
    N = len(wordDictList)
    
    for wordDict in wordDictList:
        # 遍历字典中的每个词汇，统计Ni
        for word,count in wordDict.items():
            if count > 0 :
                # 先把Ni增加1，存入到idfDict
                idfDict[word] += 1
    
    # 已经得到所有词汇i对应的Ni，现在更加公式把它替换为idf值。
    for word, ni in idfDict.items():
        idfDict[word] = math.log10((N+1)/(ni+1))
    
    return idfDict

idfs = computeIDF([wordDictA, wordDictB])
idfs

{'cat': 0.17609125905568124,
 'sat': 0.0,
 'dog': 0.17609125905568124,
 'kness': 0.17609125905568124,
 'The': 0.0,
 'my': 0.0,
 'on': 0.0,
 'bed': 0.17609125905568124}

## 5. 计算TF-IDF

In [6]:
def computeTFIDF(tf, idfs):
    tfidf = {}
    for word, tfval in tf.items():
        tfidf[word] = tfval * idfs[word]
        
    return tfidf

tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)

pd.DataFrame([tfidfA, tfidfB])

Unnamed: 0,cat,sat,dog,kness,The,my,on,bed
0,0.029349,0.0,0.0,0.0,0.0,0.0,0.0,0.029349
1,0.0,0.0,0.029349,0.029349,0.0,0.0,0.0,0.0


# 一个相对复杂的例子

1. 几个文件PDF导出成txt的时候格式还都不一样。wtf

In [1]:
import re
import pandas as pd
import numpy as np

2. 对于第一个字符如果是特殊字符的行不进行处理

In [37]:
skipCharList = ['\n', '\r', '\t', ' ', '.', ':']

3. 设置总词库
   1. 列名是单词。
   2. 行名是不同的文章的名称或者对应的index。
   3. 目前是四篇paper，创建的词袋是4行。

In [68]:
WordBag = pd.DataFrame([[],[],[],[]])

In [69]:
WordBag

0
1
2
3


4. 将词添加如总词袋中，并且对出现的次数进行记录。

In [58]:
def AddWordToWordBag(paperName, word):
    # WordList = WordBag.columns.to_list()
    if word in WordBag.columns.to_list():
        WordBag.loc[paperName, word] += 1
    else:
        WordBag[word] = 0
        WordBag.loc[paperName, word] = 1
    pass

In [62]:
# 测试函数

# words = ['attention', 'NaN', 'all', 'is', 'you', 'need', 'NaN', 'NaN']

# for word in words:
#     AddWordToWordBag(1, word)
    
# print(WordBag)

   attention  NaN  all  is  you  need
0          0    0    0   0    0     0
1          1    3    1   1    1     1
2          0    0    0   0    0     0
3          0    0    0   0    0     0


## 对第一个文档进行处理。

1. 在PDF导出为txt的时候就将所有的文字分为了逐个的单词，而且每个单词占一行。

In [46]:
# 对第一篇论文进行处理

Paper1WordList = pd.DataFrame(columns=["word", "count", "TF", "IDF"])
Paper1WordList['word'] = Paper1WordList['word'].astype(np.str)
Paper1WordList['count'] = Paper1WordList['count'].astype(np.int)
Paper1WordList['TF'] = Paper1WordList['TF'].astype(np.float)
Paper1WordList['IDF'] = Paper1WordList['IDF'].astype(np.float)

# 4个文本的路径。
PaperPathList = ["../../data/TFIDF/1.txt", 
                 "../../data/TFIDF/2.txt", 
                 "../../data/TFIDF/3.txt", 
                 "../../data/TFIDF/4.txt"]
# 逐行读取
with open("../../data/TFIDF/1.txt") as f1:
    line = f1.readline()
    line = line
    
    while line:
        # 用于记录暂存下来查找过程中的循环次数。
        i = 0
        # 是否找到的标签。
        flag = False
        # 存储作为是否遍历完成的判断数。
        rowNumber = Paper1WordList.shape[0]
        # print("rowNumber: {}".format(rowNumber))
        
        if line in skipCharList:
            line = f1.readline()  #读取一行文件，包括换行符
            line = line
            continue

        # 对记录缓存进行遍历。
        for index,row in Paper1WordList.iterrows():
            if re.sub(r'[^a-zA-Z]', '', row['word']).casefold() == re.sub(r'[^a-zA-Z]', '', line).casefold():
                # 修改对应单词的计数。
                Paper1WordList.loc[index,'count'] += 1
                # 找了结束遍历，并且将标识设置为True。
                flag = True
                break
            # 统计循环次数。
            i += 1
        
        # print("i: {0} rowNumber: {1}".format(i, rowNumber))
        # 由循环次数是否等于总的缓存行数和是否没有找到作为判断条件来添加新的单词。
        if i == rowNumber and flag == False:
            Paper1WordList = Paper1WordList.append({'word':re.sub(r'[^a-zA-Z]', '', line).casefold(), 'count':1}, ignore_index=True)
            # print("Append")
        # 对标识使用完毕并初始化。
        flag = False
        
        # 读取下一行。
        line = f1.readline()  #读取一行文件，包括换行符
        line = line
        # print("-----------")
        # print(Paper1WordList)
        # print("-----------------------------------")
        
print(Paper1WordList)
# Paper1.to_csv("1.csv")
        

KeyboardInterrupt: 

## 对第二个文本进行处理


In [None]:
# Paper2WordList = pd.DataFrame(columns=["word", "count", "TF", "IDF"])
# Paper2WordList['word'] = Paper2WordList['word'].astype(np.str)
# Paper2WordList['count'] = Paper2WordList['count'].astype(np.int)
# Paper2WordList['TF'] = Paper2WordList['TF'].astype(np.float)
# Paper2WordList['IDF'] = Paper2WordList['IDF'].astype(np.float)

# with open("../../data/TFIDF/2t.txt") as f2:
#     line = f2.readline()
#     line = line
#     while line:        
#         # 如果存在“|”字符，那么使用“|”对字符串进行分词。不然就用"."对字符串进行分词。
#         if "|" in line:
#             lineWords = line.split("|")
#         else:
#             lineWords = line.split(".")
#         # print(lineWords)
        
#         if line in skipCharList:
#             line = f2.readline()  #读取一行文件，包括换行符
#             line = line
#             continue
        
#         for word in lineWords:
#             # 用于记录暂存下来查找过程中的循环次数。
#             i = 0
#             # 是否找到的标签。
#             flag = False
#             # 存储作为是否遍历完成的判断数。
#             rowNumber = Paper2WordList.shape[0]
#             for index,row in Paper2WordList.iterrows():
#                 if re.sub(r'[^a-zA-Z]', '', row['word']).casefold() == re.sub(r'[^a-zA-Z]', '', word).casefold():
#                     Paper2WordList.loc[index,'count'] += 1
#                     # 找了结束遍历，并且将标识设置为True。
#                     flag = True
#                     break
#                 # 统计循环次数。
#                 i += 1
#             # print(i, rowNumber)
#             if i == rowNumber and flag == False:
#                 # print(word)
#                 Paper2WordList = Paper2WordList.append({'word':re.sub(r'[^a-zA-Z]', '', word).casefold(), 'count':1}, ignore_index=True)
#             # print("Append")
#             # 对标识使用完毕并初始化。
#             flag = False
#             i = 0

#         line = f2.readline()  #读取一行文件，包括换行符
#         line = line
#         # print("--------------------------------------")

# print(Paper2WordList)
# # Paper2WordList.to_csv("../../data/TFIDF/2t.csv")

                word  count  TF  IDF
0     contextualized      1 NaN  NaN
1    pointofinterest      2 NaN  NaN
2     recommendation      6 NaN  NaN
3                        41 NaN  NaN
4               peng      1 NaN  NaN
..               ...    ...  ..  ...
198             time      1 NaN  NaN
199            equal      1 NaN  NaN
200     contribution      1 NaN  NaN
201   ycorresponding      1 NaN  NaN
202           author      1 NaN  NaN

[203 rows x 4 columns]


In [None]:
with open("../../data/TFIDF/2t.txt") as f2:
    line = f2.readline()
    line = line
    while line:        
        # 如果存在“|”字符，那么使用“|”对字符串进行分词。不然就用"."对字符串进行分词。
        if "|" in line:
            lineWords = line.split("|")
        else:
            lineWords = line.split(".")
        # print(lineWords)
        
        if line in skipCharList:
            line = f2.readline()  #读取一行文件，包括换行符
            line = line
            continue
        
        for word in lineWords:
            AddWordToWordBag(3-1, re.sub(r'[^a-zA-Z]', '', word).casefold())

        line = f2.readline()  #读取一行文件，包括换行符
        line = line
        # print("--------------------------------------")

print(WordBag)

## 对第三个文本进行处理

In [None]:
# Paper3WordList = pd.DataFrame(columns=["word", "count", "TF", "IDF"])
# Paper3WordList['word'] = Paper3WordList['word'].astype(np.str)
# Paper3WordList['count'] = Paper3WordList['count'].astype(np.int)
# Paper3WordList['TF'] = Paper3WordList['TF'].astype(np.float)
# Paper3WordList['IDF'] = Paper3WordList['IDF'].astype(np.float)

# with open("../../data/TFIDF/3t.txt") as f3:
#     line = f3.readline()
#     line = line
#     while line: 
#         # 用" "对字符串进行分词。
#         lineWords = line.split(" ")
        
#         if line in skipCharList:
#             line = f3.readline()  #读取一行文件，包括换行符
#             line = line
#             continue
            
#         # print(lineWords)
#         for word in lineWords:
#             # 用于记录暂存下来查找过程中的循环次数。
#             i = 0
#             # 是否找到的标签。
#             flag = False
#             # 存储作为是否遍历完成的判断数。
#             rowNumber = Paper3WordList.shape[0]
            
#             AddWordToWordBag(3, re.sub(r'[^a-zA-Z]', '', row['word']).casefold())
#             for index,row in Paper3WordList.iterrows():
#                 if re.sub(r'[^a-zA-Z]', '', row['word']).casefold() == re.sub(r'[^a-zA-Z]', '', word).casefold():
#                     Paper3WordList.loc[index,'count'] += 1
#                     # 找了结束遍历，并且将标识设置为True。
#                     flag = True
#                     break
#                 # 统计循环次数。
#                 i += 1
#             # print(i, rowNumber)
#             if i == rowNumber and flag == False:
#                 # print(word)
#                 Paper3WordList = Paper3WordList.append({'word':re.sub(r'[^a-zA-Z]', '', word).casefold(), 'count':1}, ignore_index=True)
#             # print("Append")
#             # 对标识使用完毕并初始化。
#             flag = False
#             i = 0

#         line = f3.readline()  #读取一行文件，包括换行符
#         line = line
#         # line = line[:-1]
#         # print("--------------------------------------")

# print(Paper3WordList)
# # Paper3WordList.to_csv("../../data/TFIDF/3t.csv")

In [70]:
with open("../../data/TFIDF/3t.txt") as f3:
    line = f3.readline()
    line = line
    while line: 
        # 用" "对字符串进行分词。
        lineWords = line.split(" ")
        
        if line in skipCharList:
            line = f3.readline()  #读取一行文件，包括换行符
            line = line
            continue
            
        # print(lineWords)
        for word in lineWords:
            AddWordToWordBag(3-1, re.sub(r'[^a-zA-Z]', '', word).casefold())

        line = f3.readline()  #读取一行文件，包括换行符
        line = line
        # line = line[:-1]
        # print("--------------------------------------")

print(WordBag)
# WordBag.to_csv("../../data/TFIDF/wb3.csv")

   attention  is  all  you  need      arxivv  cscl  dec  ashish  ...  zwork  \
0          0   0    0    0     0   0       0     0    0       0  ...      0   
1          0   0    0    0     0   0       0     0    0       0  ...      0   
2          0   0    0    0     0   0       0     0    0       0  ...      0   
3          5   2    1    1     1  69       1     1    1       2  ...      1   

   st  conference  information  processing  systems  nips  beach  ca  usa  
0   0           0            0           0        0     0      0   0    0  
1   0           0            0           0        0     0      0   0    0  
2   0           0            0           0        0     0      0   0    0  
3   1           1            1           1        1     1      1   1    1  

[4 rows x 232 columns]


  


In [None]:
zz = Paper3WordList.columns
print(type(zz))
print(zz.to_list())

<class 'pandas.core.indexes.base.Index'>
['word', 'count', 'TF', 'IDF']


## 对第四个文本进行处理

In [None]:
Paper4WordList = pd.DataFrame(columns=["word", "count", "TF", "IDF"])
Paper4WordList['word'] = Paper4WordList['word'].astype(np.str)
Paper4WordList['count'] = Paper4WordList['count'].astype(np.int)
Paper4WordList['TF'] = Paper4WordList['TF'].astype(np.float)
Paper4WordList['IDF'] = Paper4WordList['IDF'].astype(np.float)

with open("../../data/TFIDF/4t.txt") as f4:
    line = f4.readline()
    line = line
    while line: 
        # 用" "对字符串进行分词。
        lineWords = line.split(" ")
        
        if line in skipCharList:
            line = f4.readline()  #读取一行文件，包括换行符
            line = line
            continue
            
        # print(lineWords)
        for word in lineWords:
            # 用于记录暂存下来查找过程中的循环次数。
            i = 0
            # 是否找到的标签。
            flag = False
            # 存储作为是否遍历完成的判断数。
            rowNumber = Paper4WordList.shape[0]
            for index,row in Paper4WordList.iterrows():
                if re.sub(r'[^a-zA-Z]', '', row['word']).casefold() == re.sub(r'[^a-zA-Z]', '', word).casefold():
                    Paper4WordList.loc[index,'count'] += 1
                    # 找了结束遍历，并且将标识设置为True。
                    flag = True
                    break
                # 统计循环次数。
                i += 1
            # print(i, rowNumber)
            if i == rowNumber and flag == False:
                # print(word)
                Paper4WordList = Paper4WordList.append({'word':re.sub(r'[^a-zA-Z]', '', word).casefold(), 'count':1}, ignore_index=True)
            # print("Append")
            # 对标识使用完毕并初始化。
            flag = False
            i = 0

        line = f4.readline()  #读取一行文件，包括换行符
        line = line
        # line = line[:-1]
        # print("--------------------------------------")

print(Paper4WordList)
Paper4WordList.to_csv("../../data/TFIDF/4t.csv")

                word  count  TF  IDF
0           learning      2 NaN  NaN
1                        69 NaN  NaN
2         graphbased      2 NaN  NaN
3                poi      1 NaN  NaN
4          embedding      5 NaN  NaN
..               ...    ...  ..  ...
366               li      1 NaN  NaN
367             lore      1 NaN  NaN
368       exploiting      1 NaN  NaN
369  recommendations      1 NaN  NaN
370       sigspatial      1 NaN  NaN

[371 rows x 4 columns]


## 计算TF

1. TF为单词出现的个数除以总的单词类别数。

In [None]:
def CalculateTF(ariseCount, totalWordCategoryCount):
    return ariseCount/totalWordCategoryCount

def CalculateIDF():
    pass