1. 先抓每篇文章的terms
2. 把每篇文章的terms frequency(TF)偷記下來後面用
3. 把所有Terms存在一個大Dict
4. 大Dict裡面要有index, term內容, document frequency(DF)
5. 弄cosine similarity

In [None]:
%run TextPRocessor.ipynb

import os
import math

In [None]:
# Directory (with trailing separator) that receives every generated file;
# the save functions build their paths by plain string concatenation.
outputDir = "Output/"
# File name, inside outputDir, of the saved term dictionary.
# NOTE: the misspelled identifier is kept because saveDictionary reads it.
dicionary_OutputFileName = "dictionary.txt"

# exist_ok=True creates the directory only when it is missing, without the
# separate "check, then create" step that can race with another process.
os.makedirs(outputDir, exist_ok=True)

In [None]:
def listFiles(dir_path):
    """Return the bare names of all files found under ``dir_path``.

    The directory tree is walked recursively with ``os.walk``; only the file
    names are collected, so the sub-directory a file lives in is not part of
    the result. A path that cannot be walked yields an empty list.
    """
    return [
        name
        for _root, _subdirs, names in os.walk(dir_path)
        for name in names
    ]

In [None]:
# Persist the indexed dictionary as a plain-text file.
def saveDictionary(dictionary_list):
    """Write the dictionary to ``outputDir + dicionary_OutputFileName``.

    Every entry of ``dictionary_list`` is read as ``(index, term, df)`` and
    written, in the given order, as one line ``"<index> <term> <df>"``.
    An existing file is overwritten.
    """
    targetPath = outputDir + dicionary_OutputFileName
    with open(targetPath, "w") as outFile:
        for entry in dictionary_list:
            fields = (str(entry[0]), entry[1], str(entry[2]))
            outFile.write(" ".join(fields))
            outFile.write("\n")
    # The with-statement closes the file; no explicit close() is required.

In [None]:
# Order the dictionary by term (ascending) and attach a running index
# to every entry.

def sortDictionary(dictionary_Dict):
    """Return ``(index, term, df)`` tuples ordered by term.

    ``dictionary_Dict`` maps each term to its document frequency. The terms
    are sorted ascending and numbered from 0 in that order.
    """
    orderedTerms = sorted(dictionary_Dict)
    return [
        (position, word, dictionary_Dict[word])
        for position, word in enumerate(orderedTerms)
    ]

In [None]:
# Re-key the indexed dictionary list by term so that lookups by term are
# direct dict accesses.
def transformDictionaryListToDict(dictionaryList):
    """Convert ``(index, term, df)`` entries into ``{term: (index, df)}``.

    If a term occurs more than once, the last entry wins.
    """
    return {entry[1]: (entry[0], entry[2]) for entry in dictionaryList}

In [None]:
def calculateIDF(df, totalDocNumbers):
    """Return the inverse document frequency ``log10(totalDocNumbers / df)``.

    ``df`` must be non-zero; a zero value raises ``ZeroDivisionError``.
    """
    return math.log10(totalDocNumbers / df)

In [None]:
def calculateTFIDF(documentTF_dict, dictionary_dict, totalDocNumbers):
    """Compute the TF-IDF weight of every term of one document.

    Args:
        documentTF_dict: maps term -> term frequency in the document.
        dictionary_dict: maps term -> ``(index, df)``; every term of the
            document must be present, otherwise ``KeyError`` is raised.
        totalDocNumbers: number of documents, forwarded to ``calculateIDF``.

    Returns:
        A dict mapping the term's dictionary index to ``tf * idf``.
    """
    tfidfByIndex = {}
    for word, count in documentTF_dict.items():
        entry = dictionary_dict[word]  # (index, df)
        tfidfByIndex[entry[0]] = count * calculateIDF(entry[1], totalDocNumbers)
    return tfidfByIndex

In [None]:
def saveDocTFIDFInformation(docID, tfidf_info):
    """Write one document's TF-IDF weights to ``outputDir + docID + ".txt"``.

    ``tfidf_info`` maps term index -> weight. One line ``"<index> <tfidf>"``
    is written per term, ordered by ascending index. An existing file is
    overwritten.
    """
    targetPath = outputDir + docID + ".txt"
    with open(targetPath, "w") as outFile:
        # Sorting the (index, weight) pairs orders them by index.
        for termIndex, weight in sorted(tfidf_info.items()):
            outFile.write(f"{termIndex} {weight}\n")
    # The with-statement closes the file; no explicit close() is required.

In [None]:
def constructTFIDFVectors(dataDir):
    """Build the term dictionary and save a TF-IDF file for every document.

    Steps:
      1. Read the terms of every file listed by ``listFiles(dataDir)`` and
         count, per term, the number of documents it occurs in (DF).
      2. Sort and index the dictionary, then save it with ``saveDictionary``.
      3. Compute each document's TF-IDF weights and save them with
         ``saveDocTFIDFInformation`` under the document's ID.

    Args:
        dataDir: directory holding the documents. A trailing path separator
            is optional because paths are built with ``os.path.join``.

    NOTE(review): ``getTermsDict`` is not defined in this file; presumably it
    comes from the notebook loaded via ``%run`` and returns a dict mapping
    term -> term frequency. Confirm against that notebook.
    NOTE(review): ``listFiles`` returns bare file names from every
    sub-directory, so only files located directly in ``dataDir`` resolve to a
    valid path here.
    """
    dictionary = dict()  # term -> document frequency (DF)
    tfPool = dict()  # docID -> term-frequency dict of that document

    fileList = listFiles(dataDir)
    for file in fileList:
        fileName = file.split('.')[0]  # text before the first dot is the docID
        # os.path.join also works when dataDir lacks a trailing separator,
        # where plain concatenation would build a wrong path.
        terms = getTermsDict(os.path.join(dataDir, file))
        tfPool[fileName] = terms

        # Each distinct term of this document adds one to its DF.
        for term in terms:
            dictionary[term] = dictionary.get(term, 0) + 1

    sortedDictionary = sortDictionary(dictionary)  # list of (index, term, df)
    saveDictionary(sortedDictionary)
    dictionaryDict = transformDictionaryListToDict(sortedDictionary)
    docTotalNumbers = len(fileList)  # total number of documents

    for docID, documentTF in tfPool.items():
        doc_tfidf_Info = calculateTFIDF(documentTF, dictionaryDict, docTotalNumbers)
        saveDocTFIDFInformation(docID, doc_tfidf_Info)

In [None]:
def _readTFIDFFile(path):
    """Parse a saved TF-IDF file into a dict ``{term index: weight}``.

    Each non-blank line is expected to hold ``"<index> <tfidf>"`` separated
    by whitespace; blank lines are skipped.
    """
    vector = dict()
    with open(path, "r") as handle:
        for line in handle:
            fields = line.split()
            if not fields:  # tolerate blank / trailing empty lines
                continue
            vector[int(fields[0])] = float(fields[1])
    return vector


def cosine(documentX, documentY):
    """Return the cosine similarity of two saved TF-IDF documents.

    Args:
        documentX: path of the first TF-IDF file.
        documentY: path of the second TF-IDF file.

    Returns:
        ``1 - scipy.spatial.distance.cosine(x, y)`` for the two vectors, or
        ``0`` when either document has no entries or only zero weights
        (the similarity is undefined for a zero vector).
    """
    docX_dict = _readTFIDFFile(documentX)
    docY_dict = _readTFIDFFile(documentY)

    # A document without any term has no direction; report no similarity
    # instead of failing while aligning the vectors.
    if not docX_dict or not docY_dict:
        return 0

    # Align both sparse vectors on the same contiguous index range.
    listX, listY = checkEmptyEntry(docX_dict, docY_dict)
    if checkAllZero(listX) or checkAllZero(listY):
        return 0

    import numpy as np
    import scipy.spatial.distance as distance
    return 1 - distance.cosine(np.array(listX), np.array(listY))

In [None]:
def checkAllZero(docList):
    """Return True when no entry of ``docList`` differs from 0.

    An empty list therefore counts as all-zero.
    """
    return not any(value != 0 for value in docList)

In [None]:
def checkEmptyEntry(docX_dict, docY_dict):
    """Expand two sparse vectors into dense lists over a shared index range.

    Args:
        docX_dict: maps integer term index -> weight for the first document.
        docY_dict: maps integer term index -> weight for the second document.

    Returns:
        ``(listX, listY)``: two equally long lists covering every index from
        the smallest to the largest key found in either dict; an index that is
        missing from a dict contributes 0. Both lists are empty when both
        dicts are empty.
    """
    allIndices = set(docX_dict) | set(docY_dict)
    if not allIndices:
        # Nothing to align; avoids indexing into an empty key list.
        return [], []

    indexRange = range(min(allIndices), max(allIndices) + 1)
    listX = [docX_dict.get(num, 0) for num in indexRange]
    listY = [docY_dict.get(num, 0) for num in indexRange]
    return listX, listY

In [None]:
# Example run: cosine similarity between the saved TF-IDF files of
# documents 1 and 2.
# NOTE(review): saveDocTFIDFInformation writes to outputDir ("Output/"),
# whereas these paths point to "Output/TFIDF/" — confirm the files were
# moved there, or that the paths are still correct.
docX = "Output/TFIDF/1.txt"
docY = "Output/TFIDF/2.txt"
cosine(docX, docY)