Prerequisite: the term generator (i.e., TextProcessor.ipynb)
### Calculate the tf-idf vectors of the documents in a corpus.
1. To generate the 'dictionary' and the per-document tf-idf vector files, call the 'constructTFIDFVectors' function.
2. To calculate the cosine similarity between two documents' tf-idf vector files, call the 'cosine' function.

### Usage
See "TFIDF_Converter_Example.ipynb"

In [None]:
%run TextPRocessor.ipynb

import os
import math

In [None]:
# File name of the saved dictionary. saveDictionary appends it directly to the
# output directory string, so that directory must end with a path separator.
dicionary_OutputFileName = "dictionary.txt"

In [None]:
def listFiles(dir_path):
    """Return the names of all files found under dir_path.

    The directory tree is walked recursively with os.walk. Only the bare
    file names are collected, so the sub-directory a file was found in is
    not part of the result.
    """
    return [name for _root, _subdirs, names in os.walk(dir_path) for name in names]

In [None]:
def saveDictionary(dictionary_list, outputDir):
    """Write the dictionary to a text file, one "index term df" record per line.

    dictionary_list holds (index, term, df) entries. The target path is
    outputDir concatenated with dicionary_OutputFileName, so outputDir must
    already end with a path separator.
    """
    targetPath = outputDir + dicionary_OutputFileName

    # The with-block closes the file on exit; no explicit close() is needed.
    with open(targetPath, "w") as opf:
        for entry in dictionary_list:
            record = " ".join((str(entry[0]), entry[1], str(entry[2])))
            opf.write(record + "\n")

    print("Generating dictionary done.")

In [None]:
def sortDictionary(dictionary_Dict):
    """Turn a {term: df} dict into a list of (index, term, df) tuples.

    Entries are ordered by term (ascending) and the index is the 0-based
    position of the term in that order.
    """
    ordered = sorted(dictionary_Dict.items(), key=lambda pair: pair[0])
    return [(position, pair[0], pair[1]) for position, pair in enumerate(ordered)]

In [None]:
def transformDictionaryListToDict(dictionaryList):
    """Re-key the (index, term, df) list as {term: (index, df)}.

    The dict form lets callers look a term up directly instead of scanning
    the list.
    """
    return {entry[1]: (entry[0], entry[2]) for entry in dictionaryList}

In [None]:
def calculateIDF(df, totalDocNumbers):
    """Return the inverse document frequency, idf = log10(N / df).

    df is the document frequency of a term and totalDocNumbers is N.
    """
    return math.log10(totalDocNumbers / df)

In [None]:
def getVectorLength(vectorTempList):
    """Return the square root of the sum of vectorTempList.

    The caller passes the already-squared components, so the result is the
    Euclidean length of the underlying vector.
    """
    return sum(vectorTempList) ** 0.5

In [None]:
def calculateTFIDF(documentTF_dict, dictionary_dict, totalDocNumbers):
    """Build the length-normalized tf-idf vector of one document.

    Args:
        documentTF_dict: mapping term -> term frequency for the document.
        dictionary_dict: mapping term -> (term index, document frequency).
        totalDocNumbers: number of documents in the corpus (N in the idf
            formula).

    Returns:
        A dict mapping term index -> tf-idf weight, divided by the Euclidean
        length of the vector. When that length is zero (every weight is
        zero, e.g. all of the document's terms occur in every document so
        idf = log10(1) = 0) the weights are returned unscaled instead of
        dividing by zero.

    Raises:
        KeyError: if a term of the document is missing from dictionary_dict.
    """
    outputDict = dict()
    for term, termFrequency in documentTF_dict.items():
        termEntry = dictionary_dict[term]  # (term index, df)
        termIndex = termEntry[0]
        df = termEntry[1]
        # tf-idf = tf * idf
        outputDict[termIndex] = termFrequency * calculateIDF(df, totalDocNumbers)

    # Normalize to unit length: sqrt of the sum of squared weights.
    vectorLength = getVectorLength([weight * weight for weight in outputDict.values()])
    if vectorLength == 0:
        # An all-zero vector cannot be normalized; keep it as is.
        return outputDict

    return {termIndex: weight / vectorLength for termIndex, weight in outputDict.items()}

In [None]:
def saveDocTFIDFInformation(docID, tfidf_info, outputDir):
    """Write one document's tf-idf vector to "<outputDir><docID>.txt".

    tfidf_info maps term index -> tf-idf value. Each line of the file is
    "index value", in ascending (index, value) order. outputDir is
    concatenated as-is, so it must already end with a path separator.
    """
    targetPath = outputDir + docID + ".txt"

    # The with-block closes the file on exit; no explicit close() is needed.
    with open(targetPath, "w") as opf:
        for termIndex, weight in sorted(tfidf_info.items()):
            opf.write(str(termIndex) + " " + str(weight) + "\n")

In [None]:
def constructTFIDFVectors(dataDir, outputDir=None):
    """Build the dictionary and the tf-idf vector file of every document.

    Every file found under dataDir (except macOS ".DS_Store") is turned into
    a term-frequency dict by getTermsDictFromDoc (not defined in this file;
    presumably provided by the TextProcessor notebook). The document
    frequency of each term is collected, the sorted dictionary is saved, and
    one "<docID>.txt" tf-idf file per document is written.

    Args:
        dataDir: directory holding the corpus documents.
        outputDir: directory for the dictionary and the tf-idf files;
            defaults to "Output/TFIDF/". Created when it does not exist.
    """
    print("Start constructing...\n")

    if outputDir is None:  # default value
        outputDir = "Output/TFIDF/"

    # The save helpers build file paths by plain string concatenation, so
    # guarantee a trailing separator (a no-op when one is already there).
    outputDir = os.path.join(outputDir, "")

    if not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    dictionary = dict()  # term -> document frequency (DF)
    tfPool = dict()      # docID -> term-frequency dict of that document

    # Drop the macOS metadata file up front so it is neither parsed nor
    # counted in the corpus size used for idf.
    docFiles = [name for name in listFiles(dataDir) if name != ".DS_Store"]

    for file in docFiles:
        fileName = file.split('.')[0]  # docID = file name up to the first dot
        terms = getTermsDictFromDoc(os.path.join(dataDir, file))
        tfPool[fileName] = terms

        # Each distinct term of this document adds one to its DF.
        for term in terms:
            dictionary[term] = dictionary.get(term, 0) + 1

    sortedDictionary = sortDictionary(dictionary)  # list of (index, term, df)
    saveDictionary(sortedDictionary, outputDir)
    dictionaryDict = transformDictionaryListToDict(sortedDictionary)  # term -> (index, df)
    docTotalNumbers = len(docFiles)  # N: only the documents actually processed

    for docID, termFrequencies in tfPool.items():
        doc_tfidf_Info = calculateTFIDF(termFrequencies, dictionaryDict, docTotalNumbers)
        saveDocTFIDFInformation(docID, doc_tfidf_Info, outputDir)

In [None]:
def _readTFIDFFile(path):
    """Parse a tf-idf file ("<term index> <tf-idf>" per line) into {index: value}."""
    vector = dict()
    with open(path, "r") as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            vector[int(fields[0])] = float(fields[1])
    return vector


def cosine(documentX, documentY):
    """Return the cosine similarity of two tf-idf vector files.

    Args:
        documentX: path of the first tf-idf file.
        documentY: path of the second tf-idf file.

    Returns:
        0 when either vector contains only zeros, otherwise
        1 - scipy.spatial.distance.cosine of the two aligned vectors.

    Raises:
        ValueError: if a line holds a non-numeric index or value.
        IndexError: if a non-blank line has fewer than two fields.
    """
    docX_dict = _readTFIDFFile(documentX)
    docY_dict = _readTFIDFFile(documentY)

    # Align both sparse vectors on a common, dense range of term indices.
    listX, listY = prepareVectorsForConsine(docX_dict, docY_dict)

    # Cosine similarity is undefined for a zero vector; score it as 0.
    if checkAllZero(listX) or checkAllZero(listY):
        return 0

    # Imported here so the heavy dependencies are only loaded when needed.
    import numpy as np
    import scipy.spatial.distance as distance

    arrDocX = np.array(listX)
    arrDocY = np.array(listY)
    return 1 - distance.cosine(arrDocX, arrDocY)

In [None]:
def checkAllZero(docList):
    """Return True when no entry of docList differs from 0.

    An empty list therefore also yields True.
    """
    return not any(entry != 0 for entry in docList)

In [None]:
def prepareVectorsForConsine(docX_dict, docY_dict):
    """Expand two sparse {term index: tf-idf} dicts into aligned dense lists.

    Both lists cover every integer index from the smallest to the largest
    term index present in either document; an index a document has no
    record for is filled with 0. This wastes memory on wide index ranges,
    but keeps the two vectors position-aligned.

    Args:
        docX_dict: {term index: tf-idf} of the first document.
        docY_dict: {term index: tf-idf} of the second document.

    Returns:
        (listX, listY), two lists of equal length. Both are empty when
        neither document has any term; when only one document is empty its
        list is all zeros over the other document's index range.
    """
    allIndices = docX_dict.keys() | docY_dict.keys()
    if not allIndices:
        # Nothing to align; the original code raised IndexError here.
        return [], []

    indexRange = range(min(allIndices), max(allIndices) + 1)
    listX = [docX_dict.get(num, 0) for num in indexRange]
    listY = [docY_dict.get(num, 0) for num in indexRange]
    return listX, listY