In [None]:
%run TextPRocessor.ipynb

import os
import math

In [None]:
# outputPath = ""
# NOTE(review): outputFile is not referenced in the cells visible here — confirm where it is used.
outputFile = ".txt"
# Prefix that gets "<docID>.txt" appended by plain string concatenation,
# so the trailing "/" is required.
docDirPath = "IRTM/"
# Labelled training file handed to countDocsInClass: one line per class,
# "<classID> <doc> <doc> ...".
trainingDataLabel = "Data/training.txt"

In [None]:
# list all files in given directory
def listFiles(dir_path):
    """Collect the names of every file found under ``dir_path``.

    The directory tree is walked recursively with ``os.walk``. Only the bare
    file names are returned (no directory component), so two files with the
    same name in different sub-directories both appear in the result.

    Args:
        dir_path: root directory to walk.

    Returns:
        list of file-name strings, in the order ``os.walk`` produces them.
    """
    return [
        fileName
        for _root, _subDirs, fileNames in os.walk(dir_path)
        for fileName in fileNames
    ]

In [None]:
def extractVocabularyFromDocs(docList):
    """Pool the term frequencies of all documents in ``docList``.

    Args:
        docList: iterable of document references accepted by
            ``getTermsDictFromDoc`` (this notebook passes file paths).

    Returns:
        dict mapping each term to the sum of its frequencies over every
        document in ``docList``; empty when ``docList`` is empty.
    """
    pooledFreq = {}
    for docItem in docList:
        # getTermsDictFromDoc yields the per-document mapping term -> tf
        for token, count in getTermsDictFromDoc(docItem).items():
            pooledFreq[token] = pooledFreq.get(token, 0) + count
    return pooledFreq

def extractVocabularyFromSingleDoc(doc):
    """Return the term -> term-frequency mapping of a single document.

    Thin wrapper that forwards ``doc`` unchanged to ``getTermsDictFromDoc``
    (defined outside this notebook section) and returns its result.
    """
    return getTermsDictFromDoc(doc) # key=Term; value=tf

def getTotalVocabularyCountFromDocs(termFreqOfDocs_dict):
    """Return the total number of term occurrences in a term -> freq mapping.

    Args:
        termFreqOfDocs_dict: dict whose values are term frequencies.

    Returns:
        The sum of all values; 0 for an empty mapping.
    """
    tokenTotal = 0
    for freq in termFreqOfDocs_dict.values():
        tokenTotal = tokenTotal + freq
    return tokenTotal


def countDocsInClass(labeledDocListPath):
    """Parse the labelled training file into per-class counts and doc lists.

    Every non-blank line is read as ``<classID> <doc> <doc> ...`` with
    fields separated by any run of whitespace. Blank lines are ignored, so a
    trailing empty line no longer aborts the parse, and repeated separators
    no longer inflate the document count with empty entries.

    Args:
        labeledDocListPath: path of the text file to read.

    Returns:
        A ``(classCountDict, classDocsDict)`` tuple:
        ``classCountDict`` maps int class id -> number of docs on that line;
        ``classDocsDict`` maps int class id -> list of doc ids (strings).
        If a class id appears on several lines, the last line wins.

    Raises:
        ValueError: if the first field of a non-blank line is not an integer.
        OSError: if the file cannot be opened.
    """
    classCountDict = dict()
    classDocsDict = dict()
    with open(labeledDocListPath, 'r') as f:
        for line in f:  # stream the file instead of materialising all lines
            items = line.split()  # splits on any whitespace, drops empty fields
            if not items:
                continue  # blank / whitespace-only line
            classID = int(items[0])  # the class_id
            docsOfClass = items[1:]  # what docs belong to the class
            classCountDict[classID] = len(docsOfClass)
            classDocsDict[classID] = docsOfClass

    return classCountDict, classDocsDict

def countAllDocs(classCountDict):
    """Return the number of documents over all classes.

    Args:
        classCountDict: dict mapping class id -> number of docs in that class.

    Returns:
        The sum of the per-class counts; 0 for an empty mapping.
    """
    docTotal = 0
    for classKey in classCountDict:
        docTotal = docTotal + classCountDict[classKey]
    return docTotal

In [None]:
# return a list of doc path
def getClassDocPaths(classDocs, docDirPath):
    """Build the ``.txt`` file path of every document in ``classDocs``.

    Paths are assembled with ``os.path.join`` so the result is correct
    whether or not ``docDirPath`` ends with a path separator. When it does
    end with one, the output is identical to plain concatenation.

    Args:
        classDocs: iterable of document ids (anything ``str()`` can render).
        docDirPath: directory that contains the ``<docID>.txt`` files.

    Returns:
        list of path strings, one per document, in input order.
    """
    return [os.path.join(docDirPath, str(doc) + '.txt') for doc in classDocs]

In [None]:
# Parse the labelled training file into the per-class doc counts and the
# per-class lists of doc ids.
classCountDict, classDocsDict = countDocsInClass(trainingDataLabel)
# Total number of labelled training docs, summed over all classes.
totalDocsCount = countAllDocs(classCountDict)

In [None]:
%run NaiveBayesClassifier.ipynb

In [None]:
nbClassifier = NaiveBayesClassifier() # initialize classifier

# Pass 1: gather the pooled term frequencies of every class and build the
# vocabulary shared by all classes. Add-one smoothing must use the size of
# this shared vocabulary; using each class's own distinct-term count makes the
# per-class distributions incomparable.
classTermFreq_dict = dict() # key=classID; value={term: summed tf over the class docs}
vocabulary = set()
for classID, classDocs in classDocsDict.items(): # iterate each classID
    docPaths = getClassDocPaths(classDocs, docDirPath)

    # get term and term occur freq from docs in class. (k=term, v=freq)
    termFreqOfDocs_dict = extractVocabularyFromDocs(docPaths)

    classTermFreq_dict[classID] = termFreqOfDocs_dict
    vocabulary.update(termFreqOfDocs_dict) # adds the dict's keys (terms)

vocabularySize = len(vocabulary) # how many different terms over ALL classes

# Pass 2: prior and smoothed conditional probabilities per class.
for classID, termFreqOfDocs_dict in classTermFreq_dict.items():
    priorProb = classCountDict[classID] / totalDocsCount # count prior prob of Naive Bayes

    totalTokenCount = getTotalVocabularyCountFromDocs(termFreqOfDocs_dict) # how many tokens in class
    smoothedDenominator = totalTokenCount + vocabularySize

    # Every vocabulary term gets a probability in every class; a term the
    # class never saw receives the smoothed floor 1 / smoothedDenominator,
    # so a lookup for any training-vocabulary term succeeds for every class.
    condProb = {
        term: (termFreqOfDocs_dict.get(term, 0) + 1) / smoothedDenominator
        for term in vocabulary
    }

    nbClassifier.updateClassPriorProbability(classID, priorProb) # update class prior prob
    nbClassifier.updateClassConditionProbability(classID, condProb) # update class cond prob

In [None]:
# nbClassifier.getClassPriorProbability(2)

In [None]:
# nbClassifier.getClassConditionProbability(2)

In [None]:
def getLogScore(prob):
    """Return the natural logarithm of ``prob``.

    Args:
        prob: a positive number (a probability in this notebook).

    Returns:
        ``math.log(prob)`` as a float.

    Raises:
        ValueError: propagated from ``math.log`` when ``prob`` is not positive.
    """
    logScore = math.log(prob)
    return logScore

def addScoreWithFrequency(scoreBuff, score, freq):
    """Add ``score`` to ``scoreBuff`` ``freq`` times and return the result.

    Computed as a single multiply-add rather than ``freq`` repeated
    additions: constant time, and it avoids the rounding error that builds
    up when a float is added over and over.

    Args:
        scoreBuff: running score accumulated so far.
        score: value contributed by one occurrence.
        freq: number of occurrences; zero or negative leaves ``scoreBuff``
            unchanged.

    Returns:
        ``scoreBuff + score * freq`` when ``freq`` is positive, otherwise
        ``scoreBuff`` unchanged.
    """
    if freq <= 0:
        return scoreBuff
    return scoreBuff + score * freq

In [None]:
def updateClassScoreRecord(classScoreList, classID, score):
    """Append a ``(classID, score)`` record to ``classScoreList`` in place.

    Args:
        classScoreList: list that collects the per-class records.
        classID: identifier of the class being recorded.
        score: score computed for that class.

    Returns:
        None; the list passed in is mutated.
    """
    record = (classID, score)
    classScoreList.append(record)
        
def getResultClass(classScoreList):
    """Return the class id of the highest-scoring record.

    Side effects: ``classScoreList`` is sorted in place by score, highest
    first (records with equal scores keep their relative order), and the
    sorted list is printed.

    Args:
        classScoreList: list of ``(classID, score)`` records.

    Returns:
        The ``classID`` of the first record after sorting.

    Raises:
        IndexError: if ``classScoreList`` is empty.
    """
    def scoreOf(record):
        return record[1]

    classScoreList.sort(key=scoreOf, reverse=True)
    print(classScoreList)
    return classScoreList[0][0]

In [None]:
# Classify one document: score it against every class known to nbClassifier
# and print the id of the best-scoring class.
docID = 5  # id of the document to classify; its file is "<docDirPath><docID>.txt"
docPath = docDirPath + str(docID) + '.txt'
docTermFreqDict = extractVocabularyFromSingleDoc(docPath)  # key=term; value=tf in this doc
classes = nbClassifier.getAllClass()

classScoreList = list()  # collects one (classID, score) record per class
for c in classes:
    scoreOfClass = 0
    
    priorProb = nbClassifier.getClassPriorProbability(c)
    priorScore = getLogScore(priorProb) # get prior log score
    
    # the prior contributes exactly once
    scoreOfClass = addScoreWithFrequency(scoreOfClass, priorScore, 1)
    
    condProb_dict = nbClassifier.getClassConditionProbability(c)
    for term, occurFreq in docTermFreqDict.items():
        # a term with no stored probability for this class adds nothing to the score
        if term not in condProb_dict.keys(): continue
        condProb = condProb_dict[term]
        condProbScore = getLogScore(condProb)
        # the term's log-probability is counted once per occurrence in the doc
        scoreOfClass = addScoreWithFrequency(scoreOfClass, condProbScore, occurFreq)
        
    updateClassScoreRecord(classScoreList, c, scoreOfClass) # update class score

# getResultClass also prints the score list, sorted highest first
print(getResultClass(classScoreList))