In [1]:
%run TextPRocessor.ipynb

import os
import math

In [2]:
# Notebook-wide path configuration.
outputPath = ""  # NOTE(review): not referenced by any cell visible in this notebook
outputFile = "Output/classifyResult.txt"  # text file the classification results are written to
docDirPath = "IRTM/"  # document directory; paths are built by plain string concatenation, so keep the trailing slash
trainingDataLabel = "Data/training.txt"  # labelled training list: each line is a class ID followed by its doc IDs

In [3]:
def extractVocabularyFromDocs(docList):
    """Pool term frequencies over several documents.

    Each entry of ``docList`` is handed to ``getTermsDictFromDoc`` (defined
    outside this notebook section; expected to return a term -> tf mapping).

    Returns a dict mapping each term to the sum of its frequencies over all
    documents in ``docList``.
    """
    tfPool = {}
    for docPath in docList:
        perDocFreq = getTermsDictFromDoc(docPath)
        for term, tf in perDocFreq.items():
            # accumulate when the term was seen before, otherwise start its tally
            tfPool[term] = tfPool[term] + tf if term in tfPool else tf
    return tfPool

def extractVocabularyFromSingleDoc(doc):
    """Return the term -> tf mapping that ``getTermsDictFromDoc`` produces for ``doc``."""
    termFreqDict = getTermsDictFromDoc(doc)
    return termFreqDict

def getTotalVocabularyCountFromDocs(termFreqOfDocs_dict):
    """Return the total token count, i.e. the sum of all frequencies in the mapping."""
    frequencies = termFreqOfDocs_dict.values()
    return sum(frequencies)

def readTrainingDataInfos(labeledDocListPath):
    """Read the labelled training list and index it by class.

    Each non-blank line is expected to hold a class ID followed by the IDs of
    the documents belonging to that class, separated by whitespace.

    Args:
        labeledDocListPath: path of the labelled training file.

    Returns:
        (classCountDict, classDocsDict) where classCountDict maps the integer
        class ID to its number of documents and classDocsDict maps it to the
        list of document IDs (kept as strings, exactly as read).

    Raises:
        ValueError: if the first field of a non-blank line is not an integer.
    """
    classCountDict = dict()
    classDocsDict = dict()
    with open(labeledDocListPath, 'r') as f:
        for line in f:
            # split() without a separator collapses runs of whitespace, so
            # doubled or trailing blanks no longer produce empty doc IDs
            items = line.split()
            if not items:  # blank line (e.g. trailing newline at end of file)
                continue
            classID = int(items[0])
            docIDs = items[1:]
            classCountDict[classID] = len(docIDs)
            classDocsDict[classID] = docIDs
    return classCountDict, classDocsDict

def countAllDocs(classCountDict):
    """Return the number of documents over all classes (sum of the per-class counts)."""
    perClassCounts = classCountDict.values()
    return sum(perClassCounts)

In [4]:
def getClassDocPaths(classDocs, docDirPath):
    """Return the file paths of the given documents.

    Each path is ``docDirPath`` + document ID + ``'.txt'``; the pieces are
    concatenated as-is, so ``docDirPath`` must already end with a separator.
    """
    docPaths = []
    for docID in classDocs:
        fileName = str(docID) + '.txt'
        docPaths.append(docDirPath + fileName)
    return docPaths

In [5]:
# Parse the labelled training list into per-class document counts and per-class doc-ID lists.
classCountDict, classDocsDict = readTrainingDataInfos(trainingDataLabel)
totalDocsCount = countAllDocs(classCountDict) # total number of training documents over all classes

## Feature Selection
Select the most discriminative terms for each class using the chi-square test.

In [6]:
%run FeatureSelection.ipynb

In [7]:
# --- Collect the vocabulary of every training document, grouped by class ---
dictionarySet = set() # dictionary of corpus
classTerms_dict = {classID: [] for classID in classDocsDict.keys()} # k=classID, value=list of tuple(docID, termList)

for classID, docList in classDocsDict.items():
    # build the paths with the shared helper instead of re-implementing the concatenation
    for docID, docPath in zip(docList, getClassDocPaths(docList, docDirPath)):
        termList = list(extractVocabularyFromSingleDoc(docPath).keys())
        dictionarySet.update(termList)
        classTerms_dict[classID].append((docID, termList))
classTermChiSquare_dict = {classID: [] for classID in classTerms_dict.keys()}

# --- Pre-compute document frequencies once, instead of rescanning every
# --- document's term list for every (term, class) pair ---
classDocTotals = {classID: len(docTermTupleList)
                  for classID, docTermTupleList in classTerms_dict.items()}
corpusDocTotal = sum(classDocTotals.values())

classDocFreq = {classID: {} for classID in classTerms_dict.keys()} # k=classID, value={term: number of class docs containing term}
for classID, docTermTupleList in classTerms_dict.items():
    docFreq = classDocFreq[classID]
    for _docID, termList in docTermTupleList:
        for term in set(termList): # presence per document, not term frequency
            docFreq[term] = docFreq.get(term, 0) + 1

specialTerm = set() # terms present in every training document (no "absent" cell to score)

for term in dictionarySet:
    presentInCorpus = sum(docFreq.get(term, 0) for docFreq in classDocFreq.values())

    for onTopicClass in classTerms_dict.keys():
        # 2x2 contingency table: rows = on topic / off topic, columns = present / absent
        onPresent = classDocFreq[onTopicClass].get(term, 0)
        onAbsent = classDocTotals[onTopicClass] - onPresent
        offPresent = presentInCorpus - onPresent
        offAbsent = (corpusDocTotal - classDocTotals[onTopicClass]) - offPresent
        termTable = [[onPresent, onAbsent], [offPresent, offAbsent]]

        if onAbsent == 0 and offAbsent == 0:
            specialTerm.add(term)
            continue
        chisquareVal = getChiSquare(termTable) # indexed as [0] = score, [1] = positive-indicator flag
        if chisquareVal[1] == True: # positive indicator of that class
            classTermChiSquare_dict[onTopicClass].append((term, chisquareVal[0])) # append term and score

In [8]:
print(specialTerm) # terms present in every training document, which the chi-square scoring skipped (empty in the recorded run)

set()


In [9]:
# Sanity check: list the class IDs that received a chi-square term list.
classTermChiSquare_dict.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])

In [10]:
# Feature-selection knobs (previously inline magic numbers).
CHI_SQUARE_THRESHOLD = 49  # keep only terms whose chi-square score is strictly above this value
MAX_TERMS_PER_CLASS = 50   # at most this many top-ranked terms are taken from each class

significantTerms = set()
for classID, chisquareList in classTermChiSquare_dict.items():
    # rank the class's (term, score) pairs from highest to lowest score
    termRankList = sorted(chisquareList, key=lambda x: x[1], reverse=True)
    topTerms = [term for term, score in termRankList if score > CHI_SQUARE_THRESHOLD]
    # slicing already copes with lists shorter than the cap
    significantTerms.update(topTerms[:MAX_TERMS_PER_CLASS])

In [11]:
# print(len(significantTerms))
# print(significantTerms)

In [12]:
# Hand-picked terms that are dropped from the selected features.
freakTerms = {'6-1', '6-2', '6-3', '6-4', '7-5', '7-6', 'el', 'pa', 'ho'}
# Set difference builds a new set; the recorded run is left with 500 terms.
significantTerms = significantTerms.difference(freakTerms)
print("Terms after feature selection:", len(significantTerms))

Terms after feature selection: 500


# Naive Bayes Classifier Training 
Calculate the prior probability and the conditional probabilities for each class.

In [13]:
%run NaiveBayesClassifier.ipynb

In [14]:
# Train the classifier: one prior and one conditional-probability table per class.
nbClassifier = NaiveBayesClassifier()

for classID, classDocs in classDocsDict.items():
    # prior = share of the training documents that belong to this class
    priorProb = classCountDict[classID] / totalDocsCount
    docPaths = getClassDocPaths(classDocs, docDirPath)

    # pooled term frequencies of the class, restricted to the selected features
    pooledTermFreq = extractVocabularyFromDocs(docPaths)
    termFreqOfDocs_dict = {
        term: freq
        for term, freq in pooledTermFreq.items()
        if term in significantTerms
    }

    totalTokenCount = getTotalVocabularyCountFromDocs(termFreqOfDocs_dict) # tokens of selected terms in the class
    smoothTermCount = len(significantTerms) # number of distinct selected terms
    smoothedTotal = totalTokenCount + smoothTermCount

    # add-one smoothing: (tf + 1) / (class tokens + number of selected terms)
    condProb = {
        term: (termOccurFreq + 1) / smoothedTotal
        for term, termOccurFreq in termFreqOfDocs_dict.items()
    }
    # fallback entry looked up at test time for any term missing from this table
    condProb['OtherTermNotInTrainingData'] = 1 / smoothedTotal

    nbClassifier.updateClassPriorProbability(classID, priorProb)
    nbClassifier.updateClassConditionProbability(classID, condProb)

# Naive Bayes Classifier Testing

In [15]:
def getLogScore(prob):
    """Return the natural logarithm of ``prob`` (``math.log`` raises for non-positive input)."""
    logScore = math.log(prob)
    return logScore

def addScoreWithFrequency(scoreBuff, score, freq):
    """Add ``score`` to ``scoreBuff`` once per occurrence and return the total.

    ``freq`` is the number of additions; zero or a negative value leaves
    ``scoreBuff`` unchanged.
    """
    runningTotal = scoreBuff
    for _ in range(freq):
        # one addition per occurrence, in order, so float rounding stays unchanged
        runningTotal += score
    return runningTotal

In [16]:
def updateClassScoreRecord(classScoreList, classID, score):
    """Append the ``(classID, score)`` pair to ``classScoreList`` in place."""
    record = (classID, score)
    classScoreList.append(record)
        
def getResultClass(classScoreList):
    """Return the class ID of the highest-scoring ``(classID, score)`` record.

    The list is sorted in place from highest to lowest score as a side effect;
    among equal scores the record that came first keeps precedence. An empty
    list raises ``IndexError``.
    """
    def scoreOf(record):
        return record[1]

    classScoreList.sort(key=scoreOf, reverse=True)
    bestClass, _bestScore = classScoreList[0][0], classScoreList[0][1]
    return bestClass

In [17]:
def getTestingDocPaths(trainindDataList, docDirPath):
    """Return the paths of all entries in ``docDirPath`` that are not training documents.

    Args:
        trainindDataList: IDs of the training documents; each ID is converted
            with ``str()`` (as ``getClassDocPaths`` does), so integer IDs work too.
        docDirPath: directory to list. It is concatenated with each entry name
            as-is, so it must end with a path separator.

    Returns:
        A list of ``docDirPath + entry`` for every directory entry whose name is
        not ``<training id>.txt``, in the order ``os.listdir`` reports them.
        Hidden files and sub-directories are not filtered out here.
    """
    # a set gives constant-time membership tests instead of a list scan per entry
    trainingFileNames = {str(train) + ".txt" for train in trainindDataList}
    return [docDirPath + entry
            for entry in os.listdir(docDirPath)
            if entry not in trainingFileNames]

In [18]:
# Every document in docDirPath that is not listed as training data is a test document.
trainingDataList = []
for docList in classDocsDict.values():
    trainingDataList.extend(docList)
testSet = getTestingDocPaths(trainingDataList, docDirPath)

resultList = [] # list of tuple(docID, predicted classID)
for docPath in testSet:
    docFileName = os.path.basename(docPath)

    # Skip filesystem/editor artefacts (.DS_Store, .ipynb_checkpoints, ...) and anything
    # that is not a <docID>.txt document, whatever directory docDirPath points to.
    if docFileName.startswith('.') or not docFileName.endswith('.txt'):
        continue
    docID = os.path.splitext(docFileName)[0]

    docTermFreqDict = extractVocabularyFromSingleDoc(docPath) # get vocabulary from testing doc
    classes = nbClassifier.getAllClass() # get classes in classifier
    classScoreList = list()

    for c in classes:
        # log-space score: log prior + sum over terms of tf * log conditional probability
        scoreOfClass = getLogScore(nbClassifier.getClassPriorProbability(c))

        condProb_dict = nbClassifier.getClassConditionProbability(c)
        unseenTermProb = condProb_dict['OtherTermNotInTrainingData']
        # NOTE(review): terms outside the selected feature set also fall back to
        # unseenTermProb, which differs per class - confirm this is intended
        # rather than ignoring such terms at test time.
        for term, occurFreq in docTermFreqDict.items():
            condProb = condProb_dict.get(term, unseenTermProb)
            condProbScore = getLogScore(condProb) # get conditional prob log score
            scoreOfClass = addScoreWithFrequency(scoreOfClass, condProbScore, occurFreq)

        updateClassScoreRecord(classScoreList, c, scoreOfClass) # update each class's score
    resultList.append((docID, getResultClass(classScoreList)))

### Output Classify Result

In [19]:
resultList.sort(key=lambda x: int(x[0])) # sort by numeric docID

# Create the output directory on a fresh checkout so open() does not fail.
outputDir = os.path.dirname(outputFile)
if outputDir:
    os.makedirs(outputDir, exist_ok=True)

with open(outputFile, 'w') as handle: # one "docID class" line per test document
    for docID, predictedClass in resultList:
        handle.write("%-5s %15s\n" % (docID, predictedClass))