## Process the document and get Terms
#### Usage: call getTermsDict(documentPath). It returns a dict mapping each term to its term frequency (TF): {term1: TF1, term2: TF2, ...}

#### getTermsDict() performs 1. tokenization, 2. lowercasing, 3. stopword and number removal, and 4. stemming

In [None]:
# Default stopword file; getStopwordList() reads it when no file is passed in.
# The file is read line by line and each line is whitespace-stripped on load.
stopwordFile = "stopword_list.txt" # stopword dictionary (relative path)

In [None]:
# tokenize by whitespace
def doTokenize(context):
    """Split raw text into a list of non-empty tokens.

    Line breaks are treated as word separators, a fixed set of punctuation
    marks is deleted, the text is split on whitespace, and every token is
    cut at its first '.' so that a trailing sentence dot disappears
    (e.g. "world." -> "world").

    NOTE: because a token is cut at its FIRST dot, everything after an
    inner dot is dropped as well ("3.14" -> "3", "U.S." -> "U").

    Args:
        context: the text to tokenize (str).

    Returns:
        A list of token strings in document order; empty tokens are omitted.
    """
    # A line break separates words. Deleting it outright would glue the last
    # word of one line to the first word of the next one.
    context = context.replace('\n', ' ')

    # Delete punctuation, in this fixed order. '--' is a two-character
    # sequence, so the single dash of a hyphenated word is kept.
    for mark in (',', '?', '"', "'", '%', '`', '--', '(', ')', '\\'):
        context = context.replace(mark, '')

    # split() without an argument splits on any run of whitespace (spaces,
    # tabs, ...) and never yields empty strings.
    # Keep only the part before the first dot of each vocabulary item.
    tokens = [token.split('.')[0] for token in context.split()]

    # Cutting at the dot can leave an empty string (e.g. the token ".").
    return [token for token in tokens if token]

In [None]:
def doLowercase(token):
    """Return the lowercase form of `token` (via its `lower()` method)."""
    lowered = token.lower()
    return lowered

In [None]:
# Tell whether the given token is a stopword
def isStopword(token, stopword_set):
    """Return True when `token` is contained in `stopword_set`, else False.

    The comparison is a plain membership test, so it is case-sensitive.
    """
    found = token in stopword_set
    return found

In [None]:
# use porter's algorithm to do stemming
%run PorterStemmer.ipynb
def doStemming(token):
    """Return the stem of `token` as computed by PorterStemmer.

    A new PorterStemmer is created for each call and asked to stem the
    whole token, i.e. the index range 0 .. len(token) - 1.
    """
    stemmer = PorterStemmer()
    first_index = 0
    last_index = len(token) - 1
    return stemmer.stem(token, first_index, last_index)

In [None]:
# write the result to a txt file
def saveResult(tokenSet, outputFileName):
    """Write every element of `tokenSet` to a text file, one per line.

    The target path is `outputDirPath + outputFileName` (plain string
    concatenation, no separator is inserted). `outputDirPath` is a global
    that is not defined in this part of the notebook.
    NOTE(review): confirm it is set before this function is called.

    Args:
        tokenSet: iterable of strings to write.
        outputFileName: file name appended to `outputDirPath`.
    """
    target_path = outputDirPath + outputFileName
    # Mode "w" truncates an existing file; the with-block closes the file.
    with open(target_path, "w") as out:
        out.writelines(item + "\n" for item in tokenSet)

In [None]:
def getStopwordList(userDefinedSWFile=None):
    """Load a stopword file and return its entries as a set.

    Args:
        userDefinedSWFile: path of the stopword file to read. When None,
            the module-level `stopwordFile` path is used instead.

    Returns:
        A set holding every line of the file with surrounding whitespace
        stripped. A blank line in the file therefore contributes the empty
        string to the set.
    """
    source_path = stopwordFile if userDefinedSWFile is None else userDefinedSWFile
    with open(source_path, "r") as handle:
        # Iterating the file yields the same lines readlines() would return.
        return {line.strip() for line in handle}

In [None]:
# check if term is number
def is_number(string):
    """Return True when `string` is a numeric literal, else False.

    A value counts as a number when float() can parse it (e.g. "42",
    "-3.5", "1e5"). The special spellings "nan", "inf" and "infinity"
    (any letter case, optional sign) are rejected: float() accepts them,
    but in a text document they are ordinary words, not numbers.

    Args:
        string: the value to test, normally a str token.

    Returns:
        bool: True for numeric input, False otherwise.
    """
    try:
        float(string)
    except ValueError:
        return False

    if isinstance(string, str):
        # float() tolerates surrounding whitespace and a leading sign.
        word = string.strip().lstrip("+-").lower()
        if word in ("nan", "inf", "infinity"):
            return False
    return True

In [None]:
def getTermsDict(documentPath):
    """Read a document and return the term frequency of each of its terms.

    Steps: the file is tokenized with doTokenize(), every token is
    lowercased, tokens that are stopwords (per getStopwordList()) or that
    is_number() accepts are skipped, and the remaining tokens are stemmed
    with doStemming() and counted.

    Args:
        documentPath: path of the text file to process.

    Returns:
        dict mapping each stemmed term to the number of times it occurs.
    """
    with open(documentPath, "r") as doc_file:
        text = doc_file.read()

    stopwords = getStopwordList()  # default stopword file

    term_counts = {}
    for raw_token in doTokenize(text):
        lowered = doLowercase(raw_token)

        # Skip stopwords first; the number check only runs for the rest.
        if isStopword(lowered, stopwords):
            continue
        if is_number(str(lowered)):
            continue

        stem = doStemming(lowered)
        # Count occurrences of the stem within this document.
        term_counts[stem] = term_counts.get(stem, 0) + 1

    return term_counts