# B09705039_HW03

In [1]:
# import module
from nltk.stem import PorterStemmer
import math
import pandas as pd
import numpy as np

In [2]:
# Read the training file.
# Each non-empty line is expected to hold a class id followed by the ids of the
# training documents of that class, separated by whitespace (presumably with a
# trailing space, which is why the original code had to drop an empty column --
# verify against training.txt). The row order defines the class index used by
# the rest of the notebook.
path = "./training.txt"
with open(path, 'r') as f:  # context manager closes the file even if read() raises
    training_raw = f.read()

# training data
# str.split() without an argument collapses repeated/trailing whitespace, so no
# empty tokens are produced and no hard-coded column index has to be deleted.
training = []
for line in training_raw.splitlines():
    tokens = line.split()
    if not tokens:  # skip blank lines, e.g. a trailing newline at end of file
        continue
    training.append(tokens[1:])  # drop the leading class id, keep the document ids
training = np.array(training).astype(int)
training

array([[  11,   19,   29,  113,  115,  169,  278,  301,  316,  317,  321,
         324,  325,  338,  341],
       [   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,   12,
          13,   14,   15,   16],
       [ 813,  817,  818,  819,  820,  821,  822,  824,  825,  826,  828,
         829,  830,  832,  833],
       [ 635,  680,  683,  702,  704,  705,  706,  708,  709,  719,  720,
         722,  723,  724,  726],
       [ 646,  751,  781,  794,  798,  799,  801,  812,  815,  823,  831,
         839,  840,  841,  842],
       [ 995,  998,  999, 1003, 1005, 1006, 1007, 1009, 1011, 1012, 1013,
        1014, 1015, 1016, 1019],
       [ 700,  730,  731,  732,  733,  735,  740,  744,  752,  754,  755,
         756,  757,  759,  760],
       [ 262,  296,  304,  308,  337,  397,  401,  443,  445,  450,  466,
         480,  513,  533,  534],
       [ 130,  131,  132,  133,  134,  135,  136,  137,  138,  139,  140,
         141,  142,  143,  145],
       [  31,   44,   70,   83,   86,

In [3]:
# testing data
# Every document id from 1 to 1095 that does not occur anywhere in the
# training matrix becomes a test document; ids stay in ascending order.
testing = [doc_id for doc_id in range(1, 1096) if doc_id not in training]

In [4]:
# All tokenized doc result
# all_doc[d - 1] holds the list of preprocessed terms of document d
# (tokenized, lowercased, stopwords removed, Porter-stemmed).
doc_amount = 1095

# Signs that can be ignored: only a few are listed here, more can be added if needed.
nonAlphanumeric = [",", "'", ";", ":", '"', "@", "!", "?", "(", ")", "[", "]", "<", ">", "=", "+", "^", "$", "~", "*", "/", "{", "}", "&", "#", "%","`", "_"]

# Stopword list. stopwords.txt is generated by nltk.
# It is read once, outside the document loop, and stored in a set so that each
# membership test is O(1) instead of a scan over the whole list.
with open('stopwords.txt', 'r') as f2:
    stop_words = set(f2.read().split())

# Stemming uses Porter's algorithm; a single stemmer is reused for all documents.
ps = PorterStemmer()

all_doc = []
for doc in range(1, doc_amount + 1):
    # Read file.
    path = "./data/" + str(doc) + ".txt"
    with open(path, 'r') as f:
        all_text = f.read()

    # Tokenization.
    # Ignorable signs become spaces, i.e. they act as token separators.
    for sign in nonAlphanumeric:
        all_text = all_text.replace(sign, " ")

    # periods: concatenate
    all_text = all_text.replace(".", "")
    # hyphens: concatenate
    all_text = all_text.replace("-", "")

    # remove digits
    all_text = ''.join([ch for ch in all_text if not ch.isdigit()])

    tokenize = all_text.split()

    # Lowercasing everything.
    lowercase = [w.lower() for w in tokenize]

    # Stopword removal.
    stopword_removed = [w for w in lowercase if w not in stop_words]

    # Stemming.
    after_stemming = [ps.stem(w) for w in stopword_removed]

    all_doc.append(after_stemming)

In [5]:
# Feature Selection
# Rank every term of the training vocabulary by its chi-square score against
# the class labels and keep the highest-scoring terms as the feature set V.

n_classes = len(training)  # one row of training document ids per class
FEATURE_COUNT = 200        # number of top-ranked terms kept as features

# ExtractVocabulary
# dict.fromkeys de-duplicates while keeping first-seen order, so ties in the
# final (stable) sort are broken exactly as with the list-based version, but
# without a linear scan of the vocabulary for every token.
V = list(dict.fromkeys(
    term
    for class_docs in training
    for doc_id in class_docs
    for term in all_doc[doc_id - 1]
))

# CountDocs
N = 0
for c in range(n_classes):
    N += len(training[c])

# Per-class document frequency: doc_freq[t][k] is the number of training
# documents of class k that contain term t. Built in one pass over the
# training documents instead of searching every document for every term.
doc_freq = {t: [0] * n_classes for t in V}
for k in range(n_classes):
    for doc_id in training[k]:
        for t in set(all_doc[doc_id - 1]):
            doc_freq[t][k] += 1

chi_dict = {}
for t in V:
    # get matrix: row = class, column 0 = docs containing t, column 1 = docs without t
    mat = np.zeros((n_classes, 2))
    for k in range(n_classes):
        mat[k][0] = doc_freq[t][k]
        mat[k][1] = len(training[k]) - doc_freq[t][k]

    # count chi-square
    presence_totals = np.sum(mat, axis=0)  # totals over classes: [present, absent]
    class_totals = np.sum(mat, axis=1)     # totals per class

    chi_sqr = 0
    for i in range(2):
        for j in range(n_classes):
            E = N * (presence_totals[i] / N) * (class_totals[j] / N)
            if E == 0:
                # A term present in every training document (or an empty class)
                # has a zero expected count; the cell contributes nothing.
                # Dividing here would give NaN and corrupt the ranking below.
                continue
            chi_sqr += (((mat[j][i] - E) ** 2) / E)

    chi_dict[t] = chi_sqr

V = sorted(chi_dict, key=chi_dict.get, reverse=True)[:FEATURE_COUNT]

In [6]:
# Training
# Multinomial Naive Bayes over the selected features V:
#   prior[c]       = fraction of training documents that belong to class c
#   condprob[c][t] = add-one smoothed relative frequency of term t in class c

n_classes = len(training)  # one row of training document ids per class

prior = []
condprob = []
for c in range(n_classes):
    # get P_c
    N_c = len(training[c])
    prior.append(N_c / N)

    # get tf of D in c
    tf_dict = dict.fromkeys(V, 0)
    for i in training[c]:
        for j in all_doc[i - 1]:
            # tf_dict has exactly the terms of V as keys, so this is the same
            # test as "j in V" but a hash lookup instead of a list scan.
            if j in tf_dict:
                tf_dict[j] += 1

    # get Pt_c
    # Denominator: total feature-term occurrences in the class plus |V| (add-one smoothing).
    tnum = sum(tf_dict.values()) + len(V)
    condprob.append({t: (tf_dict[t] + 1) / tnum for t in V})

In [7]:
# Testing
# For every test document, score each class by
#   log prior + sum of log conditional probabilities of the document's feature terms
# and predict the class with the highest score. Class labels are 1-based.

result = []
for test in testing:
    doc_terms = all_doc[test - 1]  # looked up once per document, not once per class
    score = []
    # Walk the trained classes directly instead of assuming a fixed class count.
    for class_prior, class_condprob in zip(prior, condprob):
        temp_score = 0
        temp_score += math.log(class_prior)
        for t in doc_terms:
            # Terms outside the selected feature set are ignored.
            if t in class_condprob:
                temp_score += math.log(class_condprob[t])
        score.append(temp_score)
    # np.argmax returns the first maximum, so ties go to the lower class index.
    result.append(np.argmax(score) + 1)

In [8]:
# Write to file
# One row per test document: its id and the predicted 1-based class label.
df = pd.DataFrame({"Id": testing, "Value": result})
df.to_csv('output.csv', index=False)