In [10]:
import re
import math
import numpy as np
import nltk
from string import digits
from nltk.stem import PorterStemmer
from os import listdir

In [11]:
FILE_PATH = "./data/"

ps = PorterStemmer()

r = open("stopwords.txt")
stopwords = r.read()

In [12]:
def preprocessing(file):
    # Preprocessing
    doc = file.replace("\s+"," ").replace("\n", "").replace("\r\n", "")
    doc = re.sub(r"[^\w\s]", "", doc)
    doc = re.sub(r"[0-9]", "", doc)
    doc = doc.replace("_", "")

    # Tokenization and Lowercasing
    tokenization = [word.lower() for word in doc.split(" ")]

    # Stemming using Porter's Algorithm
    stemming = [ps.stem(word) for word in tokenization]

    # Stopword Removal
    result = [word for word in stemming if word not in stopwords]

    return result

In [13]:
def count_tf_df(doc_set):
    tf_all = list()
    df_all = dict()

    for document in doc_set:
        doc_id, doc = document

        token_list = preprocessing(doc)
        
        tf = dict()
        for term in token_list:
            if term in tf:
                tf[term] += 1
            else:
                tf[term] = 1
        tf_all.append([doc_id, tf])

        for term in tf:
            if term in df_all:
                df_all[term] += 1
            else:
                df_all[term] = 1

    df_all = dict(sorted(df_all.items(), key=lambda x: x[0]))

    term_index = dict()
    index = 0
    for term in df_all:
        term_index[term] = index
        index += 1

    return tf_all, df_all, term_index    

In [14]:
def tf_vec(tf_list, t_index):
    tf_vector = list()

    for pair in tf_list:
        doc_id, tf = pair
        vec = np.array([0] * len(t_index), dtype=float)
        
        for word in tf:
            vec[t_index[word]] = tf[word]
        
        tf_vector.append([doc_id, vec])
        
    return tf_vector

In [15]:
def tf_idf_vec(tf_vector, df_list, t_index):
    
    idf_vector = np.array([0] * len(t_index), dtype=float)

    N = len(tf_vector)
   
    for word, df in df_list.items():
        idf = math.log(N / df, 10)
        idf_vector[t_index[word]] = idf

    tf_idf_vectors = list()
    for vec in tf_vector:
        index = vec[0]
        tf_idf = vec[1] * idf_vector
        tf_idf_unit = tf_idf / np.linalg.norm(tf_idf)
        tf_idf_vectors.append([index, tf_idf_unit])
        
    return tf_idf_vectors


In [16]:
def get_vector(doc_id, t_index):
    vector = np.array([0] * len(t_index), dtype=float)
    with open(f"./output/{doc_id}.txt") as f:
        row = 0
        for line in f:
            if row > 1:
                index, tf_idf = [x.strip() for x in re.split(r'\t+', line)]
                vector[int(index)] = tf_idf
            row += 1
    return vector

def cosine(doc_x, doc_y):
    vector_x = get_vector(doc_x, t_index)
    vector_y = get_vector(doc_y, t_index)
    cosine_sim = float(np.dot(vector_x, vector_y))
    return cosine_sim

In [17]:
files = listdir(FILE_PATH)
files.sort(key=lambda x: int(x[:-4]))
doc_set = list()

for file in files:
    with open(FILE_PATH + file, "r") as f:
        document_id = str(file)[:-4]
        document = f.read()
        doc_set.append([document_id, document])

tf_list, df_list, t_index = count_tf_df(doc_set)

with open("dictionary.txt", "w") as f:
    f.write("t_index\tterm\tdf\n")
    for term in df_list:
        index = t_index[term]
        key = term
        df = df_list[term]
        f.write(f"{index}\t{key}\t{df}\n")

tf_vector = tf_vec(tf_list, t_index)
tf_idf_vector = tf_idf_vec(tf_vector, df_list, t_index)

for vector in tf_idf_vector:
    doc_id, vec_list = vector
    terms_num = np.count_nonzero(vec_list)
    with open(f"./output/{doc_id}.txt", "w") as f:
        f.write(f"{terms_num}\n")
        f.write("t_index\ttf-idf\n")
        for i in range(len(vec_list)):
            if vec_list[i] != 0:
                f.write(f"{i}\t{vec_list[i]}\n")

In [18]:
print("cosine similarity of doc 1 and doc 2 is", cosine("1", "2"))

cosine similarity of doc 1 and doc 2 is  0.19986585359571019
