In [45]:
import os
import re
import math
import heapq

In [12]:
def tokenize_document(document):
    tokens = re.findall(r'\w+', document)
    return tokens

In [13]:
def extract_text(directory, filename):
    with open(os.path.join(directory, filename), 'r') as file:
        text_data = file.read()
        s_title = text_data.find("<TITLE>")
        e_title = text_data.find("</TITLE>", 7)
        
        s_text = text_data.find("<TEXT>")
        e_text = text_data.find("</TEXT>", 6)
        
        extracted_text = text_data[s_title+7:e_title].strip().lower() + "\n" + text_data[s_text+6:e_text].strip().lower()
        return extracted_text

In [14]:
def calculate_tf(tokens, TOKEN_IDs):
    tfs = {}
    for token in tokens:
        if token in tfs:
            tfs[TOKEN_IDs[token]] += 1
        else:
            tfs[TOKEN_IDs[token]] = 1
    return tfs

def calculate_idfs(TERM_TO_DOCS, TOKEN_IDs, DOCS_IDs):
    N = len(DOCS_IDs)
    idfs = {}
    for term, documents in TERM_TO_DOCS.items():
        idf = math.log(N / len(documents))
        idfs[term] = round(idf, 3)
    return idfs

def calculate_tf_idfs(TFs, IDFs):
    tf_idf = {}
    for doc in TFs:
        vector = []
        for term in TFs[doc]:
            tf = TFs[doc][term]
            idf = IDFs[term]
            vector.append(tf*idf)
        tf_idf[doc] = vector
    return tf_idf

In [15]:
DIRECTORY = '25-20240329T124513Z-001/25/'
FILES = os.listdir(DIRECTORY)

DOC_ID  = 1
TOKEN_ID = 1
DOCS_IDs = {} # dict of <DOCNO> entry to document-id
TOKEN_IDs = {} # dict of token to token-id
TERM_TO_DOCS = {}
TFs = {}
for filename in FILES:
    DOCS_IDs[DOC_ID] = filename
    document = extract_text(DIRECTORY, filename)
    tokens = tokenize_document(document)
    for token in tokens:
        if token not in TOKEN_IDs:
            TOKEN_IDs[token] = TOKEN_ID
            TERM_TO_DOCS[TOKEN_IDs[token]] = set() # unique documents
            TERM_TO_DOCS[TOKEN_IDs[token]].add(DOC_ID)
            TOKEN_ID += 1
        else:
            TERM_TO_DOCS[TOKEN_IDs[token]].add(DOC_ID)
    TFs[DOC_ID] =  calculate_tf(tokens, TOKEN_IDs)
    DOC_ID += 1

In [16]:
IDFs = calculate_idfs(TERM_TO_DOCS, TOKEN_IDs, DOCS_IDs)
# print(IDFs)

In [17]:
TF_IDF_VECTORS = calculate_tf_idfs(TFs, IDFs)
# print(TF_IDF_VECTORS)

In [48]:
def cosine_similarity(v1, v2):
    min_vect = len(v1) if len(v1) < len(v2) else len(v2)
    v1v2 = 0
    for i in range(min_vect):
        v1v2 += v1[i]*v2[i]
    norm_v1 = math.sqrt(sum( i**2 for i in v1))
    norm_v2 = math.sqrt(sum( i**2 for i in v2))

    similarity_value =  v1v2/(norm_v1*norm_v2)
    return similarity_value

def calculate_similarity(TF_IDF_VECTORS):
    min_heap = []
    N = len(TF_IDF_VECTORS)
    for doc1 in TF_IDF_VECTORS:
        for doc2 in range(doc1+1, N+1):
            value = cosine_similarity(TF_IDF_VECTORS[doc1], TF_IDF_VECTORS[doc2])
            value = round(value, 3)
            heapq.heappush(min_heap, (value, doc1, doc2))
    return min_heap

In [49]:
print(calculate_similarity(TF_IDF_VECTORS))


[(1, 2, 0.537), (1, 3, 0.435), (1, 4, 0.344), (1, 5, 0.654), (1, 6, 0.728), (1, 7, 0.275), (1, 8, 0.749), (1, 9, 0.718), (1, 10, 0.358), (1, 11, 0.784), (1, 12, 0.511), (1, 13, 0.686), (1, 14, 0.556), (1, 15, 0.567), (1, 16, 0.268), (1, 17, 0.453), (1, 18, 0.424), (1, 19, 0.776), (1, 20, 0.277), (1, 21, 0.522), (1, 22, 0.645), (1, 23, 0.693), (1, 24, 0.653), (1, 25, 0.59), (1, 26, 0.519), (1, 27, 0.418), (1, 28, 0.442), (1, 29, 0.605), (1, 30, 0.395), (1, 31, 0.466), (1, 32, 0.55), (1, 33, 0.805), (1, 34, 0.659), (1, 35, 0.432), (1, 36, 0.647), (1, 37, 0.352), (1, 38, 0.495), (1, 39, 0.514), (1, 40, 0.419), (1, 41, 0.709), (1, 42, 0.423), (1, 43, 0.754), (1, 44, 0.636), (1, 45, 0.265), (1, 46, 0.287), (1, 47, 0.415), (1, 48, 0.764), (1, 49, 0.398), (1, 50, 0.399), (1, 51, 0.591), (1, 52, 0.673), (1, 53, 0.462), (1, 54, 0.258), (1, 55, 0.602), (1, 56, 0.522), (1, 57, 0.699), (1, 58, 0.669), (1, 59, 0.722), (1, 60, 0.665), (1, 61, 0.695), (1, 62, 0.382), (1, 63, 0.468), (1, 64, 0.577), (

In [36]:
# print(TF_IDF_VECTORS)

In [None]:
# norm_cosine_vector = {}
# idfs = calculate_idfs(term_to_docs, TOKEN_IDs)
# for document in DOCS_IDs:
#     text = extract_text(directory, DOCS_IDs[document])
#     tokens = tokenize_document(text)
#     vector = []
#     for token in tokens:
#         token_id = TOKEN_IDs[token]
#         term_weight = term_frequency[token_id]*idfs[token_id]  ####replace tfs
#         vector.append(term_weight)
#     norm_cosine_vector[document] = vector

[8.93, 8.93, 522.24, 42.839999999999996, 8.93, 324.3, 8.93, 113.81, 54.8, 8.93, 534.6, 255.0, 530.55, 534.6, 320.28, 8.93, 441.87, 515.34, 342.62, 360.23999999999995, 331.57, 60.57000000000001, 522.24, 534.6, 42.839999999999996, 108.72, 30.16, 527.8, 534.6, 8.93, 324.3, 8.93, 8.93, 534.6, 8.93, 16.48, 192.6, 530.55, 8.93, 8.93, 23.490000000000002, 278.72999999999996, 82.68, 522.24, 54.8, 480.7, 113.81, 527.8, 93.3, 522.24, 8.93, 82.68, 480.7, 93.3, 8.93, 370.64, 441.87, 432.0, 527.8, 113.81, 71.83, 16.48, 36.6, 238.76, 118.6, 205.52999999999997, 8.93, 527.8, 98.56, 192.6, 165.9, 534.6, 48.86, 480.7, 527.8, 222.31, 23.490000000000002, 530.55, 8.93, 446.16, 8.93, 48.86, 534.6, 187.95000000000002, 165.9, 527.8, 8.93, 8.93, 432.0, 36.6, 108.72, 71.83, 388.02, 36.6, 8.93, 449.40000000000003, 54.8, 8.93, 8.93, 527.8, 8.93, 30.16, 60.57000000000001, 8.93, 165.9, 527.8, 342.62, 54.8, 36.6, 8.93, 23.490000000000002, 278.72999999999996, 113.81, 522.24, 54.8, 480.7, 113.81, 449.40000000000003, 53

In [None]:
# def calculate_score(query, document, id):
#     score = 0
#     tokens = tokenize_document(query)
#     for token in tokens:
#         token_id = TOKEN_IDs[token]
#         Wtd = tfs[token_id] * idfs[token_id] 
#         query_score  +=  Wtd

# query = "socialism trying products"
# calculate_score(query, document)

112.59

In [None]:
# term_idfs = {}

# for doc_id, filename in docs.items():
#     document = extract_text(directory, filename)
#     tokens = tokenize_document(document)
#     for token in tokens:
#         if token not in term_idfs:
#             term_idfs[token] = []
#             term_idfs[token].append(doc_id)
#         else:
#             term_idfs[token].append(doc_id)

In [None]:
# term_idfs

In [None]:
# if token not in dictionary:
#     dictionary[token] = []
#     dictionary[token].append(i)
# else:
#     dictionary[token].append(i)

In [None]:
# document = "This is an example document with some_underscored_words and 123numbers456."
# tokens = tokenize_document(document)
# print(tokens)

In [None]:
# term_idfs = {}

# for doc_id, filename in docs.items():
#     document = extract_text(directory, filename)
#     tokens = tokenize_document(document)
#     for token in tokens:
#         if token not in term_idfs:
#             term_idfs[token] = []
#             term_idfs[token].append(doc_id)
#         else:
#             term_idfs[token].append(doc_id)