In [2]:
import os
import re
import math
import heapq

In [3]:
DIRECTORY = '25-20240329T124513Z-001/25/'

In [4]:
def tokenize_document(document):
    tokens = re.findall(r'\w+', document)
    return tokens

In [5]:
def extract_text(directory, filename):
    with open(os.path.join(directory, filename), 'r') as file:
        text_data = file.read()
        s_title = text_data.find("<TITLE>")
        e_title = text_data.find("</TITLE>", 7)
        
        s_text = text_data.find("<TEXT>")
        e_text = text_data.find("</TEXT>", 6)
        
        extracted_text = text_data[s_title+7:e_title].strip().lower() + "\n" + text_data[s_text+6:e_text].strip().lower()
        return extracted_text

In [6]:
def calculate_tf(tokens, TOKEN_IDs):
    tfs = {}
    for token in tokens:
        if token in tfs:
            tfs[TOKEN_IDs[token]] += 1
        else:
            tfs[TOKEN_IDs[token]] = 1
    return tfs

def calculate_idfs(TERM_TO_DOCS, TOKEN_IDs, DOCS_IDs):
    N = len(DOCS_IDs)
    idfs = {}
    for term, documents in TERM_TO_DOCS.items():
        idf = math.log(N / len(documents))
        idfs[term] = round(idf, 3)
    return idfs

def calculate_tf_idfs(TFs, IDFs):
    tf_idf = {}
    for doc in TFs:
        vector = []
        for term in TFs[doc]:
            tf = TFs[doc][term]
            idf = IDFs[term]
            vector.append(tf*idf)
        tf_idf[doc] = vector
    return tf_idf



In [7]:
def main(DIRECTORY):
    FILES = os.listdir(DIRECTORY)

    DOC_ID  = 1
    TOKEN_ID = 1
    DOCS_IDs = {} # dict of <DOCNO> entry to document-id
    TOKEN_IDs = {} # dict of token to token-id
    TERM_TO_DOCS = {}
    TFs = {}
    for filename in FILES:
        DOCS_IDs[DOC_ID] = filename
        document = extract_text(DIRECTORY, filename)
        tokens = tokenize_document(document)
        for token in tokens:
            if token not in TOKEN_IDs:
                TOKEN_IDs[token] = TOKEN_ID
                TERM_TO_DOCS[TOKEN_IDs[token]] = set() # unique documents
                TERM_TO_DOCS[TOKEN_IDs[token]].add(DOC_ID)
                TOKEN_ID += 1
            else:
                TERM_TO_DOCS[TOKEN_IDs[token]].add(DOC_ID)
        TFs[DOC_ID] =  calculate_tf(tokens, TOKEN_IDs)
        DOC_ID += 1
    
    return DOCS_IDs, TOKEN_IDs, TFs, TERM_TO_DOCS

In [8]:
DOCS_IDs, TOKEN_IDs, TFs, TERM_TO_DOCS = main(DIRECTORY)
IDFs = calculate_idfs(TERM_TO_DOCS, TOKEN_IDs, DOCS_IDs)
TF_IDF_VECTORS = calculate_tf_idfs(TFs, IDFs)

In [9]:
def calculate_cosine_similarity(v1, v2):
    min_vect = len(v1) if len(v1) < len(v2) else len(v2)
    v1v2 = 0
    for i in range(min_vect):
        v1v2 += v1[i]*v2[i]
    norm_v1 = math.sqrt(sum( i**2 for i in v1))
    norm_v2 = math.sqrt(sum( i**2 for i in v2))

    similarity_value =  v1v2/(norm_v1*norm_v2)
    return similarity_value

In [10]:

def calculate_similarity(TF_IDF_VECTORS):
    max_heap = []
    N = len(TF_IDF_VECTORS)
    for doc1 in TF_IDF_VECTORS:
        for doc2 in range(doc1+1, N+1):
            value = calculate_cosine_similarity(TF_IDF_VECTORS[doc1], TF_IDF_VECTORS[doc2])
            value = round(value, 3)
            heapq.heappush(max_heap, (-value, doc1, doc2))
    return max_heap

In [11]:
def display():
    pairewiseSimilarity = calculate_similarity(TF_IDF_VECTORS)
    for value, doc1, doc2 in pairewiseSimilarity:
        # print(DOCS_IDs[doc1] + " -> "+ DOCS_IDs[doc2] + ":  " + str(-value))
        print(str(doc1) + " -> "+ str(doc2) + ":  " + str(-value))
display()


28 -> 88:  0.846
19 -> 65:  0.82
9 -> 116:  0.821
6 -> 69:  0.81
23 -> 69:  0.814
28 -> 40:  0.811
87 -> 130:  0.803
119 -> 130:  0.806
43 -> 122:  0.805
24 -> 36:  0.806
61 -> 87:  0.813
8 -> 43:  0.802
73 -> 96:  0.799
8 -> 96:  0.8
96 -> 116:  0.797
1 -> 33:  0.805
40 -> 85:  0.8
44 -> 93:  0.8
48 -> 65:  0.789
50 -> 88:  0.799
11 -> 122:  0.802
6 -> 9:  0.798
59 -> 114:  0.805
13 -> 130:  0.782
68 -> 109:  0.796
69 -> 106:  0.796
74 -> 116:  0.799
16 -> 45:  0.791
81 -> 116:  0.793
34 -> 87:  0.795
8 -> 120:  0.795
103 -> 109:  0.795
38 -> 80:  0.795
9 -> 23:  0.798
41 -> 116:  0.799
21 -> 100:  0.778
43 -> 120:  0.791
21 -> 39:  0.755
21 -> 103:  0.778
6 -> 8:  0.789
22 -> 44:  0.794
23 -> 92:  0.796
5 -> 36:  0.8
6 -> 96:  0.792
57 -> 74:  0.795
6 -> 116:  0.795
24 -> 104:  0.803
61 -> 119:  0.782
13 -> 92:  0.778
27 -> 28:  0.792
67 -> 79:  0.795
31 -> 80:  0.792
69 -> 116:  0.793
74 -> 81:  0.788
15 -> 56:  0.769
79 -> 100:  0.771
81 -> 130:  0.782
1 -> 120:  0.78
33 -> 73:  0.

In [12]:
# norm_cosine_vector = {}
# idfs = calculate_idfs(term_to_docs, TOKEN_IDs)
# for document in DOCS_IDs:
#     text = extract_text(directory, DOCS_IDs[document])
#     tokens = tokenize_document(text)
#     vector = []
#     for token in tokens:
#         token_id = TOKEN_IDs[token]
#         term_weight = term_frequency[token_id]*idfs[token_id]  ####replace tfs
#         vector.append(term_weight)
#     norm_cosine_vector[document] = vector

In [13]:
# def calculate_score(query, document, id):
#     score = 0
#     tokens = tokenize_document(query)
#     for token in tokens:
#         token_id = TOKEN_IDs[token]
#         Wtd = tfs[token_id] * idfs[token_id] 
#         query_score  +=  Wtd

# query = "socialism trying products"
# calculate_score(query, document)

In [14]:
# term_idfs = {}

# for doc_id, filename in docs.items():
#     document = extract_text(directory, filename)
#     tokens = tokenize_document(document)
#     for token in tokens:
#         if token not in term_idfs:
#             term_idfs[token] = []
#             term_idfs[token].append(doc_id)
#         else:
#             term_idfs[token].append(doc_id)

In [15]:
# term_idfs

In [16]:
# if token not in dictionary:
#     dictionary[token] = []
#     dictionary[token].append(i)
# else:
#     dictionary[token].append(i)

In [17]:
# document = "This is an example document with some_underscored_words and 123numbers456."
# tokens = tokenize_document(document)
# print(tokens)

In [18]:
# term_idfs = {}

# for doc_id, filename in docs.items():
#     document = extract_text(directory, filename)
#     tokens = tokenize_document(document)
#     for token in tokens:
#         if token not in term_idfs:
#             term_idfs[token] = []
#             term_idfs[token].append(doc_id)
#         else:
#             term_idfs[token].append(doc_id)