In [1]:
import os
import re
import math
import heapq

In [2]:
DIRECTORY = '25-20240329T124513Z-001/25/'

In [3]:
def tokenize_document(document):
    tokens = re.findall(r'\w+', document)
    return tokens

In [4]:
def extract_text(directory, filename):
    with open(os.path.join(directory, filename), 'r') as file:
        text_data = file.read()
        s_title = text_data.find("<TITLE>")
        e_title = text_data.find("</TITLE>", 7)
        
        s_text = text_data.find("<TEXT>")
        e_text = text_data.find("</TEXT>", 6)
        
        extracted_text = text_data[s_title+7:e_title].strip().lower() + "\n" + text_data[s_text+6:e_text].strip().lower()
        return extracted_text

In [5]:
def calculate_tf(tokens, TOKEN_IDs):
    tfs = {}
    for token in tokens:
        if TOKEN_IDs[token] in tfs:
            tfs[TOKEN_IDs[token]] += 1
        else:
            tfs[TOKEN_IDs[token]] = 1
    return tfs

def calculate_idfs(TERM_TO_DOCS, TOKEN_IDs, DOCS_IDs):
    N = len(DOCS_IDs)
    idfs = {}
    for term, documents in TERM_TO_DOCS.items():
        idf = math.log(N / len(documents))
        # idfs[term] = round(idf, 3)
        idfs[term] = idf
    return idfs

In [6]:
# def calculate_tf_idfs(TFs, IDFs):
#     tf_idf = {}
#     for doc in TFs:
#         vector = []
#         for term in TFs[doc]:
#             tf = TFs[doc][term]
#             idf = IDFs[term]
#             vector.append(tf*idf)
#         tf_idf[doc] = vector
#     return tf_idf

In [7]:
def main(DIRECTORY):
    FILES = os.listdir(DIRECTORY)

    DOC_ID  = 1
    TOKEN_ID = 1
    DOCS_IDs = {} # dict of <DOCNO> entry to document-id
    TOKEN_IDs = {} # dict of token to token-id
    TERM_TO_DOCS = {}
    TFs = {}
    for filename in FILES:
        DOCS_IDs[DOC_ID] = filename
        document = extract_text(DIRECTORY, filename)
        tokens = tokenize_document(document)
        for token in tokens:
            if token not in TOKEN_IDs:
                TOKEN_IDs[token] = TOKEN_ID
                TERM_TO_DOCS[TOKEN_IDs[token]] = set() # unique documents
                TERM_TO_DOCS[TOKEN_IDs[token]].add(DOC_ID)
                TOKEN_ID += 1
            else:
                TERM_TO_DOCS[TOKEN_IDs[token]].add(DOC_ID)
        TFs[DOC_ID] =  calculate_tf(tokens, TOKEN_IDs)
        DOC_ID += 1
    
    return DOCS_IDs, TOKEN_IDs, TFs, TERM_TO_DOCS

In [8]:
def calculate_tf_idfs(TFs, IDFs):
    tf_idf = {}
    for doc in TFs:
        vector = {}
        for term in TFs[doc]:
            tf = TFs[doc][term]
            idf = IDFs[term]
            vector[term] = tf*idf
        sorted_vector = dict(sorted(vector.items()))
        tf_idf[doc] = sorted_vector
    return tf_idf

def calculate_cosine_similarity(v1, v2):
    v2_keys = list(v2.keys())       # List of all terms 
    v2_j = 0                        # pointer for document v2_keys list
    v1v2 = 0                        # dot products sum pointer
    for v1_i in v1:
        while v2_j < len(v2_keys) and v1_i >= v2_keys[v2_j] :
            if v1_i == v2_keys[v2_j]: # comparing term for both documents
                v1v2 += v1[v1_i]*v2[v2_keys[v2_j]]
                break
            v2_j+=1
    norm_v1 = math.sqrt(sum( v1[i]**2 for i in v1))
    norm_v2 = math.sqrt(sum( v2[i]**2 for i in v2))

    similarity_value =  v1v2/(norm_v1*norm_v2)
    return similarity_value

def calculate_similarity(TF_IDF_VECTORS):
    max_heap = []
    N = len(TF_IDF_VECTORS)
    for doc1 in range(1, N+1):
        for doc2 in range(doc1+ 1, N+1):
            value = calculate_cosine_similarity(TF_IDF_VECTORS[doc1], TF_IDF_VECTORS[doc2])
            heapq.heappush(max_heap, (-value, doc1, doc2))
    return max_heap

In [9]:
DOCS_IDs, TOKEN_IDs, TFs, TERM_TO_DOCS = main(DIRECTORY)
IDFs = calculate_idfs(TERM_TO_DOCS, TOKEN_IDs, DOCS_IDs)
TF_IDF_VECTORS = calculate_tf_idfs( TFs, IDFs)

In [10]:

# TF_IDF_VECTORS[100]

In [11]:
# def ccs(v1, v2):
#     v2_keys = list(v2.keys())       # List of all terms 
#     v2_j = 0                        # pointer for document v2_keys list
#     v1v2 = 0                        # dot products sum pointer
#     for v1_i in v1:
#         while v2_j < len(v2_keys) and v1_i >= v2_keys[v2_j] :
#             if v1_i == v2_keys[v2_j]: # comparing term for both documents
#                 v1v2 += v1[v1_i]*v2[v2_keys[v2_j]]
#                 break
#             v2_j+=1
#     print(v1v2)
#     norm_v1 = math.sqrt(sum( v1[i]**2 for i in v1))
#     norm_v2 = math.sqrt(sum( v2[i]**2 for i in v2))

#     similarity_value =  v1v2/(norm_v1*norm_v2)
#     return similarity_value

# v1 = {100: 1, 101:1}
# v2 = {100:1, 101:1}

# ccs(v1,v2)

2


0.9999999999999998

In [12]:
# TF_IDF_VECTORS[100]

In [13]:

pairewiseSimilarity = calculate_similarity(TF_IDF_VECTORS)
limit = 0
for _ in pairewiseSimilarity:
    value, doc1, doc2 = heapq.heappop(pairewiseSimilarity)
    # if limit < 50:
    print(f"Similarity between {[DOCS_IDs[doc1]]} and {[DOCS_IDs[doc2]]}: {-value}")
    #     limit+=1
    # else:
    #     break

Similarity between ['en.13.25.458.2009.11.8'] and ['en.13.25.67.2009.11.8']: 0.9603271710302747
Similarity between ['en.13.25.281.2009.11.7'] and ['en.13.25.101.2009.11.9']: 0.9061121967688857
Similarity between ['en.13.25.184.2009.11.9'] and ['en.13.25.182.2009.11.9']: 0.8791574857681683
Similarity between ['en.13.25.78.2009.11.8'] and ['en.13.25.239.2009.11.8']: 0.8500616536341428
Similarity between ['en.13.25.392.2009.11.8'] and ['en.13.25.381.2009.11.8']: 0.8390715005322509
Similarity between ['en.13.25.170.2009.11.8'] and ['en.13.25.94.2009.11.8']: 0.8113797693569366
Similarity between ['en.13.25.285.2009.11.8'] and ['en.13.25.440.2009.11.8']: 0.807716957497694
Similarity between ['en.13.25.89.2009.11.9'] and ['en.13.25.399.2009.11.9']: 0.8064823534595625
Similarity between ['en.13.25.252.2009.11.8'] and ['en.13.25.152.2009.11.9']: 0.7804171169518499
Similarity between ['en.13.25.47.2009.11.9'] and ['en.13.25.76.2009.11.9']: 0.7581581476551947
Similarity between ['en.13.25.12.2009

In [14]:
# TF_IDF_VECTORS[doc1]
# calculate_cosine_similarity(, TF_IDF_VECTORS[doc2])

In [15]:
# def calculate_tf_idfs( TOKEN_IDs ,TFs, IDFs):
#     tf_idf = {}
#     for doc in TFs:
#         vector = []
#         for term in TOKEN_IDs:
#             if term not in TFs[doc].keys():
#                 vector.append(0)
#             else:
#                 tf = TFs[doc][term]
#                 idf = IDFs[term]
#                 vector.append(tf*idf)
#         tf_idf[doc] = vector
#     return tf_idf

In [16]:
# def calculate_cosine_similarity(v1, v2):
#     min_vect = len(v1) if len(v1) < len(v2) else len(v2)
#     v1v2 = 0
#     for i in range(min_vect):
#         v1v2 += v1[i]*v2[i]
#     norm_v1 = math.sqrt(sum( i**2 for i in v1))
#     norm_v2 = math.sqrt(sum( i**2 for i in v2))

#     similarity_value =  v1v2/(norm_v1*norm_v2)
#     return similarity_value

In [17]:
# def calculate_similarity(TF_IDF_VECTORS):
#     max_heap = []
#     N = len(TF_IDF_VECTORS)
#     for doc1 in TF_IDF_VECTORS:
#         for doc2 in range(doc1+1, N+1):
#             value = calculate_cosine_similarity(TF_IDF_VECTORS[doc1], TF_IDF_VECTORS[doc2])
#             value = round(value, 3)
#             heapq.heappush(max_heap, (-value, doc1, doc2))
#     return max_heap

In [18]:
# i = 0
# while i < 10:
#     if i == 4:
#         break
#     i+=1

# i

In [19]:

pairewiseSimilarity = calculate_similarity(TF_IDF_VECTORS)
for value, doc1, doc2 in pairewiseSimilarity:
    # print(DOCS_IDs[doc1] + " -> "+ DOCS_IDs[doc2] + ":  " + str(-value))
    print(str(doc1) + " -> "+ str(doc2) + ":  " + str(-value))



28 -> 88:  0.9603271710302747
5 -> 6:  0.807716957497694
100 -> 129:  0.9061121967688857
40 -> 61:  0.7581581476551947
51 -> 108:  0.7804171169518499
17 -> 29:  0.6965644421402358
10 -> 42:  0.8791574857681683
10 -> 28:  0.6736665307403116
1 -> 65:  0.6729659861416484
26 -> 87:  0.5811272099534412
25 -> 77:  0.5731169241707603
28 -> 42:  0.6100370875757267
75 -> 90:  0.6085282191525488
85 -> 97:  0.8064823534595625
36 -> 75:  0.8500616536341428
38 -> 43:  0.25546980070143477
42 -> 88:  0.6361725193970165
44 -> 58:  0.5169657303678611
10 -> 88:  0.6640967576413358
51 -> 95:  0.30344747150836265
3 -> 92:  0.4148658950849455
6 -> 109:  0.2840774903136736
61 -> 100:  0.5570857870292478
61 -> 129:  0.5506145701674398
28 -> 95:  0.4837557391033656
29 -> 79:  0.31498257242790567
75 -> 85:  0.5495611546626175
16 -> 78:  0.5981640229613712
81 -> 100:  0.654309030710362
34 -> 57:  0.8113797693569366
93 -> 103:  0.8390715005322509
19 -> 44:  0.248243609497462
37 -> 84:  0.2552499637979849
40 -> 1

In [20]:
# TFs

In [21]:
# DOCS_IDs

In [22]:
# norm_cosine_vector = {}
# idfs = calculate_idfs(term_to_docs, TOKEN_IDs)
# for document in DOCS_IDs:
#     text = extract_text(directory, DOCS_IDs[document])
#     tokens = tokenize_document(text)
#     vector = []
#     for token in tokens:
#         token_id = TOKEN_IDs[token]
#         term_weight = term_frequency[token_id]*idfs[token_id]  ####replace tfs
#         vector.append(term_weight)
#     norm_cosine_vector[document] = vector

In [23]:
# def calculate_score(query, document, id):
#     score = 0
#     tokens = tokenize_document(query)
#     for token in tokens:
#         token_id = TOKEN_IDs[token]
#         Wtd = tfs[token_id] * idfs[token_id] 
#         query_score  +=  Wtd

# query = "socialism trying products"
# calculate_score(query, document)

In [24]:
DOCS_IDs

{1: 'en.13.25.418.2009.11.9',
 2: 'en.13.25.65.2009.11.9',
 3: 'en.13.25.179.2009.11.8',
 4: 'en.13.25.242.2009.11.9',
 5: 'en.13.25.285.2009.11.8',
 6: 'en.13.25.440.2009.11.8',
 7: 'en.13.25.318.2009.11.8',
 8: 'en.13.25.487.2009.11.8',
 9: 'en.13.25.357.2009.11.8',
 10: 'en.13.25.184.2009.11.9',
 11: 'en.13.25.115.2009.11.9',
 12: 'en.13.25.100.2009.11.7',
 13: 'en.13.25.455.2009.11.8',
 14: 'en.13.25.344.2009.11.7',
 15: 'en.13.25.291.2009.11.8',
 16: 'en.13.25.162.2009.11.8',
 17: 'en.13.25.493.2009.11.8',
 18: 'en.13.25.69.2009.11.8',
 19: 'en.13.25.454.2009.11.8',
 20: 'en.13.25.191.2009.11.9',
 21: 'en.13.25.13.2009.11.8',
 22: 'en.13.25.497.2009.11.7',
 23: 'en.13.25.53.2009.11.8',
 24: 'en.13.25.133.2009.11.8',
 25: 'en.13.25.26.2009.11.9',
 26: 'en.13.25.481.2009.11.8',
 27: 'en.13.25.118.2009.11.9',
 28: 'en.13.25.458.2009.11.8',
 29: 'en.13.25.50.2009.11.8',
 30: 'en.13.25.288.2009.11.9',
 31: 'en.13.25.80.2009.11.8',
 32: 'en.13.25.156.2009.11.8',
 33: 'en.13.25.347.2009.

In [25]:
# term_idfs = {}

# for doc_id, filename in docs.items():
#     document = extract_text(directory, filename)
#     tokens = tokenize_document(document)
#     for token in tokens:
#         if token not in term_idfs:
#             term_idfs[token] = []
#             term_idfs[token].append(doc_id)
#         else:
#             term_idfs[token].append(doc_id)

In [26]:
# term_idfs

In [27]:
# if token not in dictionary:
#     dictionary[token] = []
#     dictionary[token].append(i)
# else:
#     dictionary[token].append(i)

In [28]:
# document = "This is an example document with some_underscored_words and 123numbers456."
# tokens = tokenize_document(document)
# print(tokens)

In [29]:
# term_idfs = {}

# for doc_id, filename in docs.items():
#     document = extract_text(directory, filename)
#     tokens = tokenize_document(document)
#     for token in tokens:
#         if token not in term_idfs:
#             term_idfs[token] = []
#             term_idfs[token].append(doc_id)
#         else:
#             term_idfs[token].append(doc_id)