In [1]:
import os
import re
import math
import heapq

In [2]:
DIRECTORY = '25-20240329T124513Z-001/25/'

In [3]:
def tokenize_document(document):
    tokens = re.findall(r'\w+', document)
    return tokens

In [4]:
def extract_text(directory, filename):
    with open(os.path.join(directory, filename), 'r') as file:
        text_data = file.read()
        s_title = text_data.find("<TITLE>")
        e_title = text_data.find("</TITLE>", 7)
        
        s_text = text_data.find("<TEXT>")
        e_text = text_data.find("</TEXT>", 6)
        
        extracted_text = text_data[s_title+7:e_title].strip().lower() + "\n" + text_data[s_text+6:e_text].strip().lower()
        return extracted_text

In [31]:
def calculate_tf(tokens, TOKEN_IDs):
    tfs = {}
    for token in tokens:
        if TOKEN_IDs[token] in tfs:
            tfs[TOKEN_IDs[token]] += 1
        else:
            tfs[TOKEN_IDs[token]] = 1
    return tfs


def calculate_idfs(TERM_TO_DOCS, TOKEN_IDs, DOCS_IDs):
    N = len(DOCS_IDs)
    idfs = {}
    for term, documents in TERM_TO_DOCS.items():
        idf = math.log(N / len(documents))
        idfs[term] = round(idf, 3)
    return idfs

def calculate_tf_idfs(TFs, IDFs):
    tf_idf = {}
    for doc in TFs:
        vector = []
        for term in TFs[doc]:
            tf = TFs[doc][term]
            idf = IDFs[term]
            vector.append(tf*idf)
        tf_idf[doc] = vector
    return tf_idf



In [32]:
def main(DIRECTORY):
    FILES = os.listdir(DIRECTORY)

    DOC_ID  = 1
    TOKEN_ID = 1
    DOCS_IDs = {} # dict of <DOCNO> entry to document-id
    TOKEN_IDs = {} # dict of token to token-id
    TERM_TO_DOCS = {}
    TFs = {}
    for filename in FILES:
        DOCS_IDs[DOC_ID] = filename
        document = extract_text(DIRECTORY, filename)
        tokens = tokenize_document(document)
        for token in tokens:
            if token not in TOKEN_IDs:
                TOKEN_IDs[token] = TOKEN_ID
                TERM_TO_DOCS[TOKEN_IDs[token]] = set() # unique documents
                TERM_TO_DOCS[TOKEN_IDs[token]].add(DOC_ID)
                TOKEN_ID += 1
            else:
                TERM_TO_DOCS[TOKEN_IDs[token]].add(DOC_ID)
        TFs[DOC_ID] =  calculate_tf(tokens, TOKEN_IDs)
        DOC_ID += 1
    
    return DOCS_IDs, TOKEN_IDs, TFs, TERM_TO_DOCS

In [33]:
DOCS_IDs, TOKEN_IDs, TFs, TERM_TO_DOCS = main(DIRECTORY)
IDFs = calculate_idfs(TERM_TO_DOCS, TOKEN_IDs, DOCS_IDs)
TF_IDF_VECTORS = calculate_tf_idfs(TFs, IDFs)

In [34]:
# print(TFs[12])

{1491: 5, 47: 13, 1492: 5, 1493: 1, 275: 3, 1494: 8, 1495: 1, 1496: 2, 9: 2, 453: 1, 11: 1, 12: 1, 13: 1, 15: 6, 906: 1, 84: 2, 1497: 1, 700: 1, 381: 1, 18: 16, 1498: 1, 1499: 1, 1500: 3, 1501: 1, 582: 1, 1502: 2, 1503: 2, 723: 1, 227: 6, 418: 2, 243: 4, 27: 4, 586: 2, 1504: 1, 438: 2, 1505: 1, 1506: 2, 314: 2, 1507: 2, 511: 1, 1508: 1, 160: 2, 6: 7, 1509: 1, 36: 9, 1510: 1, 450: 1, 1511: 1, 665: 1, 346: 2, 739: 2, 877: 1, 1512: 1, 1513: 1, 169: 6, 1110: 3, 56: 5, 1514: 1, 38: 9, 1515: 2, 57: 4, 1516: 1, 71: 1, 1517: 1, 123: 3, 118: 1, 1518: 1, 340: 1, 98: 1, 426: 1, 87: 1, 20: 3, 1519: 1, 32: 3, 1520: 3, 526: 1, 618: 1, 1521: 1, 1522: 1, 1523: 3, 166: 3, 414: 1, 1524: 1, 1525: 1, 1020: 1, 1417: 1, 886: 1, 158: 2, 97: 8, 609: 2, 1526: 1, 1527: 1, 1528: 1, 1529: 1, 1530: 1, 1531: 1, 25: 4, 1532: 1, 180: 1, 213: 3, 641: 1, 778: 1, 1533: 1, 1534: 1, 1535: 2, 156: 1, 522: 1, 327: 2, 462: 1, 1536: 1, 1537: 2, 171: 1, 1538: 1, 1539: 1, 1540: 1, 1541: 1, 164: 1, 1542: 1, 1543: 1, 278: 4, 460:

In [27]:
# for i in DOCS_IDs:
#     if DOCS_IDs[i] == "en.13.25.100.2009.11.7":
#         print(i)

In [38]:
# len(TERM_TO_DOCS[1491])


7

In [8]:
DOCS_IDs, TOKEN_IDs, TFs, TERM_TO_DOCS = main(DIRECTORY)
IDFs = calculate_idfs(TERM_TO_DOCS, TOKEN_IDs, DOCS_IDs)
TF_IDF_VECTORS = calculate_tf_idfs(TFs, IDFs)

In [9]:
def calculate_cosine_similarity(v1, v2):
    min_vect = len(v1) if len(v1) < len(v2) else len(v2)
    v1v2 = 0
    for i in range(min_vect):
        v1v2 += v1[i]*v2[i]
    norm_v1 = math.sqrt(sum( i**2 for i in v1))
    norm_v2 = math.sqrt(sum( i**2 for i in v2))

    similarity_value =  v1v2/(norm_v1*norm_v2)
    return similarity_value

In [10]:

def calculate_similarity(TF_IDF_VECTORS):
    max_heap = []
    N = len(TF_IDF_VECTORS)
    for doc1 in TF_IDF_VECTORS:
        for doc2 in range(doc1+1, N+1):
            value = calculate_cosine_similarity(TF_IDF_VECTORS[doc1], TF_IDF_VECTORS[doc2])
            value = round(value, 3)
            heapq.heappush(max_heap, (-value, doc1, doc2))
    return max_heap

In [11]:
def display():
    pairewiseSimilarity = calculate_similarity(TF_IDF_VECTORS)
    for value, doc1, doc2 in pairewiseSimilarity:
        # print(DOCS_IDs[doc1] + " -> "+ DOCS_IDs[doc2] + ":  " + str(-value))
        print(str(doc1) + " -> "+ str(doc2) + ":  " + str(-value))
display()


28 -> 88:  0.846
19 -> 65:  0.82
9 -> 116:  0.821
6 -> 69:  0.81
23 -> 69:  0.814
28 -> 40:  0.811
87 -> 130:  0.803
119 -> 130:  0.806
43 -> 122:  0.805
24 -> 36:  0.806
61 -> 87:  0.813
8 -> 43:  0.802
73 -> 96:  0.799
8 -> 96:  0.8
96 -> 116:  0.797
1 -> 33:  0.805
40 -> 85:  0.8
44 -> 93:  0.8
48 -> 65:  0.789
50 -> 88:  0.799
11 -> 122:  0.802
6 -> 9:  0.798
59 -> 114:  0.805
13 -> 130:  0.782
68 -> 109:  0.796
69 -> 106:  0.796
74 -> 116:  0.799
16 -> 45:  0.791
81 -> 116:  0.793
34 -> 87:  0.795
8 -> 120:  0.795
103 -> 109:  0.795
38 -> 80:  0.795
9 -> 23:  0.798
41 -> 116:  0.799
21 -> 100:  0.778
43 -> 120:  0.791
21 -> 39:  0.755
21 -> 103:  0.778
6 -> 8:  0.789
22 -> 44:  0.794
23 -> 92:  0.796
5 -> 36:  0.8
6 -> 96:  0.792
57 -> 74:  0.795
6 -> 116:  0.795
24 -> 104:  0.803
61 -> 119:  0.782
13 -> 92:  0.778
27 -> 28:  0.792
67 -> 79:  0.795
31 -> 80:  0.792
69 -> 116:  0.793
74 -> 81:  0.788
15 -> 56:  0.769
79 -> 100:  0.771
81 -> 130:  0.782
1 -> 120:  0.78
33 -> 73:  0.

In [12]:
# TFs

{1: {1: 1,
  2: 1,
  3: 1,
  4: 1,
  5: 1,
  6: 1,
  7: 1,
  8: 1,
  9: 1,
  10: 1,
  11: 1,
  12: 1,
  13: 1,
  14: 1,
  15: 1,
  16: 1,
  17: 1,
  18: 1,
  19: 1,
  20: 1,
  21: 1,
  22: 1,
  23: 1,
  24: 1,
  25: 1,
  26: 1,
  27: 1,
  28: 1,
  29: 1,
  30: 1,
  31: 1,
  32: 1,
  33: 1,
  34: 1,
  35: 1,
  36: 1,
  37: 1,
  38: 1,
  39: 1,
  40: 1,
  41: 1,
  42: 1,
  43: 1,
  44: 1,
  45: 1,
  46: 1,
  47: 1,
  48: 1,
  49: 1,
  50: 1,
  51: 1,
  52: 1,
  53: 1,
  54: 1,
  55: 1,
  56: 1,
  57: 1,
  58: 1,
  59: 1,
  60: 1,
  61: 1,
  62: 1,
  63: 1,
  64: 1,
  65: 1,
  66: 1,
  67: 1,
  68: 1,
  69: 1,
  70: 1,
  71: 1,
  72: 1,
  73: 1,
  74: 1,
  75: 1,
  76: 1,
  77: 1,
  78: 1,
  79: 1,
  80: 1,
  81: 1,
  82: 1,
  83: 1,
  84: 1,
  85: 1,
  86: 1,
  87: 1,
  88: 1,
  89: 1,
  90: 1,
  91: 1,
  92: 1,
  93: 1,
  94: 1,
  95: 1,
  96: 1,
  97: 1,
  98: 1,
  99: 1,
  100: 1,
  101: 1,
  102: 1,
  103: 1,
  104: 1,
  105: 1,
  106: 1,
  107: 1,
  108: 1,
  109: 1,
  110: 1,
  111

In [13]:
# DOCS_IDs

In [14]:
# norm_cosine_vector = {}
# idfs = calculate_idfs(term_to_docs, TOKEN_IDs)
# for document in DOCS_IDs:
#     text = extract_text(directory, DOCS_IDs[document])
#     tokens = tokenize_document(text)
#     vector = []
#     for token in tokens:
#         token_id = TOKEN_IDs[token]
#         term_weight = term_frequency[token_id]*idfs[token_id]  ####replace tfs
#         vector.append(term_weight)
#     norm_cosine_vector[document] = vector

In [15]:
# def calculate_score(query, document, id):
#     score = 0
#     tokens = tokenize_document(query)
#     for token in tokens:
#         token_id = TOKEN_IDs[token]
#         Wtd = tfs[token_id] * idfs[token_id] 
#         query_score  +=  Wtd

# query = "socialism trying products"
# calculate_score(query, document)

In [16]:
DOCS_IDs

{1: 'en.13.25.418.2009.11.9',
 2: 'en.13.25.65.2009.11.9',
 3: 'en.13.25.179.2009.11.8',
 4: 'en.13.25.242.2009.11.9',
 5: 'en.13.25.285.2009.11.8',
 6: 'en.13.25.440.2009.11.8',
 7: 'en.13.25.318.2009.11.8',
 8: 'en.13.25.487.2009.11.8',
 9: 'en.13.25.357.2009.11.8',
 10: 'en.13.25.184.2009.11.9',
 11: 'en.13.25.115.2009.11.9',
 12: 'en.13.25.100.2009.11.7',
 13: 'en.13.25.455.2009.11.8',
 14: 'en.13.25.344.2009.11.7',
 15: 'en.13.25.291.2009.11.8',
 16: 'en.13.25.162.2009.11.8',
 17: 'en.13.25.493.2009.11.8',
 18: 'en.13.25.69.2009.11.8',
 19: 'en.13.25.454.2009.11.8',
 20: 'en.13.25.191.2009.11.9',
 21: 'en.13.25.13.2009.11.8',
 22: 'en.13.25.497.2009.11.7',
 23: 'en.13.25.53.2009.11.8',
 24: 'en.13.25.133.2009.11.8',
 25: 'en.13.25.26.2009.11.9',
 26: 'en.13.25.481.2009.11.8',
 27: 'en.13.25.118.2009.11.9',
 28: 'en.13.25.458.2009.11.8',
 29: 'en.13.25.50.2009.11.8',
 30: 'en.13.25.288.2009.11.9',
 31: 'en.13.25.80.2009.11.8',
 32: 'en.13.25.156.2009.11.8',
 33: 'en.13.25.347.2009.

In [17]:
# term_idfs = {}

# for doc_id, filename in docs.items():
#     document = extract_text(directory, filename)
#     tokens = tokenize_document(document)
#     for token in tokens:
#         if token not in term_idfs:
#             term_idfs[token] = []
#             term_idfs[token].append(doc_id)
#         else:
#             term_idfs[token].append(doc_id)

In [18]:
# term_idfs

In [19]:
# if token not in dictionary:
#     dictionary[token] = []
#     dictionary[token].append(i)
# else:
#     dictionary[token].append(i)

In [20]:
# document = "This is an example document with some_underscored_words and 123numbers456."
# tokens = tokenize_document(document)
# print(tokens)

In [21]:
# term_idfs = {}

# for doc_id, filename in docs.items():
#     document = extract_text(directory, filename)
#     tokens = tokenize_document(document)
#     for token in tokens:
#         if token not in term_idfs:
#             term_idfs[token] = []
#             term_idfs[token].append(doc_id)
#         else:
#             term_idfs[token].append(doc_id)