In [37]:
import os
import re
import math

In [38]:
def tokenize_document(document):
    """Split a text string into word tokens.

    A token is a maximal run of regex "word" characters (letters, digits
    and underscore); everything else acts as a separator and is dropped.

    Args:
        document: The text to tokenize.

    Returns:
        A list of token strings in order of appearance (empty if the text
        contains no word characters).
    """
    word_pattern = re.compile(r'\w+')
    return word_pattern.findall(document)

In [39]:
def _tag_contents(text_data, tag):
    """Return the stripped, lower-cased text between <tag> and </tag>, or "" if absent."""
    open_tag = "<" + tag + ">"
    close_tag = "</" + tag + ">"

    start = text_data.find(open_tag)
    if start == -1:
        # str.find signals "not found" with -1; slicing with it would
        # silently return unrelated text.
        return ""
    start += len(open_tag)

    # Look for the closing tag only after its own opening tag.
    end = text_data.find(close_tag, start)
    if end == -1:
        return ""
    return text_data[start:end].strip().lower()


def extract_text(directory, filename):
    """Read a document file and return its title and body text, lower-cased.

    The file is expected to contain a ``<TITLE>...</TITLE>`` section and a
    ``<TEXT>...</TEXT>`` section. Only the first occurrence of each is used.
    A section whose opening or closing tag is missing contributes an empty
    string instead of a mis-sliced fragment of the file.

    Args:
        directory: Directory containing the file.
        filename: Name of the file inside ``directory``.

    Returns:
        ``"<title>\\n<text>"`` where both parts are stripped of surrounding
        whitespace and lower-cased.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(os.path.join(directory, filename), 'r') as file:
        text_data = file.read()

    title = _tag_contents(text_data, "TITLE")
    body = _tag_contents(text_data, "TEXT")
    return title + "\n" + body

In [40]:
def calculate_tf(tokens, TOKEN_IDs):
    """Count how often each token id occurs in a token list.

    Args:
        tokens: Iterable of token strings for one document.
        TOKEN_IDs: Mapping of token string to token id. Every token in
            ``tokens`` must be present.

    Returns:
        A dict mapping token id to its number of occurrences in ``tokens``.

    Raises:
        KeyError: If a token has no entry in ``TOKEN_IDs``.
    """
    tfs = {}
    for token in tokens:
        # The result is keyed by token id, so the lookup and the increment
        # must both use the id, not the raw token string.
        token_id = TOKEN_IDs[token]
        tfs[token_id] = tfs.get(token_id, 0) + 1
    return tfs

def calculate_idfs(TERM_TO_DOCS, TOKEN_IDs, DOCS_IDs):
    """Compute the inverse document frequency of every term.

    For each term the value is ``ln(N / df)`` rounded to three decimals,
    where ``N`` is the number of entries in ``DOCS_IDs`` and ``df`` is the
    number of documents listed for the term.

    Args:
        TERM_TO_DOCS: Mapping of term key to the collection of documents
            containing that term.
        TOKEN_IDs: Unused; kept so existing callers do not break.
        DOCS_IDs: Collection of all documents; only its length is used.

    Returns:
        A dict mapping each key of ``TERM_TO_DOCS`` to its rounded IDF.
    """
    total_docs = len(DOCS_IDs)
    return {
        term_key: round(math.log(total_docs / len(postings)), 3)
        for term_key, postings in TERM_TO_DOCS.items()
    }

def calculate_tf_idfs(TFs, IDFs):
    """Build a tf-idf weight vector for every document.

    Args:
        TFs: Mapping of document key to a ``{term: term frequency}`` dict.
        IDFs: Mapping of term to its inverse document frequency. Must
            contain every term that appears in ``TFs``.

    Returns:
        A dict mapping each document key to a list of ``tf * idf`` weights,
        ordered the same way as the terms in that document's frequency dict.
    """
    weights_by_doc = {}
    for doc_key, term_counts in TFs.items():
        weights_by_doc[doc_key] = [
            count * IDFs[term_key] for term_key, count in term_counts.items()
        ]
    return weights_by_doc

In [45]:
# Build the index structures for the corpus: document ids, token ids,
# the term -> documents map and per-document term frequencies.
DIRECTORY = '25-20240329T124513Z-001/25/'
# os.listdir() returns names in arbitrary order; sort them so document ids
# are assigned identically on every run.
FILES = sorted(os.listdir(DIRECTORY))

DOC_ID  = 1
TOKEN_ID = 1
DOCS_IDs = {} # dict of document-id to filename
TOKEN_IDs = {} # dict of token to token-id
TERM_TO_DOCS = {} # dict of token-id to the set of document-ids containing it
TFs = {} # dict of document-id to the result of calculate_tf for that document
for filename in FILES:
    DOCS_IDs[DOC_ID] = filename
    document = extract_text(DIRECTORY, filename)
    tokens = tokenize_document(document)
    for token in tokens:
        if token not in TOKEN_IDs:
            # First sighting of this token anywhere: give it the next id
            # and start its (initially empty) set of documents.
            TOKEN_IDs[token] = TOKEN_ID
            TERM_TO_DOCS[TOKEN_ID] = set() # unique documents
            TOKEN_ID += 1
        TERM_TO_DOCS[TOKEN_IDs[token]].add(DOC_ID)
    TFs[DOC_ID] = calculate_tf(tokens, TOKEN_IDs)
    DOC_ID += 1

In [44]:
# Inverse document frequency for every token id collected while indexing.
IDFs = calculate_idfs(
    TERM_TO_DOCS=TERM_TO_DOCS,
    TOKEN_IDs=TOKEN_IDs,
    DOCS_IDs=DOCS_IDs,
)
# Uncomment to inspect the resulting table:
# IDFs

In [48]:
# print(tf_idf)

{1: [3.497, 3.784, 4.883, 3.091, 3.497, 0.015, 3.784, 2.11, 0.0, 0.951, 0.008, 0.0, 0.663, 0.649, 0.015, 3.784, 1.664, 0.0, 2.05, 0.008, 3.273, 1.299, 2.244, 2.175, 0.104, 2.937, 0.138, 3.497, 3.497, 3.497, 4.883, 0.84, 1.992, 2.686, 2.05, 0.047, 2.175, 0.031, 4.19, 3.784, 3.784, 3.091, 1.194, 1.838, 0.84, 4.19, 0.015, 2.686, 0.238, 3.497, 4.19, 3.091, 2.937, 2.398, 1.076, 0.278, 0.788, 3.091, 3.784, 2.485, 0.894, 2.803, 3.497, 3.784, 3.273, 1.664, 0.229, 3.784, 3.497, 1.219, 0.238, 3.273, 2.803, 2.803, 2.686, 3.273, 1.327, 4.883, 1.417, 2.11, 1.449, 2.58, 3.091, 0.476, 0.708, 2.803, 2.05, 1.838, 3.784, 3.784, 3.784, 0.405, 3.091, 1.145, 2.58, 3.497, 0.405, 2.686, 4.883, 4.883, 4.883, 1.664, 0.84, 4.883, 3.091, 3.091, 4.883, 2.686, 2.686, 4.883, 0.579, 3.784, 0.229, 2.803, 1.992, 3.497, 4.19, 0.592, 4.883, 3.784, 1.887, 2.803, 0.44, 1.145, 4.883, 4.883, 0.526, 1.887, 4.19, 4.883, 2.686, 1.887, 1.169, 3.497, 4.883, 2.686, 2.398, 1.076, 1.033, 1.625, 2.803, 2.686, 3.273, 4.19, 3.784, 4.1

In [24]:
# norm_cosine_vector = {}
# idfs = calculate_idfs(term_to_docs, TOKEN_IDs)
# for document in DOCS_IDs:
#     text = extract_text(directory, DOCS_IDs[document])
#     tokens = tokenize_document(text)
#     vector = []
#     for token in tokens:
#         token_id = TOKEN_IDs[token]
#         term_weight = term_frequency[token_id]*idfs[token_id]  ####replace tfs
#         vector.append(term_weight)
#     norm_cosine_vector[document] = vector

[8.93, 8.93, 522.24, 42.839999999999996, 8.93, 324.3, 8.93, 113.81, 54.8, 8.93, 534.6, 255.0, 530.55, 534.6, 320.28, 8.93, 441.87, 515.34, 342.62, 360.23999999999995, 331.57, 60.57000000000001, 522.24, 534.6, 42.839999999999996, 108.72, 30.16, 527.8, 534.6, 8.93, 324.3, 8.93, 8.93, 534.6, 8.93, 16.48, 192.6, 530.55, 8.93, 8.93, 23.490000000000002, 278.72999999999996, 82.68, 522.24, 54.8, 480.7, 113.81, 527.8, 93.3, 522.24, 8.93, 82.68, 480.7, 93.3, 8.93, 370.64, 441.87, 432.0, 527.8, 113.81, 71.83, 16.48, 36.6, 238.76, 118.6, 205.52999999999997, 8.93, 527.8, 98.56, 192.6, 165.9, 534.6, 48.86, 480.7, 527.8, 222.31, 23.490000000000002, 530.55, 8.93, 446.16, 8.93, 48.86, 534.6, 187.95000000000002, 165.9, 527.8, 8.93, 8.93, 432.0, 36.6, 108.72, 71.83, 388.02, 36.6, 8.93, 449.40000000000003, 54.8, 8.93, 8.93, 527.8, 8.93, 30.16, 60.57000000000001, 8.93, 165.9, 527.8, 342.62, 54.8, 36.6, 8.93, 23.490000000000002, 278.72999999999996, 113.81, 522.24, 54.8, 480.7, 113.81, 449.40000000000003, 53

In [90]:
# def calculate_score(query, document, id):
#     score = 0
#     tokens = tokenize_document(query)
#     for token in tokens:
#         token_id = TOKEN_IDs[token]
#         Wtd = tfs[token_id] * idfs[token_id] 
#         query_score  +=  Wtd

# query = "socialism trying products"
# calculate_score(query, document)

112.59

In [92]:
# term_idfs = {}

# for doc_id, filename in docs.items():
#     document = extract_text(directory, filename)
#     tokens = tokenize_document(document)
#     for token in tokens:
#         if token not in term_idfs:
#             term_idfs[token] = []
#             term_idfs[token].append(doc_id)
#         else:
#             term_idfs[token].append(doc_id)

In [93]:
# term_idfs

In [94]:
# if token not in dictionary:
#     dictionary[token] = []
#     dictionary[token].append(i)
# else:
#     dictionary[token].append(i)

In [95]:
# document = "This is an example document with some_underscored_words and 123numbers456."
# tokens = tokenize_document(document)
# print(tokens)

In [96]:
# term_idfs = {}

# for doc_id, filename in docs.items():
#     document = extract_text(directory, filename)
#     tokens = tokenize_document(document)
#     for token in tokens:
#         if token not in term_idfs:
#             term_idfs[token] = []
#             term_idfs[token].append(doc_id)
#         else:
#             term_idfs[token].append(doc_id)