In [1]:
import nltk
from nltk.tokenize import RegexpTokenizer
from collections import Counter, defaultdict, OrderedDict
import re
import sys
import os

In [2]:
# with open(sys.argv[1], 'r') as f:
#     contents = f.read()
headline_file=open('headlines.txt',"r")
headline_content = headline_file.read()

In [3]:
regextoken = RegexpTokenizer(r'<P ID=\d+>(.*?)</P>')
headline_text_list = regextoken.tokenize(headline_content)

In [4]:
# Normalization
#     lower-case words
#     Change short term to long terms for verb.
#     remove punctuation
#         https://www.geeksforgeeks.org/python-remove-punctuation-from-string/

def normalization(word):
    word= word.lower()
    word = word.replace("'re",' are').replace("'m'", ' am').replace("'s",' is').replace("n't",' not').replace("'ve",' have').replace("'d",' had').replace("'ll",' will')
    word = word.replace("'",'')
    word  = re.sub(r'[^\w\s]', '', word)
    return word

In [5]:
# Calculate Terms
# 1. Normalized the text 
# 2. It tokenized the text and count the occurence of the text
# 3. It returns the document id and count for each of the terms.
# 4. It also count the number of terms and documents. 

def calculate_terms(listed):
    normalized_text = []
    collection_frequency = Counter()
    document_frequency = Counter()
    output_wordlist_dict ={}
    terms_frequency = defaultdict(lambda: Counter([]))
    
    for i in range(len(listed)):
        normalized_text.append(normalization(listed[i]))
    
    for i in range(len(normalized_text)):
        tokenized_list= []
        for j in normalized_text[i].split():
            tokenized_list.append(j)
        output_wordlist_dict[i] = Counter(tokenized_list)
        collection_frequency.update(tokenized_list)
        document_frequency.update(set(tokenized_list))
        
    for key, value in output_wordlist_dict.items():
        for term, term_cnt in value.items():
            terms_frequency[term][key] += term_cnt
    
    return normalized_text, collection_frequency, document_frequency, terms_frequency

In [6]:
# Dictionary List
# 1. It stores information about a term (Document Frequency and offset)
# 2. The dictionary is sorted by term. 
# 3. It starts from 0 and counts the number document frequency and term count. 

def dictionary_list(listed):
    sort_dict = {}
    result_sort_dict = {}
    offset_sum = 0
    offset_i = 0
    sort_dict = OrderedDict(sorted(listed.items()))
    for i, value in enumerate(sort_dict.keys()):
        offset_i = len(sort_dict[value]) * 2 
        result_sort_dict[value] = len(sort_dict[value].values()),offset_sum
        offset_sum = offset_sum + offset_i 
    return result_sort_dict

In [7]:
# Inverted file
# 1. It stores the sorted entries as an inverted file

def inverted_file(key, dict_listed):
    inverted_list = []
    for i in key:
        for docid, term_cnt in dict_listed[i].items():
            inverted_list.append(docid)
            inverted_list.append(term_cnt)
    return inverted_list

In [8]:
# Store_Inverted_bin
# 1. It stores the inverted file as binary file. 
# 2. It stored this binary file as 4-byte integers. 

def Store_Inverted_bin(file):
    with open("inverted_fiile_binary.bin", "wb") as fb:
        for num in file:
            fb.write(num.to_bytes(4, "big"))
    print("Inverted File is created.")

In [9]:
normalized_text, collection_freq, document_freq, posting_list_output = calculate_terms(headline_text_list)

postings_list_terms =['heidelberg', 'cesium', 'trondheim', 'crustacean']

for i in postings_list_terms:
    posting_list_output[i]

In [10]:
dict_pos_output = dictionary_list(posting_list_output)
byte_file = inverted_file(dict_pos_output.keys(), posting_list_output)
Store_Inverted_bin(byte_file)

Inverted File is created.


In [11]:
print('Number of paragraph:', len(normalized_text))
print('Number of unique words observed:', len(document_freq))
print('The total number of words encountered:', sum(collection_freq.values()))

Number of paragraph: 500000
Number of unique words observed: 174195
The total number of words encountered: 4586860


In [12]:
print('Size of original_text: ' + str(os.path.getsize('headlines.txt')) + ' bytes')
print('Size of Inverted File: ' + str(os.path.getsize('inverted_fiile_binary.bin')) + ' bytes')
print('Size of Dictionary: ' + str(sys.getsizeof(dict_pos_output)) + ' bytes')

Size of original_text: 39381610 bytes
Size of Inverted File: 35970152 bytes
Size of Dictionary: 5242968 bytes


In [31]:
# Document Frequency
# 1. It prints the  document frequency and postings list for terms.
# 2. It prints the index of posting list for terms.
# 3. It reads inverted_file binary file every 4 byte, then gets the posting list for the terms. 

def document_freqency(dictionary_output,binary_file, terms):
    term = terms.lower()
    print("document frequency for "+terms+" : ",dictionary_output[terms][0])
    if (dictionary_output[terms][0] == 0):
        print("The "+ terms + " does not exit in the document.")
    terms_position = dictionary_output[term][1]
    range_of_terms_in_positing_list = list(dictionary_output.keys()).index(term)
    if len(list(dictionary_output)) != range_of_terms_in_positing_list+1:
        next_term = list(dictionary_output.keys())[range_of_terms_in_positing_list+1]
    else:
        print("The terms "+ terms+" is in end of the word. There is no next word")
        return
    next_term_position = dictionary_output[next_term][1]
    range_index = [terms_position, next_term_position]
    print("index of range for "+terms+" : ", range_index)
    list_num = []
    with open(binary_file, "br") as bf:
        for _ in range(terms_position):
            data = bf.read(4)
        for _ in range(terms_position, next_term_position):
            data = bf.read(4)
            number = int.from_bytes(data,"big")
            list_num.append(number)
    print("posting list for "+terms+" : ")
    print(list_num)
    return list_num

In [32]:
postings_list_terms = ['Heidelberg', 'cesium', 'Trondheim', 'crustacean']

for i in postings_list_terms:
    terms = i.lower()
    document_freqency(dict_pos_output, "inverted_fiile_binary.bin", terms)
    print("-----------------")

document frequency for heidelberg :  8
index of range for heidelberg :  [3588994, 3589010]
posting list for heidelberg : 
[114329, 1, 135133, 1, 174780, 1, 221099, 1, 243837, 1, 452545, 1, 491139, 1, 491278, 1]
-----------------
document frequency for cesium :  4
index of range for cesium :  [1671216, 1671224]
posting list for cesium : 
[50019, 1, 280669, 1, 348143, 1, 391938, 1]
-----------------
document frequency for trondheim :  0
The trondheim does not exit in the document.
index of range for trondheim :  [8232872, 8232872]
posting list for trondheim : 
[]
-----------------
document frequency for crustacean :  2
index of range for crustacean :  [2109786, 2109790]
posting list for crustacean : 
[230747, 1, 234923, 1]
-----------------


In [33]:
word_list = ['Hopkins', 'Stanford', 'Brown', 'college']

for i in word_list:
    print('Document Frequency for '+i+' is '+ str(dict_pos_output[i.lower()]))

Document Frequency for Hopkins is (71, 3714138)
Document Frequency for Stanford is (150, 7287324)
Document Frequency for Brown is (769, 1410140)
Document Frequency for college is (1909, 1866432)


In [34]:
elon_list = document_freqency(dict_pos_output, "inverted_fiile_binary.bin", 'Elon'.lower())
musk_list = document_freqency(dict_pos_output, "inverted_fiile_binary.bin", 'Musk'.lower())

document frequency for elon :  60
index of range for elon :  [2560860, 2560980]
posting list for elon : 
[3393, 1, 16330, 1, 19262, 1, 21341, 1, 29749, 1, 39287, 1, 44321, 1, 45978, 1, 52990, 1, 57023, 1, 57787, 1, 71988, 1, 84806, 1, 87959, 1, 98830, 1, 103398, 1, 104204, 1, 115207, 1, 122603, 1, 127050, 1, 128662, 1, 131441, 1, 131448, 1, 131514, 1, 135942, 1, 146965, 1, 151171, 1, 159147, 1, 186107, 1, 194998, 1, 197341, 1, 239304, 1, 240040, 1, 245923, 1, 249585, 1, 251252, 1, 274393, 1, 277539, 1, 283098, 1, 297139, 1, 301627, 1, 303775, 1, 305183, 1, 306988, 1, 307162, 1, 341755, 1, 342182, 1, 354346, 1, 369772, 1, 383528, 1, 399001, 1, 399946, 1, 420082, 1, 431495, 1, 431739, 1, 449684, 1, 456443, 1, 461816, 1, 479190, 1, 482769, 1]
document frequency for musk :  53
index of range for musk :  [5189044, 5189150]
posting list for musk : 
[3393, 1, 16330, 1, 19262, 1, 21341, 1, 29749, 1, 44321, 1, 45978, 1, 52990, 1, 57023, 1, 57787, 1, 84806, 1, 98830, 1, 115207, 1, 122603, 1, 127

In [35]:
set_elon =set(elon_list)
set_musk = set(musk_list)

setone = {1}
set_same = set_elon.intersection(set_musk) - setone
print('Intersection of postings list for each term: Elon and Musk: '+ str(len(list(set_same))))
print(sorted(set_same))

Intersection of postings list for each term: Elon and Musk: 46
[3393, 16330, 19262, 21341, 29749, 44321, 45978, 52990, 57023, 57787, 84806, 98830, 115207, 122603, 127050, 128662, 131448, 131514, 146965, 159147, 186107, 194998, 197341, 239304, 240040, 245923, 249585, 274393, 283098, 297139, 303775, 305183, 306988, 341755, 342182, 354346, 369772, 383528, 399001, 399946, 420082, 431495, 431739, 449684, 456443, 482769]
