In [1]:
import os
import nltk
import pandas as pd
import time
import string
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

### 0.0 List the files in the specified directory (Section 0.0)

In [2]:
def listFile(d):
    """Return the absolute path of every entry directly inside directory ``d``.

    Entries come back in whatever order ``os.listdir`` yields them; nothing is
    filtered, so sub-directories and non-text files are included as well.
    """
    entries = []
    for name in os.listdir(d):
        joined = os.path.join(d, name)
        entries.append(os.path.abspath(joined))
    return entries

### 0.1 Read the file and return the content (Section 0.1)

In [3]:
def readFile(d):
    """Read the file at path ``d`` as UTF-8 text and return its whole content.

    Parameters
    ----------
    d : str
        Path of the file to read.

    Returns
    -------
    str
        The complete decoded content of the file.
    """
    # The context manager closes the handle even if reading fails; the handle
    # used to be left open, which leaks one descriptor per document indexed.
    with open(d, "r", encoding='utf-8') as handle:
        return handle.read()

### 1. Create token (Section 1)
- input (content, path of the document; the document id is derived from the file name)
- output (pairs of token and document id)

In [4]:
def createToken(content, d):
    """Split ``content`` on whitespace and pair every token with a document id.

    Parameters
    ----------
    content : str
        Raw text to tokenize.
    d : str
        Path of the source document. The document id is the base name of this
        path with every ".txt" occurrence removed.

    Returns
    -------
    list
        One ``[token, document_id]`` pair per whitespace-separated token, in
        order of appearance.
    """
    did = os.path.basename(d).replace(".txt","")
    tokenizer = WhitespaceTokenizer()
    # Build the result in a single pass. The earlier `tokens = tokens + [...]`
    # copied the whole list for each token, i.e. quadratic in document length.
    return [[t, did] for t in tokenizer.tokenize(content)]

### 2. Linguistic normalisation (punctuation removal, lowercasing, lemmatization)

In [5]:
def linguisticToken(token_list):
    """Normalise ``[token, document_id]`` pairs for indexing.

    Each token has the characters in ``string.punctuation`` removed, is
    lowercased and is then lemmatized with ``WordNetLemmatizer``. Tokens that
    are empty after punctuation removal are dropped.

    Parameters
    ----------
    token_list : iterable
        Two-element ``[token, document_id]`` pairs.

    Returns
    -------
    list
        New ``[normalised_token, document_id]`` pairs in input order; the
        input pairs are not modified.
    """
    lemmer = WordNetLemmatizer()
    # The translation table does not depend on the token, so build it once
    # instead of once per token.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    tokens = list()
    for t, d in token_list:
        token = t.translate(strip_punctuation) #remove punctuations
        if token == '': #if the token is only punctuation
            continue
        tokens.append([lemmer.lemmatize(token.lower()), d])
    return tokens

### 3. Sorting

In [6]:
def sortToken(token_list):
    """Sort ``[term, document_id]`` pairs in place and return the same list.

    Pairs are ordered by term first and by document id second; both parts are
    compared as they are stored, so string ids order lexicographically.
    """
    def term_then_doc(pair):
        return pair[0], pair[1]

    token_list.sort(key=term_then_doc)
    return token_list

### 4. Transform into postings

In [7]:
def transformPosting(sorted_list):
    """Group ``(term, document_id)`` pairs into a term -> posting dictionary.

    For every term the document ids are de-duplicated and ordered by their
    integer value. The value stored per term is the tuple
    ``(number_of_documents, list_of_document_ids)``.
    """
    grouped = {}
    for pair in sorted_list:
        term, doc_id = pair
        if term not in grouped:
            grouped[term] = []
        grouped[term].append(doc_id)

    postings = {}
    for term, doc_ids in grouped.items():
        # dict.fromkeys drops repeats while keeping first-seen order
        unique_ids = sorted(dict.fromkeys(doc_ids), key=int)
        postings[term] = (len(unique_ids), unique_ids)
    return postings

### 5. Merge the Postings (Intersecting)

In [8]:
def mergePostings(postingList):
    """Intersect several postings lists (boolean AND).

    Parameters
    ----------
    postingList : list
        Postings lists to intersect. Each one holds document ids (values
        accepted by ``int``) in ascending numeric order, as the two-pointer
        walk below relies on that ordering.

    Returns
    -------
    list
        Document ids present in every list, taken from the first list and in
        ascending order. An empty ``postingList`` yields ``[]``; a single list
        is returned unchanged.
    """
    if not postingList:
        # Nothing to intersect; indexing postingList[0] would raise IndexError.
        return []

    result = postingList[0]
    for other in postingList[1:]:
        if not result:
            # The intersection is already empty and cannot grow again.
            break
        common = []
        p = 0
        q = 0
        while p < len(result) and q < len(other):
            left = int(result[p])
            right = int(other[q])
            if left == right:
                common.append(result[p])
                p += 1
                q += 1
            elif left < right:
                p += 1
            else:
                q += 1
        result = common
    return result

## Create Index (Main)

In [9]:
# Build the inverted index for every document in the corpus directory and
# report how long the build takes.
# perf_counter is a monotonic clock meant for measuring elapsed time;
# time.time() can jump if the system clock is adjusted during the run.
start_time = time.perf_counter()

files = listFile("HillaryEmails") #list dir
tokens = list()
for docs in files:
    file_content = readFile(docs) #read content
    token = createToken(file_content,docs) #create token
    token = linguisticToken(token) #normalise: strip punctuation, lowercase, lemmatize
    tokens += token
tokens = sortToken(tokens) #token from all files
posting = transformPosting(tokens) #create posting from these files

end_time = time.perf_counter()
time_to_index = end_time - start_time
print("Time taken to index: " , round(time_to_index,3))

Time taken to index:  101.583


In [None]:
# Interactive query prompt. Only the tokenisation step exists so far; the
# tokens are not yet looked up in `posting`.
while True:
    query = input("Enter a query: ")
    if not query.strip():
        # A blank query ends the session; the loop previously had no exit.
        break
    # createToken needs a document path as second argument. A query has none,
    # so pass "" (the resulting document id is the empty string).
    q_token = createToken(query, "")
    operator = [i for i in q_token]

In [None]:
# Tabular view of the inverted index, used by the inspection cells that
# reference `t`: one row per term, column 0 holds the number of documents and
# column 1 the list of document ids (the two parts of each `posting` value).
t = pd.DataFrame.from_dict(posting, orient="index")
#t.to_csv("posting.csv")  # optional: persist the table for offline inspection

In [11]:
# Dimensions of the index table `t` as (rows, columns).
# NOTE(review): `t` is expected to be the per-term table built from `posting`;
# confirm it is defined before this cell runs on a fresh kernel.
t.shape

(72052, 2)

In [14]:
# Peek at 50 randomly chosen rows of the index table. A fixed random_state
# makes the same rows appear on every run, so the displayed output can be
# reproduced.
t.sample(50, random_state=0)

Unnamed: 0,0,1
rushat,1,[5961]
152010,1,[5587]
locate,17,"[10, 12, 21, 26, 1734, 2815, 3738, 4332, 4582,..."
c05772256,1,[6471]
lanny,15,"[512, 513, 591, 1402, 1875, 2144, 2151, 2805, ..."
flux,1,[6755]
haaretzherald,1,[5789]
substantively,13,"[525, 526, 527, 528, 2400, 2401, 2402, 3521, 4..."
etonian,8,"[1066, 1175, 1176, 3252, 3506, 3509, 3643, 3644]"
171126,2,"[6054, 6971]"


In [20]:

# Run both query words through the same normalisation as the indexed tokens
# so that they match the dictionary keys; the document id slot is left empty.
searchA, searchB = (
    linguisticToken([[word, '']])[0][0] for word in ('canton', 'obama')
)
print(searchA, searchB, sep="\n")

canton
obama


In [21]:
# AND query: intersect the postings lists (element 1 of each dictionary
# entry) of the two normalised search terms.
search_result = mergePostings([posting[term][1] for term in (searchA, searchB)])
search_result

['152', '6073', '6079']

In [None]:
# Manual check of the intersection on two literal postings lists.
# The call used the name `mergePostings1`, which is not defined anywhere in
# this notebook; the intersection routine is `mergePostings`.
# Expected result: ['969', '3154', '5107', '5490']
mergePostings([['969', '1628', '1645', '1893', '1943', '2472', '3154', '5107', '5490', '5843', '5922', '7081', '7479'],
              ['10', '12', '21', '23', '24', '26', '32', '43', '81', '86', '98', '128', '131', '134', '135', '139', '140', '143', '163', '168', '175', '204', '215', '266', '267', '269', '290', '300', '328', '389', '390', '443', '460', '462', '493', '591', '697', '733', '772', '795', '797', '798', '802', '885', '918', '921', '953', '954', '965', '966', '969', '979', '980', '982', '997', '1004', '1052', '1056', '1066', '1082', '1097', '1119', '1190', '1198', '1199', '1200', '1233', '1294', '1299', '1359', '1360', '1408', '1412', '1459', '1472', '1474', '1485', '1563', '1601', '1602', '1607', '1615', '1639', '1688', '1745', '1760', '1797', '1815', '1816', '1860', '1861', '1921', '1922', '1941', '1996', '2051', '2060', '2064', '2065', '2119', '2165', '2170', '2172', '2200', '2203', '2213', '2228', '2247', '2277', '2348', '2394', '2449', '2474', '2479', '2482', '2532', '2565', '2604', '2647', '2648', '2700', '2701', '2704', '2755', '2758', '2764', '2781', '2805', '2864', '2868', '2875', '2894', '2933', '2940', '2942', '2948', '2978', '2979', '3014', '3054', '3055', '3152', '3153', '3154', '3165', '3166', '3169', '3170', '3171', '3178', '3193', '3209', '3217', '3252', '3268', '3273', '3274', '3301', '3392', '3394', '3430', '3452', '3472', '3478', '3479', '3497', '3498', '3499', '3502', '3567', '3568', '3571', '3593', '3606', '3615', '3640', '3655', '3688', '3692', '3693', '3695', '3696', '3709', '3710', '3720', '3738', '3740', '3742', '3755', '3769', '3771', '3802', '3839', '3849', '3884', '3892', '3913', '3948', '3949', '3995', '4040', '4044', '4045', '4060', '4065', '4119', '4120', '4141', '4144', '4145', '4146', '4163', '4171', '4181', '4232', '4233', '4264', '4276', '4292', '4304', '4320', '4332', '4334', '4354', '4394', '4399', '4405', '4408', '4416', '4428', '4429', '4459', '4467', '4489', '4510', '4511', '4514', '4547', '4558', '4591', '4597', '4599', '4616', '4632', '4639', '4653', '4663', '4666', '4669', '4694', '4697', '4700', '4701', '4704', '4708', '4712',
               '4767', '4794', '4875', '4899', '4900', '4906', '4953', '4968', '4969', '4971', '5031', '5032', '5036', '5037', '5098', '5102', '5106', '5107', '5115', '5116', '5133', '5162', '5164', '5168', '5172', '5175', '5208', '5211', '5212', '5217', '5219', '5236', '5237', '5238', '5248', '5264', '5279', '5287', '5288', '5300', '5402', '5412', '5413', '5433', '5434', '5436', '5452', '5490', '5538', '5585', '5595', '5599', '5600', '5602', '5609', '5627', '5642', '5646', '5648', '5653', '5669', '5676', '5677', '5685', '5715', '5719', '5721', '5788', '5789', '5806', '5879', '5880', '5897', '5900', '5901', '5928', '5945', '5979', '5982', '5991', '5998', '5999', '6006', '6009', '6011', '6021', '6037', '6054', '6058', '6069', '6112', '6182', '6210', '6258', '6270', '6280', '6283', '6308', '6335', '6377', '6387', '6439', '6443', '6479', '6480', '6487', '6495', '6499', '6500', '6504', '6507', '6514', '6528', '6539', '6568', '6570', '6574', '6602', '6627', '6634', '6635', '6637', '6638', '6649', '6650', '6685', '6699', '6707', '6708', '6716', '6718', '6751', '6759', '6764', '6769', '6801', '6808', '6834', '6888', '6907', '6916', '6924', '6971', '6974', '7031', '7036', '7038', '7041', '7042', '7073', '7080', '7085', '7087', '7100', '7117', '7119', '7134', '7162', '7165', '7245', '7257', '7271', '7278', '7279', '7280', '7339', '7345', '7359', '7378', '7394', '7398', '7403', '7407', '7410', '7423', '7453', '7455', '7457', '7458', '7459', '7460', '7465', '7474', '7501', '7503', '7504', '7569', '7570', '7571', '7576', '7577', '7582', '7584', '7608', '7645', '7686', '7691', '7699', '7705', '7728', '7761', '7767', '7769', '7796', '7846', '7880', '7907', '7909', '7929', '7940', '7943'],
              ])

In [16]:
import sys


In [17]:
# Size in bytes of the `posting` dict object itself. sys.getsizeof is shallow:
# the term strings, tuples and postings lists the dict refers to are not
# counted, so this understates the index's real memory footprint.
sys.getsizeof(posting)

2621544