Import the required libraries

In [3]:
import xml.sax
import re
from collections import defaultdict, OrderedDict
import Stemmer
from os import path, listdir
import sys
import pickle
import time
import heapq

In the cell below, we have instantiated some global variables and initiated the stemmer

In [4]:
#Need to change these below
INPUT_FILE = '../phase-2-data'
OUTPUT = 'index/'
INDEX_STAT = '​invertedindex_stat.txt​'
TOTAL_TOKENS = 0
TOTAL_INV_TOKENS = 0
if OUTPUT[-1]!='/':
    OUTPUT+='/'
STOP_DICT = {}
STOP_FILE = ''
if OUTPUT.split('/')[0] == '2018114017':
    STOP_FILE = '2018114017/frequent.pickle'
else:
    STOP_FILE = 'frequent.pickle'
with open(STOP_FILE, 'rb') as handle:
    STOP_DICT = pickle.load(handle)
handle.close()
stemmer = Stemmer.Stemmer('english')
stem_dict = {}
title_dict = {}
indexMap = defaultdict(list)
file_num = 0
pages = 1

The preprocess function is to process the text. It tokenizes the data, removes unnecessary 
non-ASCII characters and punctuations, stem the words using pystemmer and remove stop words

In [None]:
def preprocess(text):
    tokens = re.sub(r'[^A-Za-z0-9]+', r' ', text).split()
    global TOTAL_TOKENS
    TOTAL_TOKENS += len(tokens)
    stemmed_stop_free = []
    for token in tokens:
        if token not in STOP_DICT:
            temp_stem = ''
            if token in stem_dict:
                temp_stem = stem_dict[token]
            else:
                stem_dict[token] = stemmer.stemWord(token)
                temp_stem = stem_dict[token]
            stemmed_stop_free.append(temp_stem)
            # stemmed_stop_free.append(stemmer.stemWord(token))
    return stemmed_stop_free

The extract_under_ref function separates and preprocesses links, references and categories.
The extract_infobox_and_refs separates and preprocesses infobox contents and references.

In [None]:
def extract_under_ref(splits):
    if len(splits) == 1:
        return [], [], []
    else:
        data = splits[1].split('\n')
        links = []
        refs = []
        categories = []
        for line in data:
            if re.match(r'\*[\ ]*\[', line):
                links.append(line)
            if re.search(r'<ref', line):
                refs.append(re.sub(r'.*title[\ ]*=[\ ]*([^\|]*).*', r'\1', line))
            if re.match(r'\[\[category', line):
                categories.append(re.sub(r'\[\[category:(.*)\]\]', r'\1', line))
        # return links, refs, categories
        return preprocess(' '.join(links)), preprocess(' '.join(refs)), preprocess(' '.join(categories))

def extract_infobox_and_refs(text):
    data = text.split('\n')
    flag = 0
    info = []
    refs2 = []
    for line in data:
        for i in re.findall("{{cite.*title=.*}}", line):
            refs2.append(re.sub(r'.*title[\ ]*=[\ ]*([^\|]*).*', r'\1', line))
        if re.match(r'\{\{infobox', line):
            flag = 1
            info.append(re.sub(r'\{\{infobox(.*)', r'\1', line))
        elif flag == 1:
            if line == '}}':
                flag = 0
                continue
            info.append(line)
    return preprocess(' '.join(info)), preprocess(' '.join(refs2))

The split_page function splits a Wikipedia page into different parts like text, links, refs, body, categories

In [None]:
def split_page( text):
    # text = text.encode("ascii", errors="ignore").decode()
    text = text.lower()
    splits = re.split(r'== ?references.?.? ?==|== ?notes and references ?==',text)
    # global pageCount
    # global titlefile
    # pageCount += 1
    # if pageCount%1000 == 0:
    #     print(pageCount)
    if (len(splits)==1):
        splits = re.split(r'== ?footnotes ?==', splits[0])
    data = {}
    data['links'], data['refs'], data['categories'] = extract_under_ref(splits)
    data['text'] = preprocess(re.sub(r'\{\{.*\}\}', r' ', splits[0]))
    data['infobox'], data['refs2'] = extract_infobox_and_refs(splits[0])
    data['refs'] = data['refs'] + data['refs2']
    return data

The indexify function converts the document into inverted index

In [None]:
def indexify(data):
    global indexMap
    totalFreq = defaultdict(lambda: 0)
    inverted = {}
    for i in ['title','text','infobox','categories','links','refs']:
        d = defaultdict(lambda: 0)
        for word in data[i]:
            d[word] += 1
            totalFreq[word] += 1
        inverted[i] = d
    for word in totalFreq.keys():
        string = 'd'+str(data['id'])
        for i in ['title','text','infobox','categories','links','refs']:
            temp = inverted[i][word]
            if temp:
                if i != 'text':
                    string += i[0] + str(temp)
                else:
                    string += 'b' + str(temp)
        indexMap[word].append(string)

The xml_handler class is used to parse he xml file and call all the above functions

In [None]:
class xml_handler( xml.sax.ContentHandler ):
    def re_init(self):
        global pages
        self.title = ''
        self.text = ''
        # self.hashed = 0
        self.id = ''
        self.pages += 1
        pages += 1

    def __init__(self, start_time, filenum):
        global pages
        self.CurrentData = ''
        self.title = ''
        self.text = ''
        self.id = ''
        self.link_len = 0
        self.ref_len = 0
        self.categories_len = 0
        self.text_len = 0
        self.title_len = 0
        self.info_len = 0
        self.pages = pages
        self.start_time = start_time
        self.filenum = filenum
        # self.hashed = 0

    # Call when an element starts
    def startElement(self, tag, attributes):
        self.CurrentData = tag

    # Call when an elements ends
    def endElement(self, tag):
        if tag == 'page':
            # wiki_page = Page( self.title, self.text, self.id )
            # pages.append(wiki_page)
            data = split_page(self.text)
            data['title'] = preprocess(self.title)
            data['id'] = self.pages
            # if data['refs2'] == []:
            #     print(self.pages)
            # if data['id'] == 0:
            #     print(data)
            title_dict[self.pages] = self.title
            indexify(data)
            self.link_len += len(data['links'])
            self.info_len += len(data['links'])
            self.ref_len += len(data['refs'])
            self.categories_len += len(data['categories'])
            self.text_len += len(data['text'])
            self.title_len += len(data['title'])
            
            self.re_init()
            if self.pages %1000 == 0:
                print(self.link_len, self.ref_len, self.categories_len, self.text_len)
                print("Finished:", self.pages, "pages. Time elapsed:",time.time() - self.start_time )
            if self.pages % 50000 == 0:
                print("Writing temporary index")
                store_index(str(self.filenum)+'_'+str(self.pages // 50000))
    def endDocument(self):
        if self.pages % 50000 != 0:
            print("Writing temporary index")
            store_index(str(self.filenum)+'_'+str(self.pages // 50000 + 1))

    # Call when a character is read
    def characters(self, content):
        if self.CurrentData == 'title':
            self.title += content
        if self.CurrentData == 'text':
            self.text += content
        # if self.CurrentData == 'id' and not self.hashed:
        #     self.id = content
        #     self.hashed = 1

The store_index function stores the index and index stats in files

In [None]:
def store_index(st):
    global indexMap
    index_map_file = []
    for key in sorted(indexMap.keys()):
        string = key + ':' + ' '.join(indexMap[key])
        index_map_file.append(string)
    with open(OUTPUT+'index'+st+'.txt',"w+") as f:
        f.write('\n'.join(index_map_file))
    with open(INDEX_STAT,"w+") as f:
        f.write(str(TOTAL_TOKENS)+'\n')
        f.write(str(len(indexMap))+'\n')
    indexMap = defaultdict(list)

The wiki_parse function calls the xml_handler class and starts parsing the xml file. It also measures the time taken to parse the files.

In [None]:
def wiki_parse():
    print("Starting parser")
    file_num = 0
    total_time = 0
    for filename in listdir(INPUT_FILE):
        parse_start_time = time.time()
        xml_parser = xml.sax.make_parser()
        xml_parser.setFeature(xml.sax.handler.feature_namespaces, 0)
        handler = xml_handler(parse_start_time, file_num)
        xml_parser.setContentHandler(handler)
        xml_parser.parse(INPUT_FILE+'/'+filename)
        print("Parsing finished")
        # store_index(file_num)
        # print("Dumping finished")
        time_taken = time.time() - parse_start_time
        total_time += time_taken
        print("Time taken: ",time_taken)
        print("Time elapsed: ",total_time)
        print("Pages: ",pages)
        total_time += time_taken
        indexMap = defaultdict(list)
        file_num += 1

In [None]:
wiki_parse()

Starting parser
17313 28460 15025 810532
Finished: 1000 pages. Time elapsed: 5.735639572143555
31989 57373 24408 1711617
Finished: 2000 pages. Time elapsed: 11.21688199043274
60323 133724 38229 3808453
Finished: 3000 pages. Time elapsed: 22.05772852897644
78506 178893 51251 5300404
Finished: 4000 pages. Time elapsed: 30.37448263168335
99754 230730 63085 6956289
Finished: 5000 pages. Time elapsed: 39.55445432662964
124720 274799 75055 8616949
Finished: 6000 pages. Time elapsed: 48.02796697616577
144094 315319 86267 9819227
Finished: 7000 pages. Time elapsed: 55.8756320476532
158826 341375 100396 10636739
Finished: 8000 pages. Time elapsed: 61.181695222854614
177828 374383 114373 11589584
Finished: 9000 pages. Time elapsed: 68.1760938167572
198937 409292 131054 12447584
Finished: 10000 pages. Time elapsed: 74.22395157814026
215008 441541 146975 13293078
Finished: 11000 pages. Time elapsed: 80.16527366638184
230518 498431 163383 14349151
Finished: 12000 pages. Time elapsed: 86.53800487518

In [5]:
def merge_index():
    global pages
    file_pointers = []
    wordpostings = defaultdict(lambda: [])
    words = {}
    heap = []
    wordfilemap = defaultdict(lambda: [])
    curline = {}
    finishflag = 1
    flag = 0
    for filename in listdir(OUTPUT):
        if filename.endswith('.txt'):
            print(OUTPUT+filename)
            file_pointers.append(open(OUTPUT+filename, 'r'))
    filecomplete = [0 for i in range(len(file_pointers))]
    ind = open('index/index','w+')
    off = open('index/offset','w+')
    for i in range(len(file_pointers)):
        curline[i] = file_pointers[i].readline().strip()
        print(curline[i])
        word = curline[i].split(':')[0]
        wordfilemap[word].append(i)
        wordpostings[word] += curline[i].split(':')[1].split(" ")
        if word not in heap:
            heapq.heappush(heap,word)
    
    while (finishflag):
        minword = heapq.heappop(heap)
        string = minword + ':' + str(ind.tell())
        string = string.strip() + '\n'
        off.write(string)
        string = minword + ":" + " ".join(wordpostings[minword]) + "\n"
        ind.write(string)
        filenum = wordfilemap[minword]
        # print(wordfilemap)
        wordfilemap.pop(minword)
        for num in filenum:
            nextline = file_pointers[num].readline().strip()
            if nextline == '':
                filecomplete[num] = 1
            else:
                newword = nextline.split(':')[0]
                wordpostings[newword] += nextline.split(':')[1].split(" ")
                # print(wordfilemap[newword])
                if not wordfilemap[newword]:
                    heapq.heappush(heap,newword)
                    wordfilemap[newword].append(num)
                else:
                    wordfilemap[newword].append(num)
        for i in range(len(file_pointers)):
            flag = filecomplete[i] + flag
        flag = int(flag/(len(file_pointers)))

        if flag==1:
            finishflag = 0
    for i in range(len(file_pointers)):
        file_pointers[i].close()

    ind.close()
    off.close()

    return

In [6]:
merge_index()

index/index2_3.txt
index/index1_2.txt
index/index0_1.txt
index/index0_2.txt
index/index1_3.txt
0:d102088b2 d102090b1 d102091b5i13 d102093b1 d102095b45 d102096b5 d102100b51 d102102b1 d102103b1 d102105b65 d102106b2 d102109r1 d102113b1 d102115b2l7 d102118b1l1 d102121b2 d102124b1i2 d102125i1 d102127b6 d102128b4 d102129b3 d102130b33l1 d102132b1i1 d102133b5 d102136b10i2 d102137b1i1 d102139b2 d102141b1i1 d102144b27i2 d102146b6 d102147b1l4 d102148b33l7 d102151b2 d102153b2 d102154b1 d102155b1 d102157b12r2 d102162b2 d102163b2i2 d102165b4l2 d102170b75 d102171b1 d102172b2l4 d102175i4l1 d102177b8 d102178b6 d102179b1 d102180b2 d102181l2 d102185b1i1 d102186l2 d102187b2 d102193b1 d102195b3 d102199b11 d102201b3 d102204b1i2 d102205b1 d102206l3 d102207b3 d102208b1 d102209i1 d102210b2 d102211b2 d102216l1 d102217b2 d102219l2 d102223b1 d102224b1 d102226b28 d102228b1 d102229b5 d102231b2 d102232b11 d102233l4 d102234b13l1 d102235b3 d102236b2 d102237b1 d102241b31 d102247l3 d102248i1 d102254b1 d102256b9i2 d10225

In [None]:
# count = 0
# t_list = []
# for i in indexMap['egypt']:
#     splits = re.split('c',i)
#     if(len(splits) > 1):
#         t_list.append(preprocess(title_dict[int(re.split('d|b|i',splits[0])[1])].lower()))
#         print(int(re.split('d|b|i',splits[0])[1]))
# print(count)

In [None]:
# for i in range(len(title_dict)):
#     if preprocess(title_dict[i].lower()) == ['kellogg', 'briand', 'pact']:
#         print(i)

In [None]:
# string = 'Ancient Egypt; Abydos, Egypt; Amasis II; Ammonius Saccas; Ababda people; Aswan; Abbas II of Egypt; Ambrose of Alexandria; Alexandria; Athanasius of Alexandria; Anthony the Great; Basel Convention; Battle of the Nile; Battle of Actium; Convention on Biological Diversity; CITES; Environmental Modification Convention; Cairo; Clement of Alexandria; Cyril of Alexandria; Coptic Orthodox Church of Alexandria; Duke Nukem 3D; Diophantus; Geography of Egypt; Demographics of Egypt; Politics of Egypt; Economy of Egypt; Telecommunications in Egypt; Transport in Egypt; Egyptian Armed Forces; Foreign relations of Egypt; Book of Exodus; First Battle of El Alamein; Go Down Moses; Great Pyramid of Giza; Great Rift Valley; Herodotus; History of Egypt; International Tropical Timber Agreement, 1983; International Tropical Timber Agreement, 1994; Imhotep; Kyoto Protocol; Kellogg–Briand Pact; Lighthouse of Alexandria; Library of Alexandria; Maimonides; Montreal Protocol; Mark Antony; Metre Convention; Muslim Brotherhood; Munich massacre; Nile; Treaty on the Non-Proliferation of Nuclear Weapons; Ozymandias; Origen; Pachomius the Great; Prospero Alpini; Pompey; Ptolemy; Ptolemaic dynasty; Palestine Liberation Organization; Red Sea; Rosetta Stone; Return to Castle Wolfenstein; Saladin; Sahara desert (ecoregion); Sinai Peninsula; Stargate (film); Saluki; Suez Canal; Six-Day War; Second Battle of El Alamein; Tax'
# gaurang = string.split(';')
# g_list = []
# for i in gaurang:
#     g_list.append(preprocess(i.lower()))

In [None]:
# for element in g_list:
#     if element not in t_list:
#         print(element)

In [None]:
# indexMap['egypt']

In [None]:
# for i in ['2510', '12182', '13075', '19205']:
#     print(title_dict[int(i)], end='; ')

In [None]:
# line = '{{cite news|last=Rendell|first=Ruth|authorlink=Ruth Rendell|title=A most serious and extraordinary problem |url=https://www.theguardian.com/books/2008/sep/13/arthurconandoyle.crime|newspaper=[[The Guardian]]|date= 12 September 2008|accessdate=8 December 2018}}'
# re.sub(r'.*title[\ ]*=[\ ]*([^\|]*).*', r'\1', line)

In [None]:
# TOTAL_TOKENS

In [None]:
# len(indexMap)

In [None]:
filename

NameError: name 'filename' is not defined

In [None]:
help(xml.sax.handler)

In [None]:
del stem_dict