In [5]:
import re
import os

In [6]:
# Dump file that parse_text() seeks into to read article text.
xml_path = '/mnt/sdb1/chenzhongyu/enwiki-20191101-pages-articles-multistream.xml'
# Root directory holding every index family used in this notebook.
index_path = '/mnt/sdb1/chenzhongyu/indexes'

# Each family is a data directory plus a sibling "<data>_index" directory;
# the pair is what Index(path, target) is constructed with.
tf_path = os.path.join(index_path, 'inverted_index_tf_lossy')
tf_index_path = tf_path + '_index'

id_path = os.path.join(index_path, 'inverted_index_id')
id_index_path = id_path + '_index'

text_path = os.path.join(index_path, 'search_index')
text_index_path = text_path + '_index'

In [7]:
import os


class Index:
    """Key -> line lookup backed by on-disk offset tables.

    ``path`` is a directory of index files. Every regular file in it whose
    name does not start with ``.`` holds lines of the form
    ``key<TAB>name,start``: ``name`` is a file inside the ``target`` directory
    and ``start`` is the byte offset of the key's line within that file.
    Calling the instance with a key returns the last tab-separated field of
    that line.
    """

    def __init__(self, path, target, encoding='utf-8'):
        """Load every index file found in ``path`` and print a summary.

        Args:
            path: Directory containing the index files.
            target: Directory containing the data files the index points into.
            encoding: Text encoding of the index and data files. Pinned
                explicitly so lookups do not depend on the machine's locale.
        """
        self._index = {}
        self._target = target
        self._encoding = encoding
        self._load(path)
        print('Loaded {}, with {} indexes, target for {}'.format(path, len(self._index), self._target))

    def __call__(self, key):
        """Return the payload stored for ``key``.

        ``key`` is converted with ``str`` first, so integer document ids work.

        Raises:
            KeyError: if ``key`` was not present in any loaded index file.
        """
        key = str(key)
        # Binary mode: seeking a text-mode handle to an offset that did not
        # come from tell() is undefined, and ``start`` is a raw file offset.
        with open(os.path.join(self._target, self._get_name(key)), 'rb') as f:
            f.seek(self._get_start(key))
            line = f.readline().decode(self._encoding)
        return line.strip().split('\t')[-1]

    def _add(self, key, name, start):
        """Record that ``key`` lives in data file ``name`` at offset ``start``."""
        self._index[key] = {
            'name': name,
            'start': int(start)
        }

    def _load(self, path):
        """Read every non-hidden regular file in ``path`` into the table."""
        for name in os.listdir(path):
            file = os.path.join(path, name)
            if os.path.isdir(file) or name[0] == '.':
                continue
            with open(file, 'r', encoding=self._encoding) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Tolerate blank lines (e.g. a trailing newline).
                        continue
                    split = line.split('\t')
                    specs = split[1].split(',')
                    self._add(split[0], specs[0], specs[1])

    def _get_name(self, word):
        """Name of the data file holding ``word``; KeyError if unknown."""
        return self._index[word]['name']

    def _get_start(self, word):
        """Offset of ``word``'s line inside its data file; KeyError if unknown."""
        return self._index[word]['start']

In [8]:
def parse_text(spec):
    """Turn a document-info record into a dict and load the article text.

    ``spec`` is a comma-separated record whose last five fields are
    contributor name, contributor id, timestamp, text start and text length.
    Everything before those five fields is the title, which may itself
    contain commas.

    Args:
        spec: Record string, e.g. ``'anarchism,,,2019-10-29t19:52:20z,3926,104891'``.

    Returns:
        dict with keys ``title``, ``contri_name``, ``contri_id``,
        ``timestamp`` and ``text``. ``text`` is ``''`` when either the start
        or the length is negative; otherwise it is read from ``xml_path``.
    """
    fields = spec.strip().split(',')
    start = int(fields[-2])
    length = int(fields[-1])
    text = ''
    if start >= 0 and length >= 0:
        # NOTE(review): the handle is opened in text mode, so read(length)
        # counts decoded characters while seek(start) is given a file offset;
        # confirm which unit the indexer used for ``length``.
        with open(xml_path, 'r') as f:
            f.seek(start)
            text = f.read(length)
    return {
        # Re-insert the commas that split() removed from inside the title.
        'title': ','.join(fields[:-5]),
        'contri_name': fields[-5],
        'contri_id': fields[-4],
        'timestamp': fields[-3],
        'text': text
    }

In [9]:
def parse_id_tf(spec):
    """Extract the first two ``-``-separated integers of a posting.

    A posting looks like ``'17675-2-21197,19329'``; the result for it is
    ``[17675, 2]``. Anything after the second field is ignored.
    """
    parts = spec.strip().split('-')
    return [int(parts[0]), int(parts[1])]

In [10]:
def parse_tf(spec, start, amount, text_index):
    """Fetch ``amount`` documents from a tf-ordered posting string.

    ``spec`` has the form ``'count;posting;posting;...;'``. Inside a run of
    consecutive postings that share the same tf, only the first id is
    absolute; each following id is a delta on the previous posting of the run.

    Args:
        spec: Raw posting string (a trailing ``;`` is accepted).
        start: Zero-based position of the first posting to return; negative
            values are treated as 0.
        amount: Maximum number of documents to return.
        text_index: Callable mapping a document id to the record string that
            ``parse_text`` accepts.

    Returns:
        List of ``parse_text`` results, at most ``amount`` long. Empty when
        ``amount <= 0`` or ``start`` is past the last posting.
    """
    # Drop the leading count and the empty piece produced by the trailing ';',
    # which would otherwise reach parse_id_tf('') near the end of the list.
    postings = spec.strip().strip(';').split(';')[1:]
    start = max(start, 0)
    if amount <= 0 or start >= len(postings):
        return []

    # Resolve the absolute id of the first requested posting by walking back
    # through the run of equal-tf postings that precedes it.
    first = parse_id_tf(postings[start])
    for raw in reversed(postings[:start]):
        doc = parse_id_tf(raw)
        if doc[1] != first[1]:
            break
        first[0] += doc[0]

    doc_list = [parse_text(text_index(first[0]))]
    previous = first
    for raw in postings[start + 1:start + amount]:
        doc = parse_id_tf(raw)
        if doc[1] == previous[1]:
            # Same tf run: the stored id is a delta on the previous one.
            doc[0] += previous[0]
        previous = doc
        doc_list.append(parse_text(text_index(doc[0])))
    return doc_list

In [11]:

# Lookup: word -> posting list ordered by term frequency.
tf_index = Index(path=tf_index_path, target=tf_path)

Loaded /mnt/sdb1/chenzhongyu/indexes/inverted_index_tf_lossy_index, with 19092140 indexes, target for /mnt/sdb1/chenzhongyu/indexes/inverted_index_tf_lossy


In [12]:

# Lookup: document id -> document-info record consumed by parse_text().
text_index = Index(path=text_index_path, target=text_path)

Loaded /mnt/sdb1/chenzhongyu/indexes/search_index_index, with 19758736 indexes, target for /mnt/sdb1/chenzhongyu/indexes/search_index


In [13]:
tf_index('anarchism')
# word -> posting list of documents, sorted by descending term frequency (tf)

'900;57203187-520-159270,23,10,34,10,28,10,30,37,43,51,52,52,52,52,52,52,52,52,52,52,51,52,52,52,52,52,52,52,52,52,52,51,52,51,51,51,51,51,51,44,53,54,54,54,54,54,54,54,54,54,54,53,54,54,54,54,54,54,54,54,54,54,53,54,54,53,53,53,53,53,53,46,58,58,58,58,57,58,58,58,58,58,58,58,58,58,57,57,46,57,58,58,58,58,58,58,58,58,58,58,57,58,58,58,58,58,58,58,58,58,58,57,57,57,57,57,57,57,45,55,56,56,56,56,56,56,56,56,56,56,55,56,56,56,56,56,56,56,56,56,56,55,56,56,55,55,55,55,55,55,42,49,50,50,50,50,50,50,50,50,50,50,49,50,50,50,50,50,50,50,50,50,50,49,50,50,49,49,49,49,49,49,42,49,50,50,50,50,50,50,50,50,50,50,49,50,50,50,50,50,50,50,50,50,50,49,50,49,49,49,49,49,49,43,51,52,52,52,52,52,52,52,52,52,52,51,52,52,52,52,52,52,52,52,52,52,51,52,52,51,51,51,51,51,51,41,47,48,48,48,48,48,48,48,48,48,48,47,48,48,48,48,48,48,48,48,48,48,47,48,48,47,47,47,47,47,47,46,58,58,58,58,57,58,58,58,58,58,57,57,45,55,56,56,56,56,56,56,56,56,56,56,55,56,56,56,56,56,56,56,56,56,56,55,56,56,55,55,55,55,55,55,47,59,60,

In [14]:
text_index('12')

'anarchism,,,2019-10-29t19:52:20z,3926,104891'

In [15]:
t = parse_text(text_index('20707934'))

In [16]:
l = parse_tf(tf_index('anarchism'), 22, 2, text_index)

In [17]:
l[0]['text'][206:206+10]

'Anarchism '

In [18]:
l = parse_tf(tf_index('apoka'), 0, 10, text_index)
# tf-sorted doc list -> parsed document records (title, contributor, timestamp, text)

In [19]:
l[0]['text'][1019:1024+2]
#l[0]['text'][1602+2:1607+2]
#l[0]['text'][1871+2:1876+2]
l[0]

{'title': 'vilius šapoka',
 'contri_name': '',
 'contri_id': '',
 'timestamp': '2018-06-12t17:32:42z',
 'text': ">{{Infobox officeholder\n|name          = Vilius Šapoka\n|image         = Vilius Šapoka (36840220810) (cropped).jpg\n|office        = [[Ministry of Finance (Lithuania)|Minister of Finance]]\n|primeminister = [[Saulius Skvernelis]]\n|term_start    = 13 December 2016\n|term_end      = \n|predecessor   = [[Rasa Budbergytė]]\n|successor     = \n|birth_date    = {{birth date and age|1978|12|14|df=y}}\n|birth_place   = \n|death_date    = \n|death_place   = \n|party         = [[Independent politician|Independent]]\n|alma_mater    = [[Vilnius University]]\n}}\n'''Vilius Šapoka''' (born 14 December 1978) is the current [[Ministry of Finance (Lithuania)|Minister of Finance]] in the [[Lithuania|Republic of Lithuania]] and Governor of the [[European Investment Bank]] for the Republic of Lithuania.&lt;ref&gt;{{cite web|url=http://www.eib.org/infocentre/press/news/all/vilius-sapoka-new-go

In [20]:
# Lookup: word -> posting list with delta-encoded document ids.
id_index = Index(path=id_index_path, target=id_path)

Loaded /mnt/sdb1/chenzhongyu/indexes/inverted_index_id_index, with 19099730 indexes, target for /mnt/sdb1/chenzhongyu/indexes/inverted_index_id


In [21]:
tf_index('apoka')

'38;53733124-13-48,31,474,360,150,60,176,213,164,94,150,937,1048;26551591-3-1019,583,269;32800641-3-15739,83,106;17675-2-21197,19329;19615-2-20889,15372;5296-2-49904,81;33334417-2-724,1281;9029121-2-915,7;13527402-2-360,2424;17820-1-158000;782011-1-9485;470221-1-14331;186271-1-21700;497194-1-4569;4172431-1-28302;1359755-1-2267;1716186-1-5583;3372142-1-14521;4195342-1-19340;2374282-1-15509;2390274-1-12424;469227-1-7479;6590756-1-6730;2590556-1-11136;1314756-1-838;7566951-1-10516;209219-1-13242;7031262-1-16700;1696766-1-6895;3572102-1-8579;2112636-1-30890;1546599-1-15444;240877-1-731;75067-1-38701;1474733-1-2020;664302-1-36814;805548-1-147384;509771-1-148522;'

In [22]:
text_index('53733124')

'vilius šapoka,,,2018-06-12t17:32:42z,65443886317,4172'

In [23]:
parse_text(text_index('53733124'))['text'][48:48+10]

'Šapoka\n|im'

In [24]:
def parse_id(spec, start, amount, text_index):
    """Fetch ``amount`` documents from an id-ordered posting string.

    ``spec`` has the form ``'count;posting;posting;...;'`` where every stored
    document id is a delta on the previous posting, so the absolute id of a
    posting is the running sum of all ids up to and including it.

    Args:
        spec: Raw posting string (a trailing ``;`` is accepted).
        start: Zero-based position of the first posting to return; negative
            values are treated as 0.
        amount: Maximum number of documents to return.
        text_index: Callable mapping a document id to the record string that
            ``parse_text`` accepts.

    Returns:
        List of ``parse_text`` results, at most ``amount`` long. Empty when
        ``amount <= 0`` or ``start`` is past the last posting.
    """
    # Drop the leading count and the empty piece produced by the trailing ';',
    # which would otherwise reach parse_id_tf('') on the last posting.
    postings = spec.strip().strip(';').split(';')[1:]
    start = max(start, 0)
    if amount <= 0:
        return []

    # Absolute id reached just before the first requested posting.
    id_sum = 0
    for raw in postings[:start]:
        id_sum += parse_id_tf(raw)[0]

    doc_list = []
    for raw in postings[start:start + amount]:
        id_sum += parse_id_tf(raw)[0]
        doc_list.append(parse_text(text_index(id_sum)))
    return doc_list

In [25]:
id_index('apoka')

'38;17675-2-21197,19329;145-1-158000;19470-2-20889,15372;5296-2-49904,81;757245-1-9485;470221-1-14331;186271-1-21700;497194-1-4569;4172431-1-28302;1359755-1-2267;1716186-1-5583;3372142-1-14521;4195342-1-19340;2374282-1-15509;2390274-1-12424;469227-1-7479;4548435-3-1019,583,269;2042321-1-6730;2590556-1-11136;1314756-1-838;877779-2-724,1281;6689172-1-10516;209219-1-13242;2130730-2-915,7;4900532-1-16700;1696766-1-6895;3572102-1-8579;1157600-13-48,31,474,360,150,60,176,213,164,94,150,937,1048;955036-1-30890;1245366-2-360,2424;301233-1-15444;240877-1-731;75067-1-38701;1474733-1-2020;664302-1-36814;662494-3-15739,83,106;143054-1-147384;509771-1-148522;'

In [26]:
l = parse_id(id_index('apoka'), 2, 3, text_index)

In [27]:
l[0]['text'][20889:20889+10]
l[0]['text'][20889+15372:20889+15372+10]
l[1]['text'][49904:49904+10]
l[1]['text']



In [28]:
text_index('17675')

'lithuania,vif12vf,20306027,2019-11-01t16:11:26z,375527523,298419'

In [29]:
parse_text(text_index('17675'))['text'][21197:21197+10]

'apoka|firs'

In [30]:
id_index('apoka')

'38;17675-2-21197,19329;145-1-158000;19470-2-20889,15372;5296-2-49904,81;757245-1-9485;470221-1-14331;186271-1-21700;497194-1-4569;4172431-1-28302;1359755-1-2267;1716186-1-5583;3372142-1-14521;4195342-1-19340;2374282-1-15509;2390274-1-12424;469227-1-7479;4548435-3-1019,583,269;2042321-1-6730;2590556-1-11136;1314756-1-838;877779-2-724,1281;6689172-1-10516;209219-1-13242;2130730-2-915,7;4900532-1-16700;1696766-1-6895;3572102-1-8579;1157600-13-48,31,474,360,150,60,176,213,164,94,150,937,1048;955036-1-30890;1245366-2-360,2424;301233-1-15444;240877-1-731;75067-1-38701;1474733-1-2020;664302-1-36814;662494-3-15739,83,106;143054-1-147384;509771-1-148522;'

In [31]:
id_index('apoka').strip(';')

'38;17675-2-21197,19329;145-1-158000;19470-2-20889,15372;5296-2-49904,81;757245-1-9485;470221-1-14331;186271-1-21700;497194-1-4569;4172431-1-28302;1359755-1-2267;1716186-1-5583;3372142-1-14521;4195342-1-19340;2374282-1-15509;2390274-1-12424;469227-1-7479;4548435-3-1019,583,269;2042321-1-6730;2590556-1-11136;1314756-1-838;877779-2-724,1281;6689172-1-10516;209219-1-13242;2130730-2-915,7;4900532-1-16700;1696766-1-6895;3572102-1-8579;1157600-13-48,31,474,360,150,60,176,213,164,94,150,937,1048;955036-1-30890;1245366-2-360,2424;301233-1-15444;240877-1-731;75067-1-38701;1474733-1-2020;664302-1-36814;662494-3-15739,83,106;143054-1-147384;509771-1-148522'

In [36]:
def parse_list_id(word):
    """Decode the id-ordered posting list of ``word``.

    Looks ``word`` up in ``id_index``, drops the leading count field and
    returns ``[doc_id, tf]`` pairs. Stored ids are deltas on the previous
    posting, so they are accumulated into absolute ids here.
    """
    raw_postings = id_index(word).strip(';').split(';')[1:]
    postings = []
    running_id = 0
    for raw in raw_postings:
        fields = raw.split('-')
        running_id += int(fields[0])
        postings.append([running_id, int(fields[1])])
    return postings
    


In [37]:
def _intersect_postings(word_list):
    """Return ``[doc_id, [tf per list]]`` for ids present in every list.

    Each element of ``word_list`` is a list of ``[doc_id, tf]`` pairs sorted
    by ascending ``doc_id``. One cursor per list is advanced monotonically.
    """
    cursors = [0] * len(word_list)
    matches = []
    for doc_id, first_tf in word_list[0]:
        tf_list = [first_tf]
        for i in range(1, len(word_list)):
            postings = word_list[i]
            while cursors[i] < len(postings) and postings[cursors[i]][0] < doc_id:
                cursors[i] += 1
            if cursors[i] >= len(postings) or postings[cursors[i]][0] != doc_id:
                # doc_id is missing from this list; no need to check the rest.
                break
            tf_list.append(postings[cursors[i]][1])
        else:
            matches.append([doc_id, tf_list])
    return matches


def id_retrieval(query, start, amount):
    """Conjunctive (AND) search over the id-ordered index.

    The query is lower-cased and split into ASCII-letter words. A document
    matches when it appears in the posting list of every word. Its score is
    the sum, over the words, of the document's tf divided by the largest tf
    that word has among the matching documents.

    Args:
        query: Free-text query.
        start: Rank of the first result to return (0 = best).
        amount: Maximum number of results to return.

    Returns:
        List of ``(title, doc_id, score)`` tuples ordered by descending score.
        Empty when the query has no words, when any word is missing from the
        index, or when no document contains every word.
    """
    # dict.fromkeys de-duplicates while keeping the query's word order, so the
    # order in which scores are summed is the same on every run.
    words = list(dict.fromkeys(re.findall('[A-Za-z]+', query.strip().lower())))
    if not words:
        return []
    try:
        word_list = [parse_list_id(word) for word in words]
    except KeyError:
        # The index lookup raises KeyError for an unknown word, and a word
        # that occurs nowhere cannot be part of an AND match.
        return []

    matches = _intersect_postings(word_list)

    # Per-word normaliser: the largest tf seen among the matching documents.
    scale_ratio = [0] * len(word_list)
    for _, tf_list in matches:
        for j, tf in enumerate(tf_list):
            scale_ratio[j] = max(scale_ratio[j], tf)

    final_result = []
    for doc_id, tf_list in matches:
        doc_score = 0
        for tf, scale in zip(tf_list, scale_ratio):
            doc_score += tf / scale
        final_result.append((doc_id, doc_score))
    final_result.sort(key=lambda item: item[1], reverse=True)

    query_result = []
    for doc_id, doc_score in final_result[start:start + amount]:
        # parse_text also loads the article body; only the title is kept.
        title = parse_text(text_index(doc_id))['title']
        query_result.append((title, doc_id, doc_score))
    return query_result

In [69]:
a=[1,2,3,4]
a[1:len(a)-1]

[2, 3]

In [39]:
result = id_retrieval('distributed systems', 0, 5)

In [40]:
result

[('distributed computing', 8501, 1.1448763250883391),
 ('mohamed e. el-hawary', 51274281, 1.0810810810810811),
 ('wikipedia:wikiproject academic journals/journals cited by wikipedia/publisher3',
  60523485,
  1.0708385063508739),
 ('list of types of systems theory', 13314857, 0.9711106866583898),
 ('list of acquisitions by cisco systems', 1717097, 0.9643539299016332)]

In [None]:
re.findall('[A-Za-z]+', 'apple banan sdfad')

In [38]:
test_search = id_retrieval('apoka', 0, 5) 
test_search

[('vilius šapoka', 53733124, 1.0),
 ('naše věc', 26551591, 0.23076923076923078),
 ('union for the liberation of vilnius', 59352232, 0.23076923076923078),
 ('lithuania', 17675, 0.15384615384615385),
 ('saint casimir', 37290, 0.15384615384615385)]

In [71]:
def tf_retrieval(query, start, amount):
    """Disjunctive (OR) search over the tf-ordered index.

    The query is lower-cased and split into ASCII-letter words. For every
    word, each document in its posting list receives that word's tf divided
    by the tf of the list's first posting (the sample lists are ordered by
    descending tf, so this is the largest). A document's score is the sum of
    these contributions.

    Args:
        query: Free-text query.
        start: Rank of the first result to return (0 = best).
        amount: Maximum number of results to return.

    Returns:
        List of ``(title, doc_id, score)`` tuples ordered by descending score.
        Words that are missing from the index or have an empty posting list
        contribute nothing instead of aborting the search.
    """
    # dict.fromkeys de-duplicates while keeping the query's word order, so the
    # order in which scores are summed is the same on every run.
    words = list(dict.fromkeys(re.findall('[A-Za-z]+', query.strip().lower())))
    if not words:
        return []

    scores = {}
    for word in words:
        try:
            postings = parse_list_tf(word)
        except KeyError:
            # The index lookup raises KeyError for an unknown word.
            continue
        if not postings:
            continue
        scale_ratio = postings[0][1]
        for doc_id, tf in postings:
            scores[doc_id] = scores.get(doc_id, 0) + tf / scale_ratio
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    result = []
    for doc_id, doc_score in ranked[start:start + amount]:
        # parse_text also loads the article body; only the title is kept.
        title = parse_text(text_index(doc_id))['title']
        result.append((title, doc_id, doc_score))
    return result

In [72]:
def parse_list_tf(word):
    """Decode the tf-ordered posting list of ``word``.

    Looks ``word`` up in ``tf_index``, drops the leading count field and
    returns ``[doc_id, tf]`` pairs. When a posting has the same tf as the one
    before it, its stored id is a delta and is added to the previous id.
    """
    entries = tf_index(word).strip(';').split(';')[1:]
    postings = []
    for raw in entries:
        fields = raw.split('-')
        doc_id = int(fields[0])
        tf = int(fields[1])
        if postings and postings[-1][1] == tf:
            doc_id += postings[-1][0]
        postings.append([doc_id, tf])
    return postings

In [73]:
tf_retrieval('apoka', 0, 5)

[('vilius šapoka', 53733124, 1.0),
 ('naše věc', 26551591, 0.23076923076923078),
 ('union for the liberation of vilnius', 59352232, 0.23076923076923078),
 ('lithuania', 17675, 0.15384615384615385),
 ('saint casimir', 37290, 0.15384615384615385)]

In [74]:
tf_retrieval('distributed systems', 0, 5)

[('distributed computing', 8501, 1.0445420160287777),
 ('list of multiplanetary systems', 551330, 1.0),
 ('studiocanal', 946329, 1.0),
 ('list of systems engineering universities', 12118737, 0.9226069246435845),
 ('distributed operating system', 26524575, 0.7682439759832835)]

In [76]:
parse_text(text_index('8501'))

{'title': 'distributed computing',
 'contri_name': 'mckay',
 'contri_id': '19640',
 'timestamp': '2019-09-23t03:27:10z',
 'text': "{{short description|System whose components are located on different networked computers}}\n{{redirect|Distributed application|trustless applications|Decentralized application}}\n{{Redir|Distributed Information Processing|the computer company|DIP Research}}\n'''Distributed computing '''is a field of [[computer science]] that studies distributed systems. A ''distributed system'' is a system whose components are located on different [[computer network|networked computers]], which communicate and coordinate their actions by [[message passing|passing messages]] to one another.&lt;ref name=&quot;tanenbaum&quot;&gt;{{cite book |author1=Tanenbaum, Andrew S. |author2=Steen, Maarten van |title=Distributed systems: principles and paradigms|publisher=Pearson Prentice Hall |location=Upper Saddle River, NJ |year=2002 |pages= |isbn=0-13-088893-1 |oclc= |doi= |accessdate=

In [42]:
tf_index('apoka')

'38;53733124-13-48,31,474,360,150,60,176,213,164,94,150,937,1048;26551591-3-1019,583,269;32800641-3-15739,83,106;17675-2-21197,19329;19615-2-20889,15372;5296-2-49904,81;33334417-2-724,1281;9029121-2-915,7;13527402-2-360,2424;17820-1-158000;782011-1-9485;470221-1-14331;186271-1-21700;497194-1-4569;4172431-1-28302;1359755-1-2267;1716186-1-5583;3372142-1-14521;4195342-1-19340;2374282-1-15509;2390274-1-12424;469227-1-7479;6590756-1-6730;2590556-1-11136;1314756-1-838;7566951-1-10516;209219-1-13242;7031262-1-16700;1696766-1-6895;3572102-1-8579;2112636-1-30890;1546599-1-15444;240877-1-731;75067-1-38701;1474733-1-2020;664302-1-36814;805548-1-147384;509771-1-148522;'

In [None]:
re.findall('banana', 'apple banana')
'apple banana'.find('banana')

In [None]:
len('">{{unreferenced|date=April 2010}}\n\n{{Infobox musical artist\n| name            = Naše Věc\n| image             = \n| alt         = \n| caption        = \n| image_size        = \n| background      =  group_or_band\n| alias           = \n| origin          = \n| genre           = [[Hip Hop music|Hip Hop]]\n| years_active    = &lt;!-- {{Start date|1997}}–2006 --&gt;\n| label           = \n| associated_acts = \n| website             = &lt;!-- {{URL|http://www.nasevec.cz}} --&gt;\n| current_members = \n| past_members    = Dup X, Scissal, Plgál and Číňan\n}}\n\'\'\'Naše Věc\'\'\' was a [[Czech Republic|Czech]] [[hip hop]] band, active from 1997 to 2006.\n\n== Beginnings ==\nNaše Věc started making [[Hip Hop music]] in [[Brno, Czech Republic]], in 1997. Their originating members, Dup X, Scissal, Plgál, and Číňan, were doing freestyles and demos. In 1998, only Dup X and Scissal stayed aboard, working on a new demo with Kaluža, a young female MC. After her retirement in 1999, there were new members who made this group famous. Psicho, 2Jay, ')

In [None]:
l[1]['text'][15739:15739+5]
l[1]['text'][15739+83:15739+83+5]
l[1]['text'][15739+83+106:15739+83+106+5]

In [None]:
l = parse_tf(tf_index('apoka'), 0, 2, text_index)

In [None]:
l[0]['text'][48+1:53+1]
l[0]['text'][48+31+1:53+31+1]

In [None]:
l[1]['text'][1019+2:1024+2]
#l[0]['text'][1602+2:1607+2]
#l[0]['text'][1871+2:1876+2]

In [None]:
text_index('10')
# document id -> document info (title, contributor, timestamp, and the text's start offset and length in the .xml dump)

In [None]:
parse_text(text_index('10'))
# parse document info, and get text

In [None]:
f = open('untitled.txt', 'r')
f.seek(0)
f.read(0)

In [77]:
import time
time.time()


1576586231.8032868

In [79]:
'tf\n'.upper()

'TF\n'