In [1]:
import importlib
import mwxml, bz2, random, json, time

import src.wiki_parser as wiki_parser

# Re-import the local wiki_parser module so that edits made to its source are
# picked up without restarting the kernel. The result (the module object) is
# bound to `_` so it is not echoed as the cell's output.
_ = importlib.reload(wiki_parser)

In [2]:
# Root directory for all dataset files read by this notebook.
# NOTE: this is an absolute, machine-specific path -- change it to match your
# own setup. Keep the trailing slash: later cells build file paths from it by
# plain string concatenation.
data_root_dir = '/media/mohit/E6A87A13A879E30B/WikipediaDataset/data/'

In [3]:
# Count the different types of pages in the processed summaries.
#
# Each part file is read as JSON lines: one JSON object per line with (at
# least) the keys 'namespace' and 'redirect_title'. Pages are bucketed as:
#   * namespace 14                       -> CategoryPages
#   * namespace 0 with a redirect title  -> RedirectPages
#   * namespace 0 without one            -> ArticlePages
# Any other namespace is reported and left out of the counts.
summaries_dir = data_root_dir + 'processed_summaries/'
num_parts = 10            # files part-0.txt ... part-9.txt
progress_every = 1000000  # print a progress line every this many counted pages

counts = {
    'CategoryPages': 0,
    'RedirectPages': 0,
    'ArticlePages': 0
}
# Running total of counted pages; kept separately so the total does not have
# to be re-summed from `counts` for every line.
total_count = 0

start_time = time.time()
for i in range(num_parts):
    # Explicit encoding so the result does not depend on the machine's locale.
    with open(summaries_dir + f"part-{i}.txt", 'r', encoding='utf-8') as f:
        for line in f:
            page = json.loads(line)
            namespace = page['namespace']
            if namespace == 14:
                page_type = 'CategoryPages'
            elif namespace == 0:
                page_type = 'RedirectPages' if page['redirect_title'] else 'ArticlePages'
            else:
                print(f"Unknown namespace: {namespace}")
                # Nothing was counted, so skip the progress check: otherwise a
                # progress line would be printed at total 0 and again for every
                # uncounted page that follows an exact multiple of the interval.
                continue

            counts[page_type] += 1
            total_count += 1
            if total_count % progress_every == 0:
                print(f"Total count: {total_count}. ", counts)
    # Elapsed time is cumulative since the start of the first part.
    print(f"Finished part 0 to {i} in {(time.time() - start_time) / 60} minutes")

print(f"\nTotal count: {total_count}. ", counts)

Total count: 1000000.  {'CategoryPages': 37956, 'RedirectPages': 441786, 'ArticlePages': 520258}
Total count: 2000000.  {'CategoryPages': 83109, 'RedirectPages': 949133, 'ArticlePages': 967758}
Finished part 0 to 0 in 0.4125656962394714 minutes
Total count: 3000000.  {'CategoryPages': 140595, 'RedirectPages': 1483441, 'ArticlePages': 1375964}
Total count: 4000000.  {'CategoryPages': 209218, 'RedirectPages': 2013407, 'ArticlePages': 1777375}
Finished part 0 to 1 in 0.7056660215059917 minutes
Total count: 5000000.  {'CategoryPages': 269597, 'RedirectPages': 2626450, 'ArticlePages': 2103953}
Total count: 6000000.  {'CategoryPages': 349075, 'RedirectPages': 3161953, 'ArticlePages': 2488972}
Finished part 0 to 2 in 0.984477452437083 minutes
Total count: 7000000.  {'CategoryPages': 429585, 'RedirectPages': 3709978, 'ArticlePages': 2860437}
Total count: 8000000.  {'CategoryPages': 525977, 'RedirectPages': 4282108, 'ArticlePages': 3191915}
Finished part 0 to 3 in 1.279146683216095 minutes
Tota

In [3]:
# Fetch full details of specific pages efficiently: first build the parser
# from the snapshot's index file and its XML dump, and time how long that takes.
snapshot_name = 'enwiki-20240501-pages-articles-multistream'
index_path = f"{data_root_dir}{snapshot_name}-index.txt.bz2"
dump_path = f"{data_root_dir}{snapshot_name}.xml.bz2"

start_time = time.time()
parser = wiki_parser.WikiParser(index_path, dump_path)
elapsed_seconds = time.time() - start_time
print(f"Time taken to load the index: {elapsed_seconds} seconds")

Time taken to load the index: 63.99101638793945 seconds


In [4]:
# Fetch specific page ids and print each page, including its text.
print("Total number of page ids in the snapshot: ", len(parser.id_to_offset))

# Fixed set of page ids to inspect. The previous version first drew a random
# sample of 10 ids from all keys of parser.id_to_offset and then immediately
# overwrote it with this list, so that (expensive) sample was never used and
# has been removed.
sampled_ids = [53547121, 3925581, 4546764]

for page in parser.page_stream(sampled_ids, include_text=True):
    print(page)

Total number of page ids in the snapshot:  23669656
{'page_id': 3925581, 'title': 'List of drama films', 'redirect_title': None, 'namespace': 0, 'text': "{{Short description|None}}\n{{Dramafilmlist}}\n'''List of drama films''' is a chronological listing of films in the [[drama]] genre.\n\n*[[List of drama films of the 1900s]]\n*[[List of drama films of the 1910s]]\n*[[List of drama films of the 1920s]]\n*[[List of drama films of the 1930s]]\n*[[List of drama films of the 1940s]]\n*[[List of drama films of the 1950s]]\n*[[List of drama films of the 1960s]]\n*[[List of drama films of the 1970s]]\n*[[List of drama films of the 1980s]]\n*[[List of drama films of the 1990s]]\n*[[List of drama films of the 2000s]]\n*[[List of drama films of the 2010s]]\n*[[List of drama films of the 2020s]]\n\n{{Filmsbygenre}}\n\n[[Category:Drama films| ]]\n[[Category:Lists of films by genre|Drama films]]"}
{'page_id': 4546764, 'title': 'San Diego Stingrays', 'redirect_title': None, 'namespace': 0, 'text': "