In [1]:
import importlib
import mwxml, bz2, random, json, time, pyperclip

import src.wiki_parser as wiki_parser

# Reload src.wiki_parser so that changes made to the module after it was first
# imported are picked up without restarting the kernel.
# The returned module object is bound to `_`, presumably so that the cell does
# not end in a bare expression and echo the module repr as output.
_ = importlib.reload(wiki_parser)

In [2]:
# Root directory of the dataset; other cells build paths by appending a file or
# directory name to this string with `+`, so it must end with a path separator.
# A raw string literal cannot end in a single backslash, hence the `\\` at the
# end -- note that in a raw string this leaves TWO backslash characters in the value.
# NOTE(review): this is a machine-specific absolute path; adjust before running elsewhere.
data_root_dir = r'D:\WikipediaDataset\data\\'

In [3]:
# Count the different types of pages in the processed summaries.
#
# Each of the 10 files part-0.txt .. part-9.txt under processed_summaries/ is
# read as JSON Lines: one JSON object per line carrying at least the keys
# 'namespace' and 'redirect_title'. Pages are classified as:
#   - namespace 14                          -> 'CategoryPages'
#   - namespace 0 with a redirect title     -> 'RedirectPages'
#   - namespace 0 without a redirect title  -> 'ArticlePages'
# Any other namespace is reported and left out of the totals.
summaries_dir = data_root_dir + 'processed_summaries/'

counts = {
    'CategoryPages': 0,
    'RedirectPages': 0,
    'ArticlePages': 0
}
# Running total of classified pages. Kept separately so the progress check does
# not have to re-sum `counts` for every single line.
total_count = 0

start_time = time.time()
for i in range(10):
    # Explicit encoding: without it open() uses the platform default, which on
    # Windows is the locale code page rather than UTF-8.
    # NOTE(review): assumes the summaries were written as UTF-8 (or plain ASCII) -- confirm.
    with open(summaries_dir + f"part-{i}.txt", 'r', encoding='utf-8') as f:
        for line in f:
            page = json.loads(line)
            namespace = page['namespace']
            redirect_title = page['redirect_title']
            if namespace == 14:
                page_type = 'CategoryPages'
            elif namespace == 0:
                page_type = 'RedirectPages' if redirect_title else 'ArticlePages'
            else:
                # Not counted, so skip the progress check as well: the total did
                # not change, and re-checking it would repeat a milestone line
                # (or print "Total count: 0" before anything was counted).
                print(f"Unknown namespace: {namespace}")
                continue
            counts[page_type] += 1
            total_count += 1
            # Progress report once per million classified pages.
            if total_count % 1000000 == 0:
                print(f"Total count: {total_count}. ", counts)
    print(f"Finished part 0 to {i} in {(time.time() - start_time) / 60} minutes")

print(f"\nTotal count: {total_count}. ", counts)

Total count: 1000000.  {'CategoryPages': 37956, 'RedirectPages': 441786, 'ArticlePages': 520258}
Total count: 2000000.  {'CategoryPages': 83109, 'RedirectPages': 949133, 'ArticlePages': 967758}
Finished part 0 to 0 in 0.4125656962394714 minutes
Total count: 3000000.  {'CategoryPages': 140595, 'RedirectPages': 1483441, 'ArticlePages': 1375964}
Total count: 4000000.  {'CategoryPages': 209218, 'RedirectPages': 2013407, 'ArticlePages': 1777375}
Finished part 0 to 1 in 0.7056660215059917 minutes
Total count: 5000000.  {'CategoryPages': 269597, 'RedirectPages': 2626450, 'ArticlePages': 2103953}
Total count: 6000000.  {'CategoryPages': 349075, 'RedirectPages': 3161953, 'ArticlePages': 2488972}
Finished part 0 to 2 in 0.984477452437083 minutes
Total count: 7000000.  {'CategoryPages': 429585, 'RedirectPages': 3709978, 'ArticlePages': 2860437}
Total count: 8000000.  {'CategoryPages': 525977, 'RedirectPages': 4282108, 'ArticlePages': 3191915}
Finished part 0 to 3 in 1.279146683216095 minutes
Tota

In [3]:
# Build the WikiParser that later cells use to fetch full page details by id.
# Both input paths are derived from the snapshot name: the "-index.txt.bz2"
# file and the ".xml.bz2" file of the same snapshot.
snapshot_name = 'enwiki-20240501-pages-articles-multistream'
snapshot_prefix = data_root_dir + snapshot_name
index_path = snapshot_prefix + "-index.txt.bz2"
dump_path = snapshot_prefix + ".xml.bz2"

# Time the construction and report it in seconds.
start_time = time.time()
parser = wiki_parser.WikiParser(index_path, dump_path)
elapsed_seconds = time.time() - start_time
print(f"Time taken to load the index: {elapsed_seconds} seconds")

Time taken to load the index: 141.75604057312012 seconds


In [4]:
# Fetch the full page records (including text) for a fixed set of page ids.
print ("Total number of page ids in the snapshot: ", len(parser.id_to_offset))

# Page ids to fetch; later cells look up ids 12 and 290 in page_id_to_page.
# The previous version first drew 10 random ids with
# random.sample(list(parser.id_to_offset.keys()), 10) and then immediately
# overwrote the result with this list, so the draw only cost time and memory
# (it copies every id in the index into a list) without affecting anything.
sampled_ids = [12, 308, 290]

# Maps page id -> page record as yielded by parser.page_stream.
page_id_to_page = {}

for page in parser.page_stream(sampled_ids, include_text=True):
    page_id_to_page[page['page_id']] = page

Total number of page ids in the snapshot:  23669656


In [5]:
# Parse the raw text of page 290 and pull out its internal links.
page_290_summary = wiki_parser.parse_summary_from_raw_text(page_id_to_page[290]['text'])
internal_links = page_290_summary['internal_links']
print(len(internal_links))

# Copy the lowercased links, sorted, to the clipboard as indented JSON.
lowercased_links = sorted(link.lower() for link in internal_links)
pyperclip.copy(json.dumps(lowercased_links, indent=2))

# Cell result: the links in their original casing, sorted.
sorted(internal_links)

244


['#Related characters in the Latin alphabet',
 '@',
 'A (Cyrillic)',
 'A-list',
 'ASCII',
 'Ae (digraph)',
 'Afrikaans',
 'Aleph',
 'Allograph',
 'Alpha',
 'Alphabet',
 'Anarchist symbolism#circle-a',
 'Argentine austral',
 'Armenian alphabet',
 'At sign',
 'Au (digraph)',
 'Australian English',
 'Ayb (letter)',
 'Azerbaijani language',
 'Bar (diacritic)',
 'Bashkir language',
 'Blackletter',
 'Bulgarian language',
 'Caroline script',
 'Caron',
 'Catalan dialects',
 'Catalan language',
 'Chemnitz dialect',
 'Chuvash language',
 'Coptic alphabet',
 'Cyrillic',
 'Czech language',
 'Danish language',
 'Dot (diacritic)',
 'Dutch dialects',
 'Dutch language',
 'Eastern Catalan',
 'Eau (trigraph)',
 'Egyptian hieroglyphs',
 'Emilian dialects',
 'English alphabet',
 'English alphabet#Letter names',
 'English articles',
 'English language',
 'English orthography',
 'English-language vowel changes before historic /r/',
 'Etruscan alphabet',
 'Etruscans',
 'Finnish language',
 'French language',

In [16]:
# Parse the raw text of page 12 and show its internal links as the cell result.
page_12_text = page_id_to_page[12]['text']
page_12_summary = wiki_parser.parse_summary_from_raw_text(page_12_text)
page_12_summary['internal_links']

['Permanent autonomous zone',
 'Mikhail Bakunin',
 'propaganda of the deed',
 'vanguard party',
 'civilisation',
 'Sex Pistols',
 'Christian anarchism',
 'Age of Enlightenment',
 'AK Press',
 'capitalism',
 'Free association (communism and anarchism)',
 'Noam Chomsky',
 'Robert Paul Wolff',
 "workers' self-management",
 'Princeton University Press',
 'reformist',
 'feminist',
 'strike action',
 'ruling class',
 'The Government of No One: The Theory and Practice of Anarchism',
 'W. W. Norton & Company',
 'History Today',
 'Spanish Civil War',
 'Stateless society',
 'hierarchical organisation',
 'Benjamin Tucker',
 'Wilhelm Weitling',
 'Current Anthropology',
 'definitional concerns in anarchist theory',
 'neo-impressionist',
 'Self-governance',
 'Cynicism (philosophy)',
 'Friedrich Engels',
 'Karl Marx',
 'second wave of feminism',
 'Rojava',
 'Bloomsbury Academic',
 'Cato Institute',
 'Reciprocity (cultural anthropology)',
 'naturism',
 'University of North Carolina Press',
 'solidarit