In [21]:
"""
This notebook prepares structured data to fed for further analysis.
It creates these datasets:

Page based:

page_info.tsv: 
PageName, PageId, TextLength, NumUniqueWords, NumFiles, 
NumExternalLinks, NumInfoBoxes, NumSections

page_links.tsv:
SourcePageId, DestinationPageId

page_inlink_counts.tsv:
PageId, NumInlinks

Category based:

-> Category pages

These are jsons (one per line) with the following structure:
{
    "category_name": "Category name",
    "sub_categories": ["Subcategory 1", "Subcategory 2", ...],
    "articles": [("Article Title 1", page_id_1), ("Article Title 2", page_id_2), ...],
    "parent_categories": ["Parent Category 1", "Parent Category 2", ...],
    "internal_links": ["link 1", "link 2", ...]
}

-> category_id_to_sub_category_ids.tsv

This is a tsv file with the following structure:
CategoryId, DescendantCategoryId, ConnectingChain

Connecting chain is a comma separated list of category ids that connect the two categories.

-> category_id_to_parent_category_ids.tsv

This is a tsv file with the following structure:
CategoryId, ParentCategoryId

-> category_id_to_page_ids.tsv

This is a tsv file with the following structure:
CategoryId, PageId

-> category_id_to_stats.tsv

This is a tsv file with the following structure:
CategoryId, NumArticles, NumChildcategories, NumParentCategories

"""
_ = 1

In [39]:
import pandas as pd
import time, json, os, random, traceback, pyperclip, importlib
import src.wiki_analysis_utils as wiki_analysis_utils

_ = importlib.reload(wiki_analysis_utils)

In [7]:
data_root_dir = r'D:\WikipediaDataset\data\\'
NUM_PARTITIONS = 10

In [8]:
"""
Save page_info.tsv: 
PageName, RedirectTitle, PageId, TextLength, NumInternalLinks, NumUniqueWords, NumFiles,
NumExternalLinks, NumInfoBoxes, NumSections
"""

start_time = time.time()
processed_line_count = 0
num_pages_processed = 0
with open(data_root_dir + 'page_info.tsv', 'w') as page_info_f:
    page_info_f.write("PageName\tRedirectTitle\tPageId\tTextLength\tNumInternalLinks\tNumUniqueWords\tNumFiles\tNumExternalLinks\tNumInfoBoxes\tNumSections\n")
    for i in range(10):
        with open(data_root_dir + f'processed_summaries/part-{i}.txt', 'r') as f:
            for line in f:
                page = json.loads(line)
                namespace = page['namespace']
                title = page['title']
                page_id = page['page_id']
                redirect_title = page['redirect_title']

                if namespace == 0:
                    if redirect_title:
                        page_info_f.write(f"{title}\t{redirect_title}\t{page_id}\t\t\t\t\t\t\t\n")
                    else:
                        page_info_f.write(f"{title}\t\t{page_id}\t{page['text_length']}\t{len(page['internal_links'])}\t{page['num_unique_words']}\t{page['number_of_files']}\t{page['number_of_external_links']}\t{page['number_of_info_boxes']}\t{page['number_of_sections']}\n")
                    num_pages_processed += 1
                processed_line_count += 1
                if processed_line_count % 1000000 == 0:
                    print(f"Processed {processed_line_count} lines. {num_pages_processed} pages found till now.")

        print(f"Finished part {i} in {(time.time() - start_time) / 60} minutes")

Processed 1000000 lines. 962044 pages found till now.
Processed 2000000 lines. 1916891 pages found till now.
Finished part 0 in 0.7270979086558024 minutes
Processed 3000000 lines. 2859405 pages found till now.
Processed 4000000 lines. 3790782 pages found till now.
Finished part 1 in 1.297219415505727 minutes
Processed 5000000 lines. 4730403 pages found till now.
Processed 6000000 lines. 5650925 pages found till now.
Finished part 2 in 1.7902512073516845 minutes
Processed 7000000 lines. 6570415 pages found till now.
Processed 8000000 lines. 7474023 pages found till now.
Finished part 3 in 2.322950883706411 minutes
Processed 9000000 lines. 8350942 pages found till now.
Processed 10000000 lines. 9212477 pages found till now.
Finished part 4 in 2.8843417684237163 minutes
Processed 11000000 lines. 10047260 pages found till now.
Processed 12000000 lines. 10918180 pages found till now.
Finished part 5 in 3.4488520979881288 minutes
Processed 13000000 lines. 11752593 pages found till now.
Proce

In [9]:
page_name_to_page_id, page_id_to_page_name, error_counts = \
    wiki_analysis_utils.load_page_name_to_id_map(data_root_dir)

Processed 1000000 lines. Fail count: 0. Success count: 1000000. Time taken: 0.03520254294077555 minutes.
Processed 2000000 lines. Fail count: 0. Success count: 2000000. Time taken: 0.0746461828549703 minutes.
Processed 3000000 lines. Fail count: 0. Success count: 3000000. Time taken: 0.11801429986953735 minutes.
Processed 4000000 lines. Fail count: 0. Success count: 4000000. Time taken: 0.17254188060760497 minutes.
Processed 5000000 lines. Fail count: 0. Success count: 5000000. Time taken: 0.224252720673879 minutes.
Processed 6000000 lines. Fail count: 0. Success count: 6000000. Time taken: 0.27663596868515017 minutes.
Processed 7000000 lines. Fail count: 0. Success count: 7000000. Time taken: 0.3174879471460978 minutes.
Processed 8000000 lines. Fail count: 0. Success count: 8000000. Time taken: 0.36652742624282836 minutes.
Processed 9000000 lines. Fail count: 0. Success count: 9000000. Time taken: 0.4145349780718485 minutes.
Processed 10000000 lines. Fail count: 0. Success count: 1000

In [10]:
(len(page_name_to_page_id), len(page_id_to_page_name), len(error_counts))

(16812294, 17753527, 0)

In [11]:
# load source_page_id to destination_page_id for redirect pages
# this will be used to resolve redirects in the cells below

redirect_pid_to_pid, error_counts = \
    wiki_analysis_utils.load_page_redirect_mapping(data_root_dir, page_name_to_page_id)

Processed 1000000 lines. Fail count: 70. Success count: 999930. Time taken: 0.03364824851353963 minutes.
Processed 2000000 lines. Fail count: 204. Success count: 1999796. Time taken: 0.06828335920969646 minutes.
Processed 3000000 lines. Fail count: 391. Success count: 2999609. Time taken: 0.10596334934234619 minutes.
Processed 4000000 lines. Fail count: 587. Success count: 3999413. Time taken: 0.1513728141784668 minutes.
Processed 5000000 lines. Fail count: 770. Success count: 4999230. Time taken: 0.19528584082921346 minutes.
Processed 6000000 lines. Fail count: 956. Success count: 5999044. Time taken: 0.23620920578638713 minutes.
Processed 7000000 lines. Fail count: 1182. Success count: 6998818. Time taken: 0.2848101774851481 minutes.
Processed 8000000 lines. Fail count: 1470. Success count: 7998530. Time taken: 0.33345146973927814 minutes.
Processed 9000000 lines. Fail count: 1709. Success count: 8998291. Time taken: 0.3767818530400594 minutes.
Processed 10000000 lines. Fail count: 1

In [12]:
(len(redirect_pid_to_pid), len(set(redirect_pid_to_pid.values())), len(error_counts))

(10904676, 3690970, 29764)

In [13]:
"""
Save page_links.tsv:
SourcePageId, DestinationPageId
"""

start_time = time.time()
processed_line_count = 0
num_pages_processed = 0
failure_counts = {
    'Unknown Page': 0
}
total_link_count = 0
log_level = 'ERROR'
with open(data_root_dir + 'page_links.tsv', 'w') as page_links_f:
    page_links_f.write("SourcePageId\tDestinationPageId\n")
    for i in range(10):
        with open(data_root_dir + f'processed_summaries/part-{i}.txt', 'r') as f:
            for line in f:
                page = json.loads(line)
                namespace = page['namespace']
                title = page['title']
                page_id = page['page_id']
                redirect_title = page['redirect_title']

                if namespace == 0 and not redirect_title:
                    for link in page['internal_links']:
                        link = wiki_analysis_utils.normalized_page_name(link)
                        if link.startswith(':category:'):
                            continue
                        total_link_count += 1
                        if link not in page_name_to_page_id:
                            if log_level in ['WARN', 'INFO']:
                                print(f"Page {link} in page {page_id} not found in page name to id mapping.")
                            failure_counts['Unknown Page'] += 1
                            continue
                        dest_page_id = page_name_to_page_id[link]
                        dest_page_id = redirect_pid_to_pid.get(dest_page_id, dest_page_id)
                        page_links_f.write(f"{page_id}\t{dest_page_id}\n")
                    num_pages_processed += 1
                processed_line_count += 1
                if processed_line_count % 1000000 == 0:
                    print(f"Processed {processed_line_count} lines. {num_pages_processed} pages found till now.")
                    #break

        print(f"Finished part {i} in {(time.time() - start_time) / 60} minutes")
        #break

print (f"Failure counts: {failure_counts}")
print (f"{sum(failure_counts.values())} / {total_link_count} ({(sum(failure_counts.values()) / total_link_count) * 100}%) links couldn't be captured.")

Processed 1000000 lines. 520258 pages found till now.
Processed 2000000 lines. 967758 pages found till now.
Finished part 0 in 7.403570981820424 minutes
Processed 3000000 lines. 1375964 pages found till now.
Processed 4000000 lines. 1777375 pages found till now.
Finished part 1 in 10.953632120291392 minutes
Processed 5000000 lines. 2103953 pages found till now.
Processed 6000000 lines. 2488972 pages found till now.
Finished part 2 in 13.118326369921366 minutes
Processed 7000000 lines. 2860437 pages found till now.
Processed 8000000 lines. 3191915 pages found till now.
Finished part 3 in 15.521819802125295 minutes
Processed 9000000 lines. 3503469 pages found till now.
Processed 10000000 lines. 3842350 pages found till now.
Finished part 4 in 17.97135885953903 minutes
Processed 11000000 lines. 4156171 pages found till now.
Processed 12000000 lines. 4498292 pages found till now.
Finished part 5 in 19.871130939324697 minutes
Processed 13000000 lines. 4834557 pages found till now.
Processed

In [14]:
pdf = pd.read_csv(data_root_dir + 'page_links.tsv', sep='\t', nrows=4000)
pdf['SourcePageName'] = pdf['SourcePageId'].map(page_id_to_page_name)
pdf['DestinationPageName'] = pdf['DestinationPageId'].map(page_id_to_page_name)
pdf['SourcePageId'].value_counts()

303    826
316    802
307    549
308    514
305    428
12     387
290    242
39     146
309    106
Name: SourcePageId, dtype: int64

In [15]:
internal_links = pdf.query('SourcePageId==290')['DestinationPageName'].to_list()
print (len(internal_links))
pyperclip.copy(json.dumps(sorted(internal_links), indent=2))
sorted(internal_links)

242


['a (cyrillic)',
 'a (cyrillic)',
 'a (indic)',
 'a with breve (cyrillic)',
 'a-list',
 'aardvark',
 'abo blood group system',
 'afrikaans',
 'aleph',
 'aleph',
 'aleph',
 'aleph number',
 'algebra',
 'allograph',
 'alpha',
 'alpha',
 'alphabet',
 'anarchist symbolism',
 'ancient greece',
 'angstrom',
 'ansuz (rune)',
 'apple',
 'argentine austral',
 'argentine austral',
 'armenian alphabet',
 'article (grammar)',
 'ascii',
 'asymmetry',
 'at sign',
 'at sign',
 'australian english',
 'ayb (armenian letter)',
 'ayb (armenian letter)',
 'azerbaijani language',
 'bar (diacritic)',
 'bashkir language',
 'bengali alphabet',
 'blackletter',
 'bra',
 'bulgarian language',
 'carolingian minuscule',
 'caron',
 'catalan dialects',
 'catalan dialects',
 'catalan language',
 'chemnitz dialect',
 'chuvash language',
 'close-mid front unrounded vowel',
 'code point',
 'combining character',
 'consonant',
 'coptic script',
 'cursive',
 'cyrillic script',
 'czech language',
 'danish language',
 'deci

In [16]:
"""
Save page_inlink_counts.tsv:
PageId, NumInlinks
"""

page_inlink_counts = {}
start_time = time.time()
line_no = 0
with open(data_root_dir + 'page_links.tsv', 'r') as f:
    for line in f:
        line_no += 1
        if line_no==1: continue
        source_page, destination_page = line.split('\t')
        destination_page = int(destination_page)
        page_inlink_counts[destination_page] = page_inlink_counts.get(destination_page, 0) + 1
        if line_no % 10000000 == 0:
            print(f"Processed {line_no} lines. Time taken: {(time.time() - start_time) / 60} minutes.")
            # break

print (f"Processed {line_no} lines. Time taken: {(time.time() - start_time) / 60} minutes.")

with open(data_root_dir + 'page_inlink_counts.tsv', 'w') as f:
    f.write("PageId\tNumInlinks\n")
    for page_id, num_inlinks in sorted(page_inlink_counts.items()):
        f.write(f"{page_id}\t{num_inlinks}\n")

Processed 10000000 lines. Time taken: 0.2552937984466553 minutes.
Processed 20000000 lines. Time taken: 0.4910214861234029 minutes.
Processed 30000000 lines. Time taken: 0.7426513274510701 minutes.
Processed 40000000 lines. Time taken: 1.0243276198705038 minutes.
Processed 50000000 lines. Time taken: 1.3267032305399578 minutes.
Processed 60000000 lines. Time taken: 1.6137513518333435 minutes.
Processed 70000000 lines. Time taken: 1.9017590920130412 minutes.
Processed 80000000 lines. Time taken: 2.191321623325348 minutes.
Processed 90000000 lines. Time taken: 2.490134263038635 minutes.
Processed 100000000 lines. Time taken: 2.7884846766789755 minutes.
Processed 110000000 lines. Time taken: 3.0721547563870746 minutes.
Processed 120000000 lines. Time taken: 3.351078001658122 minutes.
Processed 130000000 lines. Time taken: 3.6386491854985556 minutes.
Processed 140000000 lines. Time taken: 3.9113255540529885 minutes.
Processed 150000000 lines. Time taken: 4.21607338587443 minutes.
Processed

In [17]:
len(page_inlink_counts)

6349743

In [40]:
# combine information from full dataset and save category pages from the processed summaries

"""
prepare category page details
1. List of sub-categories
2. List of articles
3. List of parent categories
4. List of internal links on category page

To parallelize, 
1. dump data for each partition separately
2. process categories based on their hashes in N parts and dumps complete info for each category
"""

def get_empty_category_data():
    return {
        'sub_categories': set(),
        'internal_links': [],
        'articles': [],
        'parent_categories': set()
    }

start_time = time.time()
processed_line_count = 0
norm = wiki_analysis_utils.normalized_page_name
for i in range(10):
    category_name_to_data = {}
    with open(data_root_dir + f'processed_summaries/part-{i}.txt', 'r') as f:
        for line in f:
            page = json.loads(line)
            namespace = page['namespace']
            title = page['title']
            page_id = page['page_id']
            redirect_title = page['redirect_title']

            if namespace == 14:
                category_name = norm(title)
                category_name_to_data[category_name] = category_name_to_data\
                    .get(category_name, get_empty_category_data())
                category_data = category_name_to_data[category_name]
                category_data['internal_links'].extend(map(norm, page['internal_links']))
                category_data['parent_categories'].update(map(norm, page['categories']))
                category_data['category_id'] = int(page_id)
                for parent_category in page['categories']:
                    parent_category = norm(parent_category)
                    category_name_to_data[parent_category] = category_name_to_data\
                        .get(parent_category, get_empty_category_data())
                    category_name_to_data[parent_category]['sub_categories'].add(category_name)
            elif namespace == 0 and not redirect_title:
                for category in page['categories']:
                    category = norm(category)
                    category_name_to_data[category] = category_name_to_data\
                        .get(category, get_empty_category_data())
                    page_id = int(page_id)
                    page_id = redirect_pid_to_pid.get(page_id, page_id)
                    category_name_to_data[category]['articles'].append((title, page_id))
            processed_line_count += 1
            if processed_line_count % 1000000 == 0:
                print(f"Processed {processed_line_count} lines in {(time.time() - start_time) / 60} minutes.")
    os.makedirs(data_root_dir+'tmp/', exist_ok=True)
    print (f"Dumping data for part {i} containing {len(category_name_to_data)} category partial jsons.")
    with open(data_root_dir + f'tmp/partial_category_page_data_part-{i}.txt', 'w') as out_f:
        for cat_name, data in category_name_to_data.items():
            data["category_name"] = cat_name
            data['sub_categories'] = list(data['sub_categories'])
            data['parent_categories'] = list(data['parent_categories'])
            out_f.write(json.dumps(data)+"\n")
    print(f"Finished part {i} in {(time.time() - start_time) / 60} minutes")
    #break

Processed 1000000 lines in 0.6364568471908569 minutes.
Processed 2000000 lines in 1.1012573758761088 minutes.
Dumping data for part 0 containing 934933 category partial jsons.
Finished part 0 in 1.5530202349026998 minutes
Processed 3000000 lines in 1.9946431159973144 minutes.
Processed 4000000 lines in 2.4322856227556864 minutes.
Dumping data for part 1 containing 885607 category partial jsons.
Finished part 1 in 2.8973546028137207 minutes
Processed 5000000 lines in 3.1667434334754945 minutes.
Processed 6000000 lines in 3.5835325280825296 minutes.
Dumping data for part 2 containing 830371 category partial jsons.
Finished part 2 in 3.9203997294108075 minutes
Processed 7000000 lines in 4.218890984853108 minutes.
Processed 8000000 lines in 4.613288362820943 minutes.
Dumping data for part 3 containing 855587 category partial jsons.
Finished part 3 in 4.940384646256764 minutes
Processed 9000000 lines in 5.235450784365336 minutes.
Processed 10000000 lines in 5.667127799987793 minutes.
Dumpin

In [41]:
os.makedirs(data_root_dir+'category_pages/', exist_ok=True)
start_time = time.time()
for partition in range(NUM_PARTITIONS):
    category_name_to_data = {}
    for i in range(10):
        with open(data_root_dir + f'tmp/partial_category_page_data_part-{i}.txt', 'r') as f:
            for line in f:
                line = line.strip()
                if line=="": continue
                data = json.loads(line)
                if (hash(data['category_name']) % NUM_PARTITIONS) != partition:
                    continue
                if data['category_name'] not in category_name_to_data:
                    category_name_to_data[data['category_name']] = get_empty_category_data()
                category_data = category_name_to_data[data['category_name']]
                if 'category_id' in data:
                    category_data['category_id'] = data['category_id']
                category_data['sub_categories'].update(data['sub_categories'])
                category_data['parent_categories'].update(data['parent_categories'])
                category_data['internal_links'].extend(data['internal_links'])
                category_data['articles'].extend(data['articles'])
    with open(data_root_dir + f'category_pages/part-{partition}.txt', 'w') as out_f:
        for cat_name, data in category_name_to_data.items():
            data["category_name"] = cat_name
            data['sub_categories'] = list(data['sub_categories'])
            data['parent_categories'] = list(data['parent_categories'])
            out_f.write(json.dumps(data)+"\n")
    print(f"Finished part {partition} in {(time.time() - start_time) / 60} minutes")

                

Finished part 0 in 2.061130122343699 minutes
Finished part 1 in 3.9293649673461912 minutes
Finished part 2 in 5.823503355185191 minutes
Finished part 3 in 8.542663621902467 minutes
Finished part 4 in 10.408614214261373 minutes
Finished part 5 in 12.252984670797984 minutes
Finished part 6 in 14.039662718772888 minutes
Finished part 7 in 15.66956444978714 minutes
Finished part 8 in 17.287773271401722 minutes
Finished part 9 in 18.950868968168894 minutes


In [42]:
# check for a few category pages that they look good

selected_categories = ["Platonic solids", "Sampling (statistics)", "Drama films", "Comedy novels"]
category_pages = wiki_analysis_utils.load_category_pages(data_root_dir, selected_categories=selected_categories)

Processed till part 0 in 0.07508935133616129 minutes
Processed till part 1 in 0.1482577443122864 minutes
Processed till part 2 in 0.2128618319829305 minutes
Processed till part 3 in 0.2742927074432373 minutes
Processed till part 4 in 0.35173648595809937 minutes
Processed till part 5 in 0.47813774744669596 minutes
Processed till part 6 in 0.5374097108840943 minutes
Processed till part 7 in 0.595884398619334 minutes
Processed till part 8 in 0.6622572024663289 minutes
Processed till part 9 in 0.7481602390607198 minutes


In [43]:
category_pages['selected'][1]
# category_pages['random'][2]

{'sub_categories': ['sampling techniques',
  'empirical evidence',
  'sample statistics',
  'survey methodology'],
 'internal_links': [],
 'articles': [['Census', 6889],
  ['Sampling bias', 17692],
  ['Rock paper scissors', 27032],
  ['Statistical unit', 27580],
  ['Stratified sampling', 27596],
  ['Infrastructure bias', 47280],
  ['Sampling (statistics)', 160361],
  ['Autodidacticism', 255591],
  ['Opinion poll', 277315],
  ['Margin of error', 277379],
  ['Self-selection bias', 292154],
  ['Lottery machine', 379930],
  ['Selection bias', 394392],
  ['Coin flipping', 494410],
  ['Sampling distribution', 520670],
  ['Recall bias', 1360950],
  ['Survivorship bias', 1745325],
  ['Sample size determination', 1776839],
  ['Sampling error', 1955561],
  ['Sampling frame', 2050041],
  ['Odds and evens (hand game)', 2234844],
  ['Sampling fraction', 2719222],
  ["Whipple's index", 4039291],
  ['Selective recruitment', 5054888],
  ['Scale analysis (statistics)', 6055749],
  ['Expander walk sampl

In [44]:
# load category name to id mappings from the category pages
categories, failure_counts = wiki_analysis_utils.load_category_name_to_id_map(data_root_dir)

Processed till part 0 in 0.06616474787394205 minutes
Processed till part 1 in 0.12248515288035075 minutes
Processed till part 2 in 0.18007128636042277 minutes
Processed till part 3 in 0.2350926915804545 minutes
Processed till part 4 in 0.40694777965545653 minutes
Processed till part 5 in 0.5122623840967814 minutes
Processed till part 6 in 0.5809729059537252 minutes
Processed till part 7 in 0.6555250962575276 minutes
Processed till part 8 in 0.7199227015177408 minutes
Processed till part 9 in 0.7967670241991679 minutes
2346796 {'MissingCategoryId': 22359}
2346796 2346796


In [45]:
"""
Obtain and save these mappings:

- category_id -> number of pages
- category_id -> sub category ids, parent category ids
- category_id -> page_ids
"""

category_id_to_data = {}

start_time = time.time()
failure_counts = {
    'Unknown Category': 0,
    'Unknown SubCategory': 0,
    'Unknown ParentCategory': 0
}
log_level = 'ERROR'
for partition in range(NUM_PARTITIONS):
    with open(data_root_dir + f'category_pages/part-{partition}.txt', 'r') as f:
        for line in f:
            if line=='': continue
            category_json = json.loads(line)
            category_data = {
                'page_ids': [],
                'sub_category_ids': [],
                'parent_category_ids': []
            }
            category_id = categories['name_to_id'].get(category_json['category_name'], None)
            if category_id is None:
                if log_level in ['WARN', 'INFO']:
                    print(f"Category {category_json['category_name']} not found in category name to id mapping.")
                failure_counts['Unknown Category'] += 1
                continue
            category_id_to_data[category_id] = category_data
            for _, page_id in category_json['articles']:
                category_data['page_ids'].append(page_id)
            for sub_category in category_json['sub_categories']:
                sub_category_id = categories['name_to_id'].get(sub_category, None)
                if sub_category_id is None:
                    if log_level in ['WARN', 'INFO']:
                        print(f"Sub category {sub_category} not found in category name to id mapping.")
                    failure_counts['Unknown SubCategory'] += 1
                    continue
                category_data['sub_category_ids'].append(sub_category_id)
            for parent_category in category_json['parent_categories']:
                parent_category_id = categories['name_to_id'].get(parent_category, None)
                if parent_category_id is None:
                    if log_level in ['WARN', 'INFO']:
                        print(f"Parent category {parent_category} not found in category name to id mapping.")
                    failure_counts['Unknown ParentCategory'] += 1
                    continue
                category_data['parent_category_ids'].append(parent_category_id)
    print(f"Processed till part {partition} in {(time.time() - start_time) / 60} minutes")

print (f"Failure counts: {failure_counts}")


Processed till part 0 in 0.14185847441355387 minutes
Processed till part 1 in 0.29345881938934326 minutes
Processed till part 2 in 0.43326702117919924 minutes
Processed till part 3 in 0.5495299696922302 minutes
Processed till part 4 in 0.7334250688552857 minutes
Processed till part 5 in 0.8907455643018086 minutes
Processed till part 6 in 1.0160117149353027 minutes
Processed till part 7 in 1.177340281009674 minutes
Processed till part 8 in 1.280317223072052 minutes
Processed till part 9 in 1.3832293351491292 minutes
Failure counts: {'Unknown Category': 22359, 'Unknown SubCategory': 0, 'Unknown ParentCategory': 170572}


In [46]:
# save these mappings to tsv files

with open(data_root_dir + 'category_id_to_stats.tsv', 'w') as f:
    f.write("CategoryId\tNumPages\tNumSubCategories\tNumParentCategories\n")
    for category_id, data in category_id_to_data.items():
        f.write(f"{category_id}\t{len(data['page_ids'])}\t{len(data['sub_category_ids'])}\t{len(data['parent_category_ids'])}\n")

In [47]:
with open(data_root_dir + 'category_id_to_page_ids.tsv', 'w') as f:
    f.write("CategoryId\tPageId\n")
    for category_id, data in category_id_to_data.items():
        for page_id in data['page_ids']:
            f.write(f"{category_id}\t{page_id}\n")

In [48]:
with open(data_root_dir + 'category_id_to_sub_category_ids.tsv', 'w') as f:
    f.write("CategoryId\tSubCategoryId\n")
    for category_id, data in category_id_to_data.items():
        for sub_category_id in data['sub_category_ids']:
            f.write(f"{category_id}\t{sub_category_id}\n")

In [49]:
with open(data_root_dir + 'category_id_to_parent_category_ids.tsv', 'w') as f:
    f.write("CategoryId\tParentCategoryId\n")
    for category_id, data in category_id_to_data.items():
        for parent_category_id in data['parent_category_ids']:
            f.write(f"{category_id}\t{parent_category_id}\n")