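This notebook analyzes Wikipedia categories: for every category it collects the sub-categories, parent categories, member articles and internal links found in the processed summaries.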

In [21]:
import time, json, os, random
import zlib

import pandas as pd

In [22]:
# Root directory of the Wikipedia dataset. Set the WIKIPEDIA_DATA_ROOT
# environment variable to point the notebook at another location; when it is
# unset (or empty) the original machine-specific path is used.
# os.path.join(..., '') guarantees a trailing separator, which the cells below
# rely on because they build file paths by plain string concatenation.
data_root_dir = os.path.join(
    os.environ.get('WIKIPEDIA_DATA_ROOT')
    or '/media/mohit/E6A87A13A879E30B/WikipediaDataset/data/',
    '',
)
# Number of output parts the merged category pages are split into.
NUM_PARTITIONS = 10

In [18]:
# combine information from full dataset and save category pages from the processed summaries

"""
prepare category page details
1. List of sub-categories
2. List of articles
3. List of parent categories
4. List of internal links on category page

To parallelize:
1. Dump the partial data gathered from each input partition separately.
2. Split the categories into N parts by the hash of their names, then merge the partial data and dump the complete info for each category.
"""

def get_empty_category_data():
    """Build a fresh, empty accumulator record for a single category.

    Each call creates brand-new containers, so two records never share state.
    The 'sub_categories' and 'parent_categories' fields are sets (duplicates
    collapse); 'internal_links' and 'articles' are lists (every appended
    entry is kept, in insertion order).
    """
    empty_record = dict(
        sub_categories=set(),
        internal_links=[],
        articles=[],
        parent_categories=set(),
    )
    return empty_record

# Pass 1: read every processed-summaries part once and write, per input part,
# the partial category information that can be derived from that part alone.
# A category's record is only complete after the per-part dumps are merged.
NUM_INPUT_PARTS = 10  # number of processed_summaries/part-<i>.txt files read below

# Loop-invariant, so create the output directory once up front.
os.makedirs(data_root_dir+'tmp/', exist_ok=True)

start_time = time.time()
processed_line_count = 0  # running total across all input parts
for i in range(NUM_INPUT_PARTS):
    # Reset per part: only the categories seen in this part are held in memory.
    category_name_to_data = {}
    # Explicit UTF-8 so reading does not depend on the machine's locale.
    with open(data_root_dir + f'processed_summaries/part-{i}.txt', 'r', encoding='utf-8') as f:
        for line in f:
            page = json.loads(line)
            namespace = page['namespace']
            title = page['title']
            page_id = page['page_id']
            redirect_title = page['redirect_title']

            if namespace == 14:
                # Category page (14 is MediaWiki's Category namespace): record
                # its own links and parents, and register it as a sub-category
                # of each of its parents.
                # Membership check instead of dict.get(key, <new record>): the
                # latter builds and throws away a fresh record on every lookup.
                if title not in category_name_to_data:
                    category_name_to_data[title] = get_empty_category_data()
                category_data = category_name_to_data[title]
                category_data['internal_links'].extend(page['internal_links'])
                category_data['parent_categories'].update(page['categories'])
                for parent_category in page['categories']:
                    if parent_category not in category_name_to_data:
                        category_name_to_data[parent_category] = get_empty_category_data()
                    category_name_to_data[parent_category]['sub_categories'].add(title)
            elif namespace == 0 and not redirect_title:
                # Non-redirect article (namespace 0): list it under every
                # category it belongs to.
                for category in page['categories']:
                    if category not in category_name_to_data:
                        category_name_to_data[category] = get_empty_category_data()
                    category_name_to_data[category]['articles'].append((title, page_id))
            processed_line_count += 1
            if processed_line_count % 1000000 == 0:
                print(f"Processed {processed_line_count} lines. {len(category_name_to_data)} categories found till now.")
    with open(data_root_dir + f'tmp/partial_category_page_data_part-{i}.txt', 'w', encoding='utf-8') as out_f:
        for cat_name, data in category_name_to_data.items():
            data["category_name"] = cat_name
            # Sets are not JSON-serializable; convert them to lists for dumping.
            data['sub_categories'] = list(data['sub_categories'])
            data['parent_categories'] = list(data['parent_categories'])
            out_f.write(json.dumps(data)+"\n")
    # Elapsed time is cumulative since the start of the whole pass.
    print(f"Finished part {i} in {(time.time() - start_time) / 60} minutes")

Processed 1000000 lines. 667920 categories found till now.
Processed 2000000 lines. 907769 categories found till now.
Finished part 0 in 1.015238106250763 minutes
Processed 3000000 lines. 561780 categories found till now.
Processed 4000000 lines. 844842 categories found till now.
Finished part 1 in 1.8206225077311198 minutes
Processed 5000000 lines. 417658 categories found till now.
Processed 6000000 lines. 767803 categories found till now.
Finished part 2 in 2.534004008769989 minutes
Processed 7000000 lines. 474404 categories found till now.
Processed 8000000 lines. 794315 categories found till now.
Finished part 3 in 3.25574977795283 minutes
Processed 9000000 lines. 503628 categories found till now.
Processed 10000000 lines. 862307 categories found till now.
Finished part 4 in 3.999419860045115 minutes
Processed 11000000 lines. 505264 categories found till now.
Processed 12000000 lines. 861881 categories found till now.
Finished part 5 in 4.74501971801122 minutes
Processed 13000000 l

In [20]:
# Pass 2: merge the per-part partial dumps into complete category records,
# split into NUM_PARTITIONS output files. Each outer iteration re-reads all
# partial dumps but keeps only the categories of one partition in memory.
NUM_INPUT_PARTS = 10  # number of tmp/partial_category_page_data_part-<i>.txt files


def _stable_partition(category_name, num_partitions):
    """Map a category name to a partition index in range(num_partitions).

    CRC-32 of the UTF-8 bytes is used instead of the built-in hash(): str
    hashes are salted per interpreter process (unless PYTHONHASHSEED is
    fixed), so hash() would send the same category to a different output
    part after every kernel restart.
    """
    # 'surrogatepass' keeps names with lone surrogates (possible after
    # json.loads of escaped text) from raising during encoding.
    return zlib.crc32(category_name.encode('utf-8', 'surrogatepass')) % num_partitions


os.makedirs(data_root_dir+'category_pages/', exist_ok=True)
start_time = time.time()
for partition in range(NUM_PARTITIONS):
    category_name_to_data = {}
    for i in range(NUM_INPUT_PARTS):
        with open(data_root_dir + f'tmp/partial_category_page_data_part-{i}.txt', 'r', encoding='utf-8') as f:
            # Iterate lazily instead of f.readlines() so the whole file is
            # never held in memory at once.
            for line in f:
                line = line.strip()
                if line == "":
                    continue
                partial = json.loads(line)
                category_name = partial['category_name']
                if _stable_partition(category_name, NUM_PARTITIONS) != partition:
                    continue
                if category_name not in category_name_to_data:
                    category_name_to_data[category_name] = get_empty_category_data()
                category_data = category_name_to_data[category_name]
                # Set-valued fields de-duplicate across parts; list-valued
                # fields are concatenated as-is.
                category_data['sub_categories'].update(partial['sub_categories'])
                category_data['parent_categories'].update(partial['parent_categories'])
                category_data['internal_links'].extend(partial['internal_links'])
                category_data['articles'].extend(partial['articles'])
    with open(data_root_dir + f'category_pages/part-{partition}.txt', 'w', encoding='utf-8') as out_f:
        for cat_name, merged in category_name_to_data.items():
            merged["category_name"] = cat_name
            # Sets are not JSON-serializable; convert them to lists for dumping.
            merged['sub_categories'] = list(merged['sub_categories'])
            merged['parent_categories'] = list(merged['parent_categories'])
            out_f.write(json.dumps(merged)+"\n")
    # Elapsed time is cumulative since the start of the whole pass.
    print(f"Finished part {partition} in {(time.time() - start_time) / 60} minutes")

                

Finished part 9 in 1.2498801310857137 minutes
Finished part 9 in 2.463934926191966 minutes
Finished part 9 in 3.403915067513784 minutes
Finished part 9 in 4.313976740837097 minutes
Finished part 9 in 5.202356016635894 minutes
Finished part 9 in 6.087162383397421 minutes
Finished part 9 in 6.9568502386411035 minutes
Finished part 9 in 7.881366391976674 minutes
Finished part 9 in 8.744562872250874 minutes
Finished part 9 in 9.610439304510752 minutes


In [30]:
# # Sanity check: inspect a few category pages to confirm that they look good

# select_categories = ["Platonic solids", "Sampling (statistics)", "Drama films"]
# randomly_sampled_category_pages = []
# select_category_pages = []

# start_time = time.time()
# for partition in range(NUM_PARTITIONS):
#     with open(data_root_dir + f'category_pages/part-{partition}.txt', 'r') as f:
#         lines = [line for line in f.readlines() if line!='']
#         for line in lines:
#             data = json.loads(line)
#             if data['category_name'] in select_categories:
#                 select_category_pages.append(data)
#         for line in random.sample(lines, 10):
#             randomly_sampled_category_pages.append(json.loads(line))
#     print(f"Processed till part {partition} in {(time.time() - start_time) / 60} minutes")

In [31]:
# select_category_pages[2]
# randomly_sampled_category_pages[67]