In [None]:
import pandas as pd
import zipfile
import pandas as pd
import os
import json
import re
import ast
import matplotlib
import multiprocessing
from tqdm import tqdm

## Unzip top100 and minimum3 tables

In [None]:
# Directory the archives are extracted INTO. The statistics cells list this
# directory and read the gzip-compressed tables from it.
original_data_path = 'data/stc-zip-files/'

# Directory that is scanned for the downloaded Schema.org Table Corpus zip
# archives (top100 and minimum3). Extracting everything can take a few hours.
unzipped_path = 'data/original-corpus-data/'
for file in os.listdir(unzipped_path):
    archive_path = os.path.join(unzipped_path, file)
    # Skip stray entries (sub-directories, hidden files, ...): opening them
    # with ZipFile would raise BadZipFile and abort the whole extraction.
    if not zipfile.is_zipfile(archive_path):
        continue
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(original_data_path)

# Names of all extracted table files; the cell output is how many there are.
tables = os.listdir(original_data_path)
len(tables)

## Create Statistic files

In [None]:
def get_table_statistics(file_name, data_dir='data/stc-zip-files/'):
    """Compute size and density statistics for one table file.

    The file is read as gzip-compressed JSON lines. The ``row_id`` and
    ``page_url`` columns are dropped before anything is counted, so the
    statistics only describe the remaining columns.

    Parameters
    ----------
    file_name : str
        Name of the table file inside ``data_dir``.
    data_dir : str, optional
        Directory that contains the table files. Defaults to the folder the
        corpus archives are extracted into.

    Returns
    -------
    list or None
        ``[file_name, number_of_rows, column_count, column_name_and_density,
        overall_table_density]``. Densities are integer percentages
        (truncated) of non-missing cells; ``column_name_and_density`` maps
        each column name to its own density. ``None`` is returned, and the
        file name printed, when no density can be computed for the table.

    Raises
    ------
    KeyError
        If the table lacks the ``row_id`` or ``page_url`` column.
    """
    file = os.path.join(data_dir, file_name)

    df = pd.read_json(file, compression='gzip', lines=True)
    df = df.drop(['row_id', 'page_url'], axis=1)

    number_of_rows = len(df.index)
    column_count = len(df.columns)
    total_cells = number_of_rows * column_count

    # A table without any data cell has no defined density. Report it
    # explicitly instead of relying on a failing division further down.
    if total_cells == 0:
        print(file_name)
        return None

    try:
        # Missing cells per column, computed once and reused for both the
        # overall and the per-column density.
        missing_per_column = df.isna().sum()
        empty_cells = missing_per_column.sum()

        overall_table_density = int((total_cells - empty_cells) / total_cells * 100)

        column_name_and_density = {}
        # Series.items() replaces iteritems(), which was removed in pandas 2.0.
        for column_name, missing in missing_per_column.items():
            column_name_and_density[column_name] = int(((number_of_rows - missing) / number_of_rows) * 100)

        return [file_name, number_of_rows, column_count, column_name_and_density, overall_table_density]

    except ValueError:
        # Kept from the original implementation as a safety net: tables whose
        # densities cannot be converted to an integer are reported and skipped.
        print(file_name)
        return None

In [None]:
# Compute the statistics of every table in parallel with 30 worker processes.
# The context manager shuts the workers down even when pool.map raises, so a
# failed run does not leave orphaned processes behind. map() only returns
# after every result has been collected, so nothing is lost on exit.
with multiprocessing.Pool(processes=30) as pool:
    res = pool.map(get_table_statistics, tables)

In [None]:
# Discard the falsy results (get_table_statistics returns None for tables it
# could not process) and collect the rest into a DataFrame.
r = list(filter(None, res))
statistics = pd.DataFrame(
    r,
    columns=[
        'file_name',
        'number_of_rows',
        'column_count',
        'column_name_and_density',
        'overall_table_density',
    ],
)
statistics

In [None]:
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs('output-data/statistics', exist_ok=True)
statistics.to_csv('output-data/statistics/table_statistics.csv', index=False)

## Choose tables with at least 10 rows

In [None]:
# Keep only the tables that have at least 10 rows
has_min_rows = statistics['number_of_rows'].ge(10)
minimum10 = statistics[has_min_rows]
minimum10

## Column Names Statistics 
Regroup the per-table statistics by unique column name within each class.

In [None]:
# One dictionary per table (row of minimum10), keyed by column name
min10_dict = minimum10.to_dict(orient='records')

In [None]:
# Peek at the first record to check the structure of the dictionaries
min10_dict[0]

In [None]:
# Group the column statistics per class:
#   colnames[class][column name] -> list with one entry per table, each entry
#   being [number_of_rows, column density, file_name, overall_table_density]
colnames = {}
for record in min10_dict:
    # The class is the part of the file name before the first underscore
    table_class = record['file_name'].split('_')[0]
    densities = record['column_name_and_density']

    columns_of_class = colnames.setdefault(table_class, {})

    for column_name in densities:
        entry = [
            record['number_of_rows'],
            densities[column_name],
            record['file_name'],
            record['overall_table_density'],
        ]
        columns_of_class.setdefault(column_name, []).append(entry)

In [None]:
# Flatten the nested dictionary into one row per (class, column name):
# [class, column name, number of tables, total number of rows, tables and densities]
allcols = []
for class_name, columns_of_class in colnames.items():
    for column_name, entries in columns_of_class.items():
        # Each entry is [number_of_rows, column density, file_name, overall_table_density]
        total_rows = sum(entry[0] for entry in entries)

        # file name -> [overall_table_density, density of this column in that table]
        # e.g. key Museum_takemetotheworld.com_September2020.json.gz
        tables_and_density = {entry[2]: [entry[3], entry[1]] for entry in entries}

        allcols.append([class_name, column_name, len(entries), total_rows, tables_and_density])

In [None]:
# One row per (class, column name) with its table count, row count and densities
all_cols = pd.DataFrame(
    allcols,
    columns=[
        'class',
        'column_name',
        'table_number',
        'row_number',
        'table_and_density',
    ],
)
all_cols

## Match to Schema.org Properties and Types

In [None]:
# For every (class, column name) row, look the column up in the propsToTypes
# file of its class and record the matching property label and expected types.
# Rows without a match keep an empty label and type.
result_all = []

# class -> (propsToTypes frame, lower-cased 'property' column). Each file is
# read and lower-cased once per class instead of once per row.
props_by_class = {}

for index, row in all_cols.iterrows():
    col_name = str(row['column_name'])
    class_ = row['class']

    if class_ not in props_by_class:
        types = pd.read_csv('data/PropsToTypes/' + class_ + '_propsToTypes.csv')
        props_by_class[class_] = (types, types['property'].str.lower())
    types, properties_lower = props_by_class[class_]

    # Properties are compared case-insensitively in the form "<class>.<column name>"
    find_label = (class_ + '.' + col_name).lower()
    is_label = types.loc[properties_lower == find_label]

    final_label = ''
    final_type = ''

    if not is_label.empty:
        # If several properties match, the first one is used
        final_label = is_label['property'].iloc[0]
        final_type = is_label['expected_types'].iloc[0]

    result_all.append([class_, col_name, final_label, final_type, row['table_number'], row['row_number'], row['table_and_density']])

In [None]:
# Column statistics together with the matched schema.org property and types
cols_to_schema = pd.DataFrame(
    result_all,
    columns=[
        'class',
        'column_name',
        'relation_label',
        'type_label',
        'table_number',
        'row_number',
        'table_and_density',
    ],
)
cols_to_schema

In [None]:
# Keep only the column names that matched a schema.org property; unmatched
# names were left with an empty relation label and are dropped here.
has_relation_label = cols_to_schema['relation_label'].ne('')
cols = cols_to_schema[has_relation_label]
cols

In [None]:
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs('output-data/statistics', exist_ok=True)
cols.to_csv('output-data/statistics/column_label_mapping.csv', index=False)