In [1]:
import pandas as pd
import zipfile
import pandas as pd
import os
import json
import re
import ast
import matplotlib
import multiprocessing

## Create statistics files for English tables

In [None]:
# Existing English tables: the text file lists one table file name per line.
# The context manager closes the file as soon as the names have been read.
with open("output-data/english_table_names.txt", 'r') as table_names_file:
    existing_english_tables = [line.replace('\n', '') for line in table_names_file.readlines()]

# `existing` used to be the (exhausted) file handle. It now aliases the list of names, so
# any code still using the old name gets list semantics for membership and concatenation.
existing = existing_english_tables
len(existing_english_tables)

In [None]:
# New English tables: every directory entry is taken as one table file name.
new_tables_dir = 'output-data/new-english-tables/'
new_english_tables = os.listdir(new_tables_dir)
len(new_english_tables)

In [None]:
def get_table_statistics(file_name):
    """Compute size and density statistics for one gzipped JSON-lines table.

    The table is read from 'data/original-corpus-data/' when ``file_name`` is listed in
    ``existing_english_tables`` and from 'output-data/new-english-tables/' otherwise.
    The 'page_url' column is dropped before any statistic is computed.

    Parameters
    ----------
    file_name : str
        Name of the table file (without directory).

    Returns
    -------
    list
        ``[file_name, number_of_rows, column_count, column_name_and_density,
        overall_table_density]`` where ``column_name_and_density`` maps each column name
        to the percentage (truncated to int) of non-missing cells in that column, and
        ``overall_table_density`` is the same percentage over the whole table.
        Densities are reported as 0 for a table without rows or without columns.

    Raises
    ------
    KeyError
        If the table has no 'page_url' column.
    """
    # Membership has to be tested against the list of names; the `existing` file handle
    # is exhausted after `readlines()` and would never match.
    if file_name in existing_english_tables:
        path = 'data/original-corpus-data/' + file_name
    else:
        path = 'output-data/new-english-tables/' + file_name

    df = pd.read_json(path, compression='gzip', lines=True)
    df = df.drop(columns=['page_url'])

    number_of_rows = len(df.index)
    column_count = len(df.columns)
    empty_per_column = df.isna().sum()
    empty_cells = empty_per_column.sum()
    total_cells = number_of_rows * column_count

    # Guard against empty tables, which would otherwise divide by zero.
    if total_cells:
        overall_table_density = int((total_cells - empty_cells) / total_cells * 100)
    else:
        overall_table_density = 0

    # `Series.items()` replaces `iteritems()`, which was removed in pandas 2.0.
    column_name_and_density = {}
    for column_name, empty_count in empty_per_column.items():
        if number_of_rows:
            density = int(((number_of_rows - empty_count) / number_of_rows) * 100)
        else:
            density = 0
        column_name_and_density[column_name] = density

    return [file_name, number_of_rows, column_count, column_name_and_density, overall_table_density]

In [None]:
# Compute the statistics for every table in parallel.
# The inputs are the two lists of file names; the `existing` file handle cannot be
# concatenated with a list.
all_table_files = existing_english_tables + new_english_tables

pool = multiprocessing.Pool(processes=20)
try:
    # `map` blocks until every table has been processed and keeps the input order.
    res = pool.map(get_table_statistics, all_table_files)
finally:
    # Release the worker processes even if one of the tasks raised.
    pool.close()
    pool.join()

In [None]:
# One row per entry of `res`; the column order mirrors the list returned by
# get_table_statistics.
statistic_columns = [
    'file_name',
    'number_of_rows',
    'column_count',
    'column_name_and_density',
    'overall_table_density',
]
statistics = pd.DataFrame(res, columns=statistic_columns)
statistics

## Choose tables with at least 10 rows

In [4]:
# Keep only the tables that have at least 10 rows.
has_min_rows = statistics['number_of_rows'] >= 10
minimum10 = statistics.loc[has_min_rows]
minimum10

Unnamed: 0,file_name,number_of_rows,column_count,column_name_and_density,overall_table_density
0,School_schoolscompared.com_September2020.json.gz,66,3,"{'name': 100, 'telephone': 100, 'aggregaterati...",100
1,Park_discoverdupage.com_September2020.json.gz,12,5,"{'name': 100, 'address': 100, 'description': 1...",98
2,Painting_josephraffael.com_September2020.json.gz,19,9,"{'name': 100, 'description': 100, 'author': 52...",67
3,Park_visitnordjylland.com_September2020.json.gz,26,7,"{'name': 100, 'address': 100, 'description': 1...",86
4,Park_visithoustontexas.com_September2020.json.gz,15,5,"{'name': 100, 'address': 100, 'description': 1...",88
...,...,...,...,...,...
1795120,Product_leickert-dental.de_September2020.json.gz,156,7,"{'name': 100, 'offers': 100, 'description': 10...",91
1795121,Product_marijuanabudsstore.com_September2020.j...,35,5,"{'name': 100, 'offers': 100, 'description': 10...",92
1795124,Product_tomjanuaryfloors.com_September2020.jso...,1043,6,"{'name': 100, 'offers': 100, 'description': 10...",100
1795125,Product_shop-kvplus.ru_September2020.json.gz,206,3,"{'name': 100, 'offers': 100, 'description': 100}",100


## Column Names Statistics 
Regroup the table statistics by unique column name within each class.

In [5]:
# Convert the filtered statistics into a list of per-table dicts (one dict per row).
min10_dict = minimum10.to_dict(orient='records')

In [6]:
# Show the first record to check the structure of the per-table dicts.
min10_dict[0]

{'file_name': 'School_schoolscompared.com_September2020.json.gz',
 'number_of_rows': 66,
 'column_count': 3,
 'column_name_and_density': "{'name': 100, 'telephone': 100, 'aggregaterating': 100}",
 'overall_table_density': 100}

In [7]:
# Group the column statistics by class and column name:
# colnames[class][column] is a list with one
# [number_of_rows, column density, file_name, overall_table_density] entry per table.
colnames = {}
for row in min10_dict:
    # The class is the part of the file name before the first underscore.
    class_ = row['file_name'].split('_')[0]

    # The densities are a dict when the statistics were computed in this session, but a
    # string repr of that dict in the record displayed above (e.g. after a CSV round
    # trip). `ast.literal_eval` only accepts the string form, so parse only when needed.
    cols = row['column_name_and_density']
    if not isinstance(cols, dict):
        cols = ast.literal_eval(cols)

    class_columns = colnames.setdefault(class_, {})
    for col, col_density in cols.items():
        class_columns.setdefault(col, []).append(
            [row['number_of_rows'], col_density, row['file_name'], row['overall_table_density']]
        )

In [8]:
# Flatten the nested class -> column -> tables dictionary into one record per
# (class, column name) pair, ready to be loaded into a dataframe.
allcols = []
for class_name, class_columns in colnames.items():
    for column_name, occurrences in class_columns.items():
        # Each occurrence is [number_of_rows, column density, file_name, overall_table_density].
        row_total = sum(entry[0] for entry in occurrences)

        # file_name -> [overall_table_density, density of this column in that table]
        density_by_table = {entry[2]: [entry[3], entry[1]] for entry in occurrences}

        allcols.append([class_name, column_name, len(occurrences), row_total, density_by_table])

In [9]:
# One row per (class, column name): number of tables using the column, total number of
# rows in those tables, and the per-table densities.
all_cols_columns = ['class', 'column_name', 'table_number', 'row_number', 'table_and_density']
all_cols = pd.DataFrame(allcols, columns=all_cols_columns)
all_cols

Unnamed: 0,class,column_name,table_number,row_number,table_and_density
0,School,name,145,85319,{'School_schoolscompared.com_September2020.jso...
1,School,telephone,72,52788,{'School_schoolscompared.com_September2020.jso...
2,School,aggregaterating,43,37264,{'School_schoolscompared.com_September2020.jso...
3,School,address,124,78550,{'School_ville-data.com_September2020.json.gz'...
4,School,geo,23,2830,{'School_ville-data.com_September2020.json.gz'...
...,...,...,...,...,...
6129,Recipe,recipecooktime,1,15,{'Recipe_marubotana.tv_September2020.json.gz':...
6130,Recipe,mentions,1,103,{'Recipe_uol.com.br_September2020.json.gz': [9...
6131,Recipe,reviewrating,1,63,{'Recipe_99juices.com_September2020.json.gz': ...
6132,Recipe,discussionurl,1,24,{'Recipe_manjoo.it_September2020.json.gz': [10...


## Match to Schema.org Properties and Types

In [11]:
# For each (class, column name) pair, look the column up in the class's propsToTypes
# file (case-insensitive match on '<class>.<column>') and record the schema.org property
# label and its expected types. Unmatched columns get empty strings for both.
result_all = []

# class name -> (propsToTypes frame, lower-cased 'property' column).
# The CSV only depends on the class, so it is read and lower-cased once per class
# instead of once per row of `all_cols`.
props_by_class = {}

for index, row in all_cols.iterrows():
    col_name = str(row['column_name'])
    class_ = row['class']

    if class_ not in props_by_class:
        types = pd.read_csv('../PropsToTypes/' + class_ + '_propsToTypes.csv')
        props_by_class[class_] = (types, types['property'].str.lower())
    types, lowered_properties = props_by_class[class_]

    find_label = (class_ + '.' + col_name).lower()
    is_label = types.loc[lowered_properties == find_label]

    final_label = ''
    final_type = ''

    if len(is_label) != 0:
        # Several rows may match; the first one wins.
        first_match = is_label.iloc[0]
        final_label = first_match['property']
        final_type = first_match['expected_types']

    result_all.append([class_, col_name, final_label, final_type, row['table_number'], row['row_number'], row['table_and_density']])

In [12]:
# Column statistics extended with the matched property label and expected type(s).
schema_columns = [
    'class',
    'column_name',
    'relation_label',
    'type_label',
    'table_number',
    'row_number',
    'table_and_density',
]
cols_to_schema = pd.DataFrame(result_all, columns=schema_columns)
cols_to_schema

Unnamed: 0,class,column_name,relation_label,type_label,table_number,row_number,table_and_density
0,School,name,School.name,Text,145,85319,{'School_schoolscompared.com_September2020.jso...
1,School,telephone,School.telephone,Text,72,52788,{'School_schoolscompared.com_September2020.jso...
2,School,aggregaterating,School.aggregateRating,AggregateRating,43,37264,{'School_schoolscompared.com_September2020.jso...
3,School,address,School.address,"['PostalAddress', 'Text']",124,78550,{'School_ville-data.com_September2020.json.gz'...
4,School,geo,School.geo,"['GeoCoordinates', 'GeoShape']",23,2830,{'School_ville-data.com_September2020.json.gz'...
...,...,...,...,...,...,...,...
6129,Recipe,recipecooktime,,,1,15,{'Recipe_marubotana.tv_September2020.json.gz':...
6130,Recipe,mentions,Recipe.mentions,Thing,1,103,{'Recipe_uol.com.br_September2020.json.gz': [9...
6131,Recipe,reviewrating,,,1,63,{'Recipe_99juices.com_September2020.json.gz': ...
6132,Recipe,discussionurl,Recipe.discussionUrl,URL,1,24,{'Recipe_manjoo.it_September2020.json.gz': [10...


### Matching to all properties regardless of type

In [13]:
# Keep only the column names that matched a schema.org property; an empty
# relation_label marks a column name without a match.
has_property_match = cols_to_schema['relation_label'] != ''
cols = cols_to_schema.loc[has_property_match]
cols

Unnamed: 0,class,column_name,relation_label,type_label,table_number,row_number,table_and_density
0,School,name,School.name,Text,145,85319,{'School_schoolscompared.com_September2020.jso...
1,School,telephone,School.telephone,Text,72,52788,{'School_schoolscompared.com_September2020.jso...
2,School,aggregaterating,School.aggregateRating,AggregateRating,43,37264,{'School_schoolscompared.com_September2020.jso...
3,School,address,School.address,"['PostalAddress', 'Text']",124,78550,{'School_ville-data.com_September2020.json.gz'...
4,School,geo,School.geo,"['GeoCoordinates', 'GeoShape']",23,2830,{'School_ville-data.com_September2020.json.gz'...
...,...,...,...,...,...,...,...
6114,Recipe,locationcreated,Recipe.locationCreated,Place,1,14,{'Recipe_cailler.ch_September2020.json.gz': [9...
6118,Recipe,version,Recipe.version,"['Number', 'Text']",2,91,{'Recipe_guideme.me_September2020.json.gz': [8...
6119,Recipe,contentlocation,Recipe.contentLocation,Place,2,1300,{'Recipe_arecetas.com_September2020.json.gz': ...
6130,Recipe,mentions,Recipe.mentions,Thing,1,103,{'Recipe_uol.com.br_September2020.json.gz': [9...


In [18]:
# Persist the matched column-to-property mapping without the dataframe index.
mapping_csv_path = 'output-data/statistics/column_label_mapping.csv'
cols.to_csv(mapping_csv_path, index=False)