In [1]:
import ast
import multiprocessing
import os
import random

import pandas as pd

In [2]:
# Schema.org types/classes with the largest number of tables
selected_classes = [
    'Product',
    'Event',
    'Recipe',
    'Person',
    'LocalBusiness',
    'JobPosting',
    'CreativeWork',
    'Restaurant',
    'Place',
    'Movie',
    'Book',
    'MusicRecording',
    'Hotel',
    'SportsEvent',
    'MusicAlbum',
    'TVEpisode',
    'Museum',
]

In [10]:
# Keep only the CPA labels of the selected classes that occur in at least
# MIN_TABLES_PER_LABEL tables.
MIN_TABLES_PER_LABEL = 50

# Built as one chain ending in .assign(): adding the 'cpa' column to a
# .loc-filtered slice by item assignment triggers pandas' chained-assignment
# warning, whereas .assign() returns a new frame.
column_label_mapping = (
    pd.read_csv('output-data/statistics/column_label_mapping.csv')
    .loc[lambda frame: frame['class'].isin(selected_classes)]
    .loc[lambda frame: frame['table_number'] >= MIN_TABLES_PER_LABEL]
    # relation_label looks like '<Class>.<property>'; the CPA label is the part after the dot.
    .assign(cpa=lambda frame: frame['relation_label'].apply(lambda label: label.split('.')[1]))
)
column_label_mapping

Unnamed: 0,class,column_name,relation_label,type_label,table_number,row_number,table_and_density,cpa
95,Museum,name,Museum.name,Text,181,13785,{'Museum_takemetotheworld.com_September2020.js...,name
96,Museum,address,Museum.address,"['PostalAddress', 'Text']",180,12743,{'Museum_takemetotheworld.com_September2020.js...,address
97,Museum,telephone,Museum.telephone,Text,160,11710,{'Museum_takemetotheworld.com_September2020.js...,telephone
98,Museum,description,Museum.description,Text,153,9800,{'Museum_takemetotheworld.com_September2020.js...,description
99,Museum,geo,Museum.geo,"['GeoCoordinates', 'GeoShape']",158,11184,{'Museum_takemetotheworld.com_September2020.js...,geo
...,...,...,...,...,...,...,...,...
1625,Recipe,cookingmethod,Recipe.cookingMethod,Text,421,85673,{'Recipe_yummly.com_September2020.json.gz': [8...,cookingMethod
1628,Recipe,inlanguage,Recipe.inLanguage,"['Language', 'Text']",80,22669,{'Recipe_giallozafferano.it_September2020.json...,inLanguage
1630,Recipe,performtime,Recipe.performTime,Duration,855,85364,{'Recipe_ricettario-bimby.it_September2020.jso...,performTime
1632,Recipe,ispartof,Recipe.isPartOf,"['CreativeWork', 'URL']",3719,367817,{'Recipe_thriftandspice.com_September2020.json...,isPartOf


In [6]:
# Manually checked annotation file (can be downloaded from the website)
annotations_path = 'data/Final CTA and CPA Labels.csv'
matched = pd.read_csv(annotations_path)
matched

Unnamed: 0,class,column_name,CPA label,CTA label,table_number,fallback_CTA_label
0,Product,name,NONE,Product/name,810710,
1,Product,offers,offers,Offer,782333,
2,Product,offers:pricecurrency,priceCurrency,currency,763066,
3,Product,offers:price,price,price,716385,
4,Product,offers:availability,availability,ItemAvailability,432124,
...,...,...,...,...,...,...
667,Museum,telephone,telephone,telephone,160,
668,Museum,geo,geo,"GeoCoordinates, GeoShape",158,geo
669,Museum,geo:latitude,latitude,CoordinateAT,158,
670,Museum,geo:longitude,longitude,CoordinateAT,158,


In [5]:
# Schema.org type, column names and properties to expand to new columns
# Example: {Product: {offers: ['price', 'pricecurrency',...], ...}}
class_to_prop_to_table = {}

# Only the 'class' and 'column_name' columns are needed, so iterate over those
# two directly instead of materialising every row with iterrows().
for class_name, column_name in zip(matched['class'], matched['column_name']):
    columns_of_class = class_to_prop_to_table.setdefault(class_name, {})

    # 'main:prop' marks a property nested inside column 'main'; a plain name
    # has no nested part. partition() splits on the first ':' only, so a name
    # with several colons no longer fails on tuple unpacking.
    main, _, prop = column_name.partition(':')

    # setdefault() registers the parent column on first sight, so a nested
    # entry that appears before (or without) its parent row does not raise
    # KeyError.
    expanded_props = columns_of_class.setdefault(main, [])
    if prop:
        expanded_props.append(prop)
    

In [6]:
# Inspect the column -> expandable-properties mapping built for the 'Product' class
class_to_prop_to_table['Product']

{'name': [],
 'offers': ['pricecurrency',
  'price',
  'availability',
  'pricevaliduntil',
  'itemcondition',
  'description',
  'sku',
  'category',
  'mpn',
  'validfrom',
  'inventorylevel',
  'serialnumber',
  'gtin',
  'availabledeliverymethod',
  'gtin13',
  'warranty',
  'acceptedpaymentmethod',
  'validthrough',
  'availabilitystarts',
  'deliveryleadtime',
  'gtin14',
  'eligiblequantity',
  'eligibleregion',
  'pricespecification',
  'identifier',
  'availabilityends',
  'gtin8'],
 'description': [],
 'sku': [],
 'url': [],
 'brand': ['name'],
 'image': [],
 'aggregaterating': ['ratingvalue',
  'reviewcount',
  'bestrating',
  'ratingcount',
  'worstrating',
  'itemreviewed'],
 'productid': [],
 'mpn': [],
 'category': ['name'],
 'review': ['reviewbody', 'description'],
 'manufacturer': ['name'],
 'model': ['name'],
 'itemcondition': [],
 'gtin13': [],
 'weight': ['value', 'unitcode', 'unittext'],
 'releasedate': [],
 'color': [],
 'identifier': ['value'],
 'width': ['value'

In [11]:
# Collect the names of all tables referenced by the selected labels.
# Each 'table_and_density' cell holds the text of a dict literal keyed by
# table file name.
tables = set()
for table_and_density in column_label_mapping['table_and_density']:
    # literal_eval only parses Python literals, so unlike eval() a malformed
    # or tampered CSV cell cannot execute arbitrary code.
    tables.update(ast.literal_eval(table_and_density))

In [12]:
# Tables to expand: turn the collected set of names into a list
tables = [*tables]
len(tables)

1054154

### Get values from tables

In [None]:
# Existing English tables: one table file name per line
# The with-block closes the file as soon as the names are read.
with open("output-data/english_table_names.txt", 'r') as names_file:
    existing_english_tables = [line.replace('\n', '') for line in names_file]

# Bind `existing` to the set of names rather than to a file handle, so that
# `name in existing` is a real (and O(1)) membership test against the
# newline-stripped names.
existing = set(existing_english_tables)
len(existing_english_tables)

In [28]:
def _text_length(value):
    """Return ``len(value)``, or 0 for ``None`` and any other value without a length."""
    try:
        return len(value)
    except TypeError:
        return 0


def _count_selected_props(values, expand):
    """Count, per property listed in ``expand``, the rows of ``values`` containing it.

    A row contributes when it is a dictionary, or a list holding dictionaries;
    a property is counted at most once per row.

    Returns ``(has_dicts, counts)``: whether any dictionary was seen at all,
    and a dict mapping property name -> number of rows in which it appears.
    """
    has_dicts = False
    counts = {}
    for value in values:
        if isinstance(value, dict):
            has_dicts = True
            found = [prop for prop in value if prop in expand]
        elif isinstance(value, list):
            # A set, so a property repeated across list elements counts once per row
            found = set()
            for element in value:
                if isinstance(element, dict):
                    has_dicts = True
                    found.update(prop for prop in element if prop in expand)
        else:
            continue
        for prop in found:
            counts[prop] = counts.get(prop, 0) + 1
    return has_dicts, counts


def _collect_prop_values(values, props):
    """Build one value list per property in ``props``, aligned row by row with ``values``.

    Exactly one entry is appended per row and property, whatever the row type,
    so every returned list has ``len(values)`` entries:

    * dictionary row: the property's value, or ``None`` when it is absent
    * list row: a list with one entry per dict/str item (dict -> property
      value or ``None``, str -> the string itself), or ``None`` when the
      list has no dict/str item
    * string row: the string itself
    * anything else (missing value, number, ...): ``None``
    """
    collected = {prop: [] for prop in props}
    for value in values:
        if isinstance(value, dict):
            for prop in props:
                collected[prop].append(value.get(prop))
        elif isinstance(value, list):
            per_prop = {prop: [] for prop in props}
            for item in value:
                if isinstance(item, dict):
                    for prop in props:
                        per_prop[prop].append(item.get(prop))
                elif isinstance(item, str):
                    for prop in props:
                        per_prop[prop].append(item)
            for prop in props:
                # An empty list means no usable item: store None so the row still gets an entry
                collected[prop].append(per_prop[prop] or None)
        elif isinstance(value, str):
            for prop in props:
                collected[prop].append(value)
        else:
            for prop in props:
                collected[prop].append(None)
    return collected


def _shuffle_columns(col_name_list):
    """Randomly swap neighbouring entries of ``col_name_list`` in place and return it.

    Position 0 is swapped with position 1 with a 20% probability; every
    position from 2 up to the second to last is swapped with its right
    neighbour with a 70% probability.
    """
    last = len(col_name_list) - 1
    for i in range(len(col_name_list)):
        percentage = 20 if i <= 1 else 70
        if i != 1 and i != last:
            if random.choices([0, 1], weights=(100 - percentage, percentage))[0]:
                col_name_list[i], col_name_list[i + 1] = col_name_list[i + 1], col_name_list[i]
    return col_name_list


def expand_table(file_name):
    """Expand the dictionary-valued columns of one table and write the result.

    The table is read as gzip-compressed JSON lines. Columns that are not
    listed for the table's class in ``class_to_prop_to_table`` are dropped.
    For a listed column that contains dictionaries, every listed property
    present in at least 80% of the column's non-null rows becomes a new
    column named ``'<column>:<property>'`` and the original column is
    dropped. Listed columns without dictionaries are kept unchanged. The
    column order is then randomly perturbed and the table is written to
    ``output-data/expanded-tables/`` under the same file name.

    Args:
        file_name: File name of the table, e.g.
            ``'Restaurant_ifoodie.tw_September2020.json.gz'``.

    Returns:
        None. If anything fails while processing, the file name and the
        error are printed and no output file is written.
    """
    # The class is the prefix of the file name ('<Class>_<domain>_...').
    # NOTE(review): derived from the file-name pattern seen in this notebook;
    # confirm no class name contains an underscore.
    class_ = file_name.split('_')[0]

    # Test against the list of names, not the file handle `existing`, whose
    # lines still carry their trailing newline and so never match.
    if file_name in existing_english_tables:
        file = 'data/original-corpus-data/' + file_name
    else:
        file = 'output-data/new-english-tables/' + file_name

    try:
        # Open table
        df = pd.read_json(file, compression='gzip', lines=True)
        props_of_class = class_to_prop_to_table[class_]
        # New columns to add from expanded properties
        new_columns = {}

        # Iterate over a snapshot because columns are dropped inside the loop
        for column_name in list(df.columns):

            # Drop if column not in selected properties
            if column_name not in props_of_class:
                df.drop(column_name, inplace=True, axis=1)
                continue

            # Which properties are to be expanded for this column
            expand = props_of_class[column_name]
            values = df[column_name].tolist()
            row_number = int(df[column_name].notna().sum())

            # First phase: count in how many rows each selected property appears
            dictionaries, dict_keys = _count_selected_props(values, expand)
            if not dictionaries:
                # Column holds no dictionaries: keep it as it is
                continue

            # Second phase: properties with at least 80% density become new
            # columns (row_number >= 1 here, since a dictionary row is non-null)
            dense_props = [prop for prop, count in dict_keys.items()
                           if (count / row_number) >= 0.8]
            sel_props = _collect_prop_values(values, dense_props)

            # Manual checks:
            # If review:reviewbody and review:description are both selected,
            # keep the longest of the two values in reviewbody
            if column_name == 'review' and 'reviewbody' in sel_props and 'description' in sel_props:
                sel_props['reviewbody'] = [
                    body if _text_length(body) >= _text_length(description) else description
                    for body, description in zip(sel_props['reviewbody'], sel_props['description'])
                ]
                del sel_props['description']

            # Skip expanded contact properties when the table already has
            # a column with that name
            for contact in ('telephone', 'faxnumber', 'email'):
                if contact in sel_props and contact in df.columns:
                    del sel_props[contact]

            # Add values to new_columns
            for prop in sel_props:
                new_columns[column_name + ':' + prop] = sel_props[prop]

            # Delete old column with dictionaries, also when no property was
            # selected (url, image, photo, video)
            df.drop(column_name, inplace=True, axis=1)

        # Add new columns to dataframe
        for column, column_values in new_columns.items():
            df[column] = column_values

        # Re-arrange column order (see _shuffle_columns) and write the table
        col_name_list = _shuffle_columns(df.columns.tolist())
        df = df[col_name_list]
        df.to_json('output-data/expanded-tables/'+file_name, orient='records', lines=True, compression='gzip')

    except Exception as error:
        # Best effort: report the failed table together with the reason and go on
        print(file_name, repr(error))


In [19]:
#Manual removal
# Filtering instead of tables.remove() keeps this cell idempotent: it can be
# re-run, or run when the name is absent, without raising.
excluded_tables = {'Restaurant_ifoodie.tw_September2020.json.gz'}
tables = [table for table in tables if table not in excluded_tables]
#Remove tables
#Movie_hotspur.ru_September2020.json.gz

In [29]:
# expand_table writes into this directory; create it up front so that a
# missing directory does not make every single table fail.
os.makedirs('output-data/expanded-tables/', exist_ok=True)

# The context manager shuts the worker processes down even if map() raises
# or is interrupted. map() only returns once every table has been processed,
# so nothing is cut short on the normal path.
with multiprocessing.Pool(processes=30) as pool:
    res = pool.map(expand_table, tables)