In [None]:
import pandas as pd
import ast
import copy

In [None]:
# Load the per-label CTA statistics and keep only rows that have a label
# and a column_count of at least 50.
cta_statistics = pd.read_csv('output-data/statistics/cta_statistics.csv')
has_label = cta_statistics['cta_label'].notna()
has_enough_columns = cta_statistics['column_count'] >= 50
cta_statistics = cta_statistics[has_label & has_enough_columns]
cta_statistics

In [None]:
# Load the expanded table annotations; the cells below read its 'file_name',
# 'type_labels', 'all_cols', 'class' and 'overall_table_density' columns.
annotated_tables = pd.read_csv('output-data/statistics/expanded_tables_annotations.csv')

In [None]:
# Map '<column>_<file_name>' -> stripped CTA label for every usable column.
# Columns whose label is missing (None) or marked 'Wrong' are left out.
type_lbls = {}
for index, row in annotated_tables.iterrows():
    # Parse the stringified {column: label} dict once per table instead of
    # re-evaluating it for every comparison.
    # NOTE(review): eval executes whatever the CSV cell contains; if the cells
    # only ever hold literals, ast.literal_eval (ast is already imported) would
    # be the safer parser - confirm before switching.
    labels = eval(row['type_labels'])
    for col, label in labels.items():
        if label is not None and label != 'Wrong':
            type_lbls[col + '_' + row['file_name']] = label.strip()

In [None]:
# Keep only the tables that have at least 3 columns
tabs = pd.read_csv('output-data/statistics/expanded_tables_annotations_cta.csv')
has_min_columns = tabs['column_count'] >= 3
tabs = tabs[has_min_columns]
tabs

In [None]:
# Distinct file names of the tables kept above, followed by how many there are
file_names = set(tabs['file_name'])
len(file_names)

In [None]:
# Columns to leave out of the selection: every column whose density is below 70.
# Entries are keyed as '<column>_<file_name>'.
tables_to_dict = annotated_tables.to_dict('records')
ignore = set()
for table in tables_to_dict:
    densities = eval(table['all_cols'])  # stringified {column: density} dict
    ignore.update(
        col + '_' + table['file_name']
        for col, density in densities.items()
        if density < 70
    )

In [None]:
#Create a file to hold already selected columns from tables and their selection type
# Selection type: 'Missing values', 'Intra similarity', 'Inter similarity/dissimilarity', 'format heterogeneity'

## 1. Corner Cases Columns Selection

In [None]:
# Load the pre-computed similarity files (numeric, datetime and text columns)
num_sim = pd.read_csv('output-data/similarities/subset8000_num_sim_cta.csv.gz', compression='gzip')
num_sim2 = pd.read_csv('output-data/similarities/subset7000_num_sim_cta.csv.gz', compression='gzip')
date_sim = pd.read_csv('output-data/similarities/subset6000_datetime_sim.csv.gz', compression='gzip')
date_sim2 = pd.read_csv('output-data/similarities/subset6000_2_datetime_sim.csv.gz', compression='gzip')
text_sim = pd.read_csv('output-data/similarities/subset219000_textcols_coltype_sim.csv.gz', compression='gzip')
text_sim2 = pd.read_csv('output-data/similarities/subset150000_textcols_coltype_sim.csv.gz', compression='gzip')

# Stack them into a single frame with a fresh index
similarities = pd.concat(
    [num_sim, num_sim2, date_sim, date_sim2, text_sim, text_sim2],
    ignore_index=True,
)


def _file_name_of(col_key):
    """Join the 2nd, 3rd and 4th '_'-separated parts of a column key into a file name."""
    parts = col_key.split('_')
    return '_'.join((parts[1], parts[2], parts[3]))


similarities['file_name'] = similarities['col_name'].apply(_file_name_of)

# Keep rows that belong to a selected table and are not on the ignore list
in_selected_tables = similarities['file_name'].isin(file_names)
not_ignored = ~similarities['col_name'].isin(ignore)
similarities = similarities[in_selected_tables & not_ignored]
similarities

### Select intra similarity columns

In [None]:
# Intra-table similarity: for every usably labelled column, collect those of its
# 10 most similar columns that sit in the SAME table but carry a DIFFERENT label.
# Result: column key -> list of similar column keys
intra_similarities = {}
for index, row in similarities.iterrows():
    col_name, class_name, table_name, ending = row['col_name'].split('_')
    file_name = '_'.join([class_name, table_name, ending])

    # If correct column: it must have a label, and the label must not be 'Wrong'.
    # Both conditions have to hold (`and`); joined with `or` the test is always true.
    label = type_lbls.get(row['col_name'])
    if label is None or label == 'Wrong':
        continue

    intra_similarities[row['col_name']] = []

    # Look at only its 10 most similar columns
    for col in row['similar_cols'].split('; ')[:10]:
        col_name2, class_name2, table_name2, ending2 = col.split('_')
        file_name_2 = '_'.join([class_name2, table_name2, ending2])

        # The similar column needs a usable label of its own as well
        other_label = type_lbls.get(col)
        if other_label is None or other_label == 'Wrong':
            continue

        # Not on the ignore list, same table, and in the selected tables
        if file_name_2 in file_names and file_name == file_name_2 and col not in ignore:
            if label != other_label:
                intra_similarities[row['col_name']].append(col)

In [None]:
intra_tables = {}   # table file name -> set of column keys that can be selected in it
intra_cols = set()  # every column key selected in this step
intra_class = {}    # schema.org type/class -> counter, bumped once per booked column occurrence
intra_labels = {}   # CTA label -> list of column keys carrying it (no duplicates)

for key, similar_cols in intra_similarities.items():
    # Columns without any similar column contribute nothing
    if not similar_cols:
        continue

    col_name, class_, table_name, ending = key.split('_')
    file_name = '_'.join([class_, table_name, ending])

    table_cols = intra_tables.setdefault(file_name, set())
    intra_class.setdefault(class_, 0)

    # Book the key column first, then each of its similar columns; all of them
    # are counted under the key's table and class. A column that shows up in
    # several lists increments intra_class each time.
    for col in [key] + similar_cols:
        label = type_lbls[col]

        label_cols = intra_labels.setdefault(label, [])
        if col not in label_cols:
            label_cols.append(col)

        intra_cols.add(col)
        table_cols.add(col)
        intra_class[class_] += 1
            

In [None]:
# Record all selected columns per table:
# table file name -> {column name: selection type}
selected_cols_tables = {
    table: {col.split('_')[0]: 'Intra similarity' for col in cols}
    for table, cols in intra_tables.items()
}

In [None]:
# Schema.org types whose intra-similarity counter is still below 1500
low_class = [cl for cl, n_cols in intra_class.items() if n_cols < 1500]

# CTA labels that have fewer than 100 example columns so far
low_label = [lb for lb, cols in intra_labels.items() if len(cols) < 100]

### Select inter table similarity columns

In [None]:
# Inter-table similarity: for every usably labelled column, collect those of its
# 10 most similar columns that live in a DIFFERENT selected table and carry a
# DIFFERENT label.
inter_similarities = {}
for index, row in similarities.iterrows():
    col_key = row['col_name']
    col_name, class_name, table_name, ending = col_key.split('_')
    file_name = '_'.join((class_name, table_name, ending))

    # Only columns with a usable label are considered
    if col_key not in type_lbls:
        continue
    label = type_lbls[col_key]
    if label is None or label == 'Wrong':
        continue

    candidates = []
    inter_similarities[col_key] = candidates

    # Look at the 10 most similar columns
    for col in row['similar_cols'].split('; ')[:10]:
        col_name2, class_name2, table_name2, ending2 = col.split('_')
        file_name_2 = '_'.join((class_name2, table_name2, ending2))

        if col not in type_lbls or file_name_2 not in file_names:
            continue
        if file_name == file_name_2 or col in ignore:
            continue

        other_label = type_lbls[col]
        if other_label != label and other_label is not None and other_label != 'Wrong':
            candidates.append(col)

In [None]:
# One group per column that has at least one similar column: [column, *similar columns]
inter_sims = [[col_key] + similar for col_key, similar in inter_similarities.items() if similar]

In [None]:
# Selecting inter similarities:
# per group, keep the columns whose class is in low_class and whose CTA label is
# in low_label, provided the intra-similarity phase has not selected them already.
select_test = []

for cols in inter_sims:
    picked = []

    for col in cols:
        col_name, class_name, table_name, ending = col.split('_')
        table = '_'.join([class_name, table_name, ending])

        # Table must be in a low class and the column in a low label
        if class_name not in low_class or type_lbls[col] not in low_label:
            continue

        # Skip columns already selected in the intra similarity phase
        if table not in selected_cols_tables or col_name not in selected_cols_tables[table]:
            picked.append(col)

    select_test.append(picked)


# Keep only the groups that still hold more than one column
s = [group for group in select_test if len(group) > 1]


# Select at most 3500 columns for each CTA label
sel_labels = {}

for sim in s:
    for col in sim:
        label_cols = sel_labels.setdefault(type_lbls[col], [])
        if len(label_cols) < 3500:
            label_cols.append(col)

sel_labels

In [None]:
# From every inter-similarity group keep the columns that made it into sel_labels
selected_test_2 = [
    [col for col in cols if col in sel_labels.get(type_lbls[col], ())]
    for cols in inter_sims
]

# Drop the groups that are left with fewer than two columns
s_2 = [group for group in selected_test_2 if len(group) > 1]

In [None]:
# Start from an independent copy of the intra-similarity selection and add the
# inter-similarity columns on top of it.
selected_cols = copy.deepcopy(selected_cols_tables)
for group in s_2:
    for col in group:
        parts = col.split('_')
        tab = '_'.join((parts[1], parts[2], parts[3]))
        selected_cols.setdefault(tab, {})[parts[0]] = 'Inter Similarity'
        

## 2. Select Missing values columns

In [None]:
#Choose some low density tables (overall table density below 70)
# .copy() makes this an independent frame, so columns added to it afterwards
# are not written into a slice of annotated_tables (SettingWithCopyWarning).
low_dens_70 = annotated_tables.loc[annotated_tables['overall_table_density'] < 70].copy()
low_dens_70

In [None]:
#How many low density columns can be annotated per table
def _count_low_density_cols(all_cols):
    """Count the columns of one table whose density lies strictly between 10 and 70.

    all_cols is the stringified {column: density} dict stored in the 'all_cols' cell.
    """
    densities = eval(all_cols)  # parsed once per table, not once per comparison
    return sum(1 for density in densities.values() if 10 < density < 70)


# assign() builds a new frame instead of writing a column into a filtered slice
low_dens_70 = low_dens_70.assign(low_cols=low_dens_70['all_cols'].apply(_count_low_density_cols))

In [None]:
# Total number of low density columns per class, over tables with at least 3 of them
has_three_low_cols = low_dens_70['low_cols'] >= 3
low_dens_70[has_three_low_cols].groupby(['class'])['low_cols'].sum()

In [None]:
# Limit Product, Recipe and Event to their first 1000 qualifying tables:
# everything from position 1000 onwards ends up in remove_tables.
qualifying = low_dens_70[low_dens_70['low_cols'] >= 3]
prods = qualifying[qualifying['class'] == 'Product'][1000:]
recipe = qualifying[qualifying['class'] == 'Recipe'][1000:]
event = qualifying[qualifying['class'] == 'Event'][1000:]

remove_tables = [
    file_name
    for overflow in (prods, recipe, event)
    for file_name in overflow['file_name'].tolist()
]

In [None]:
# Add the low density columns (density strictly between 10 and 70) of every
# low density table that is not in remove_tables to the overall selection.
for index, row in low_dens_70.iterrows():
    file_name = row['file_name']

    if file_name in remove_tables:
        continue

    # Every remaining table gets an entry, even when no column is added below
    table_selection = selected_cols.setdefault(file_name, {})

    # Take low density columns unless the corner cases already selected them
    annotated_cols = eval(row['all_cols'])
    for column, density in annotated_cols.items():
        if 10 < density < 70 and column not in table_selection:
            table_selection[column] = 'Missing values'


## 3. Select value format heterogeneity columns

In [None]:
# Select the same columns that were selected for CPA, except for some datetime columns
selection_cpa = pd.read_csv('output-data/cpa-datasets/selected_1_2.csv')

In [None]:
# Tables that have an entry in selected_cols so far, as an independent frame
# (the .copy() avoids writing a new column into a slice of annotated_tables).
selection = annotated_tables.loc[annotated_tables['file_name'].isin(selected_cols)].copy()
# Look up each row's {column: selection type} mapping by its file name;
# assigning the whole dict as the column does not pair entries with rows by file name.
selection['selected_cols'] = [selected_cols[name] for name in selection['file_name']]

In [None]:
# Take over the 'Value Heterogeneity' columns of the CPA selection, at most
# 2000 per CTA label, unless an earlier step already selected the column.
count_value = {}  # CTA label -> number of columns selected as 'Value Heterogeneity'

# Column names that are never taken over from the CPA selection
skip_cols = {'basesalary', 'estimatedsalary', 'nutrition:servingsize', 'size'}

for index, row in selection_cpa.iterrows():
    cpa_selected = eval(row['selected_cols'])  # parsed once per table
    for col in cpa_selected:
        col_file = col + '_' + row['file_name']

        # ignore and type_lbls are both keyed by '<column>_<file_name>', so the
        # full key has to be tested; the bare column name never matches ignore.
        if col in skip_cols or col_file in ignore or col_file not in type_lbls:
            continue
        if cpa_selected[col] != 'Value Heterogeneity':
            continue
        # Leave columns alone that an earlier step already selected
        if col in selected_cols.get(row['file_name'], {}):
            continue

        label = type_lbls[col_file]
        count_value.setdefault(label, 0)

        if count_value[label] < 2000:
            count_value[label] += 1
            selected_cols.setdefault(row['file_name'], {})[col] = 'Value Heterogeneity'

## 4. Select random columns for all labels

In [None]:
# Start the list of candidate CTA labels from the labels kept in cta_statistics
all_cta = cta_statistics['cta_label'].tolist()

In [None]:
# Tables with an entry in selected_cols, each with its {column: selection type}
# mapping and the number of columns selected so far.
# .copy() makes the frame independent, so the two new columns are not written
# into a slice of annotated_tables (SettingWithCopyWarning).
selection = annotated_tables.loc[annotated_tables['file_name'].isin(selected_cols)].copy()
sel_cols = []
for index, row in selection.iterrows():
    sel_cols.append(selected_cols[row['file_name']])
selection['selected_cols'] = sel_cols
selection['selected_cols_number'] = selection['selected_cols'].apply(len)

In [None]:
# One dict per selected table (row) for plain-Python iteration
selection_dict = selection.to_dict('records')

In [None]:
#Select from already selected tables all non-annotated columns with a limit of 6500 columns
count_class = {} #Count columns per Schema.org type/class
type_count = {} #Count columns per CTA label

for tab in selection_dict:
    table_file = tab['file_name']
    table_class = tab['class']
    sel = selected_cols[table_file]  # columns already selected in this table

    count_class.setdefault(table_class, 0)

    # The 6500 budget per Schema.org type/class is checked once per table, so
    # the last table that is let through may push the count past 6500.
    if count_class[table_class] >= 6500:
        continue

    for col in eval(tab['all_cols']):
        # Build the key from the table being processed, not from loop variables
        # left over from other cells.
        col_key = col + '_' + table_file

        #If column has not been yet selected, is not ignored and has a usable label:
        if col in sel or col_key in ignore or col_key not in type_lbls:
            continue

        count_class[table_class] += 1

        #Add to selected
        sel[col] = 'Random'

        label = type_lbls[col_key]
        type_count[label] = type_count.get(label, 0) + 1


#             elif col in type_lbls:
#                 if type_lbls[col+'_'+tab['file_name']] not in type_count:
#                     type_count[type_lbls[col+'_'+tab['file_name']]] = 0

#                 type_count[type_lbls[col+'_'+tab['file_name']]] += 1

In [None]:
# Union of the labels from the statistics file and the labels counted so far
all_cta = list(set(all_cta) | set(type_count))
#Mark only labels that have not yet reached at least 100 examples
# A new list is built here: calling remove() on a list while iterating over it
# skips the element that follows each removal.
all_cta = [c for c in all_cta if type_count.get(c, 0) < 100]

In [None]:
# For every Schema.org type that has fewer than 2000 columns so far, add columns
# from not-yet-selected tables with an overall density of at least 70.
count_class_2 = {}  # columns added here per Schema.org type; keeps the total within 6500

for cl in count_class:
    count_class_2[cl] = 0
    if count_class[cl] >= 2000:
        continue

    not_selected = ~annotated_tables['file_name'].isin(selected_cols)
    same_class = annotated_tables['class'] == cl
    dense_table = annotated_tables['overall_table_density'] >= 70
    d = annotated_tables[not_selected & same_class & dense_table]

    for index, row in d.iterrows():
        for col in eval(row['all_cols']):
            col_key = col + '_' + row['file_name']
            if col_key not in type_lbls or col_key in ignore:
                continue

            label = type_lbls[col_key]
            type_count.setdefault(label, 0)

            # At most 2000 columns per CTA label and 6500 per Schema.org type
            if type_count[label] < 2000 and count_class_2[cl] < (6500 - count_class[cl]):
                selected_cols.setdefault(row['file_name'], {})[col] = 'Random 2'
                count_class_2[cl] += 1
                type_count[label] += 1
        

In [None]:
#Select maximum 300 columns per CTA label
new_cols = {l: [] for l in all_cta}  # CTA label -> column keys added in this step

# Tables with the most columns are visited first
for index, row in annotated_tables.sort_values('column_count', ascending=False).iterrows():
    for col in eval(row['all_cols']):
        col_key = col + '_' + row['file_name']
        if col_key not in type_lbls or col_key in ignore:
            continue

        # Use the stripped label from type_lbls: it is the form the keys of
        # type_count (and so all_cta) use, and it avoids indexing the raw
        # type_labels dict with a column it may not contain.
        label = type_lbls[col_key]
        if label not in new_cols or len(new_cols[label]) >= 300:
            continue

        # Skip columns that an earlier step already selected
        if col in selected_cols.get(row['file_name'], {}):
            continue

        new_cols[label].append(col_key)

        #add to selected columns
        selected_cols.setdefault(row['file_name'], {})[col] = 'Random 3'
                

## Assemble all selected columns

In [None]:
# All tables with an entry in selected_cols, each paired with its final
# {column: selection type} mapping.
# .copy() makes the frame independent, so the new column is not written into a
# slice of annotated_tables (SettingWithCopyWarning).
selection = annotated_tables.loc[annotated_tables['file_name'].isin(selected_cols)].copy()
sel_cols = [selected_cols[file_name] for file_name in selection['file_name']]
selection['selected_cols'] = sel_cols
selection

In [None]:
# Flatten the selection into one record per selected column:
# [class, column, file name, relation label, CTA label, density, selection type]
res = []
for index, row in selection.iterrows():
    rel_labels = eval(row['rel_labels'])
    type_labels = eval(row['type_labels'])
    densities = eval(row['all_cols'])

    for cols, selection_type in selected_cols[row['file_name']].items():
        col_key = cols + '_' + row['file_name']

        # The CTA label is read from type_lbls, which holds no entry for columns
        # labelled None or 'Wrong'; skip those instead of raising KeyError.
        if cols not in rel_labels or cols not in type_labels or col_key not in type_lbls:
            continue

        res.append([
            row['class'],
            cols,
            row['file_name'],
            rel_labels[cols],
            type_lbls[col_key],
            densities[cols],
            selection_type,
        ])
        

In [None]:
# Turn the collected records into the dataset frame, one row per selected column
dataset = pd.DataFrame(res, columns=['class', 'column_name', 'file_name', 'relation_label', 'type_label', 'density', 'selection_type'])
dataset

In [None]:
# Which selection types occur in the dataset
dataset['selection_type'].unique()

In [None]:
# Keep only the columns with a density of at least 10
dense_enough = dataset['density'] >= 10
dataset = dataset[dense_enough]
dataset

In [None]:
# Drop the columns of every CTA label that has fewer than 50 examples
label_sizes = dataset.groupby(['type_label'])['column_name'].count()
no_rels = list(label_sizes[label_sizes < 50].index)
dataset = dataset[~dataset['type_label'].isin(no_rels)]

In [None]:
# Show the dataset after the filters applied so far
dataset

In [None]:
# Number of distinct CTA labels in the dataset
len(dataset['type_label'].unique())

In [None]:
# Number of distinct tables (file names) in the dataset
len(dataset['file_name'].unique())

In [None]:
# Collapse the detailed selection types into their reporting categories.
# The result is assigned back through assign(): an in-place replace on
# dataset['selection_type'] works on an intermediate object and is not
# guaranteed to change dataset itself.
selection_type_groups = {
    'Intra similarity': 'Corner Cases',
    'Inter Similarity': 'Corner Cases',
    'Random 2': 'Random',
    'Random 3': 'Random',
}
dataset = dataset.assign(selection_type=dataset['selection_type'].replace(selection_type_groups))

In [None]:
# Write the final CTA dataset to disk without the index column
dataset.to_csv('output-data/cta-datasets/dataset_cta.csv', index=False)