In [None]:
import pandas as pd
import copy

In [None]:
# Per-label CPA statistics, restricted to labels whose 'column_count' is at least 50.
cpa_statistics = (
    pd.read_csv('output-data/statistics/cpa_statistics.csv')
    .loc[lambda stats: stats['column_count'] >= 50]
)
cpa_statistics

In [None]:
# All retained CPA labels except 'name' (its first occurrence is dropped;
# a ValueError is raised if 'name' is not among the labels).
cpa_labels = cpa_statistics['cpa_label'].tolist()
del cpa_labels[cpa_labels.index('name')]

In [None]:
# Label definitions, restricted to the CPA labels that survived the statistics filter.
kept_cpa_labels = cpa_statistics['cpa_label'].tolist()
rels = (
    pd.read_csv('data/Final CTA and CPA Labels.csv')
    .loc[lambda labels: labels['CPA label'].isin(kept_cpa_labels)]
)
rels

In [None]:
# One row per annotated table; later cells read its 'file_name', 'rel_labels',
# 'all_cols', 'class' and 'overall_table_density' columns.
annotations_path = 'output-data/statistics/expanded_tables_annotations_cpa.csv'
annotated_tables = pd.read_csv(annotations_path)
annotated_tables

In [None]:
# Map every annotated column name to its CPA (relation) label, merged over all tables.
# When the same column name occurs in several tables, the label from the last row wins.
#
# NOTE(review): each 'rel_labels' cell is parsed with eval(), which executes arbitrary
# code read from the CSV. If the cells only ever contain plain dict literals,
# ast.literal_eval would be the safe replacement -- confirm before switching.
rel_lbls = {}
for labels_repr in annotated_tables['rel_labels']:
    # Parse the dict once per table instead of once per column.
    rel_lbls.update(eval(labels_repr))

# Drop 'name' so that it is skipped by the later `in rel_lbls` checks; tolerate
# annotation files in which no column is called 'name'.
rel_lbls.pop('name', None)

In [None]:
file_names = set(annotated_tables['file_name'].tolist())

# Columns to ignore in selection: those with a density of less than 70%.
# Entries are '<column>_<file_name>' strings, the same form used for column ids in
# the similarity files.
#
# NOTE(review): 'all_cols' cells are parsed with eval(), which executes arbitrary code
# read from the CSV; ast.literal_eval would be safer if the cells are plain dict
# literals -- confirm before switching.
tables_to_dict = annotated_tables.to_dict('records')
ignore = set()
for table in tables_to_dict:
    # Parse the column -> density dict once per table rather than twice per column.
    densities = eval(table['all_cols'])
    for col, density in densities.items():
        if density < 70:
            ignore.add(col + '_' + table['file_name'])

## 1. Corner Cases Columns Selection

In [None]:
def _read_similarity_part(path):
    """Read one gzip-compressed similarity CSV into a DataFrame."""
    return pd.read_csv(path, compression='gzip')


def _table_of(col_id):
    """Return the '<class>_<table>_<ending>' part of a '<column>_<class>_<table>_<ending>' id."""
    parts = col_id.split('_')
    return '_'.join((parts[1], parts[2], parts[3]))


# Similarity results are stored in two parts each for numeric, datetime and text columns.
num_sim = _read_similarity_part('output-data/similarities/subset8000_num_sim.csv.gz')
num_sim2 = _read_similarity_part('output-data/similarities/subset7000_num_sim.csv.gz')
date_sim = _read_similarity_part('output-data/similarities/subset6000_datetime_sim.csv.gz')
date_sim2 = _read_similarity_part('output-data/similarities/subset6000_2_datetime_sim.csv.gz')
text_sim = _read_similarity_part('output-data/similarities/subset240000_textcols_sim.csv.gz')
text_sim2 = _read_similarity_part('output-data/similarities/subset239000_textcols_sim.csv.gz')

similarity_parts = [num_sim, num_sim2, date_sim, date_sim2, text_sim, text_sim2]
similarities = pd.concat(similarity_parts, ignore_index=True)

# Keep only rows whose column belongs to one of the annotated tables.
similarities['file_name'] = similarities['col_name'].apply(_table_of)
similarities = similarities.loc[similarities['file_name'].isin(file_names)]
# The density-based filter on the source column is left disabled:
#similarities = similarities.loc[~similarities['col_name'].isin(ignore)]
similarities

In [None]:
#Create a file to hold already selected columns from tables and their selection type
# Selection type: 'Missing values', 'Intra similarity', 'Inter similarity/dissimilarity', 'format heterogeneity'

### Select intra similarity columns

In [None]:
# Column -> its similar columns that live in the SAME table but carry a DIFFERENT CPA label.
# Column ids have the form '<column>_<class>_<table>_<ending>'.
intra_similarities = {}
for full_name, similar_raw in zip(similarities['col_name'], similarities['similar_cols']):
    col_name, class_name, table_name, ending = full_name.split('_')
    file_name = '_'.join([class_name, table_name, ending])

    # Only columns that have a CPA label are of interest.
    if col_name not in rel_lbls:
        continue

    # An empty 'similar_cols' cell is read by pandas as NaN (a float), which has no
    # .split(); treat it as "no similar columns" instead of crashing. The previously
    # sketched fallback to 'not_similar_cols' is intentionally not reinstated.
    if pd.isnull(similar_raw):
        similar_cols = []
    else:
        # Only the first 10 listed columns are considered.
        similar_cols = similar_raw.split('; ')[:10]

    label = rel_lbls[col_name]  # loop-invariant for the candidates below
    matches = []
    for col in similar_cols:
        col_name2, class_name2, table_name2, ending2 = col.split('_')
        file_name_2 = '_'.join([class_name2, table_name2, ending2])

        # Candidate must be labelled differently, sit in the same (selected) table,
        # and have a density of at least 70% (i.e. not be in `ignore`).
        if (col_name2 in rel_lbls
                and rel_lbls[col_name2] != label
                and file_name_2 in file_names
                and file_name == file_name_2
                and col not in ignore):
            matches.append(col)

    intra_similarities[full_name] = matches

In [None]:
intra_tables = {}  # Mark which columns can be selected in each table
intra_cols = set()  # Which columns selected overall
intra_class = {}  # How many distinct columns per schema.org type/class
intra_labels = {}  # CPA label -> distinct selected columns, in first-seen order

for key, similar in intra_similarities.items():
    # Columns without any intra-table similar column are not selected.
    if not similar:
        continue

    col_name, class_, table_name, ending = key.split('_')
    file_name = '_'.join([class_, table_name, ending])

    table_cols = intra_tables.setdefault(file_name, set())
    intra_class.setdefault(class_, 0)

    # The source column and each of its similar columns are selected alike. The similar
    # columns were filtered to the same table, hence they share `class_` and `file_name`.
    for col in [key] + similar:
        label = rel_lbls[col.split('_')[0]]
        label_cols = intra_labels.setdefault(label, [])

        # A column is usually seen several times (as a source and as somebody else's
        # similar column). Count it only the first time, so that the per-class counter
        # agrees with the de-duplicated `intra_cols` / `intra_labels` instead of being
        # inflated by repeats. Because a column id maps to exactly one label, membership
        # in `intra_cols` doubles as an O(1) "already in intra_labels[label]" check.
        if col not in intra_cols:
            intra_cols.add(col)
            label_cols.append(col)
            intra_class[class_] += 1

        table_cols.add(col)
            

In [None]:
# Selection registry: table file name -> {column name: selection type}.
# Every column picked so far comes from the intra-table similarity phase.
selected_cols_tables = {
    table: {col.split('_')[0]: 'Intra similarity' for col in table_cols}
    for table, table_cols in intra_tables.items()
}

In [None]:
# Schema.org types that have fewer than 1500 selected columns so far.
# NOTE(review): only types/labels that already appear in intra_class / intra_labels are
# examined, so a type or label with zero intra-similarity columns is never listed here --
# confirm that this is intended.
low_class = [class_ for class_, count in intra_class.items() if count < 1500]

# CPA labels that have fewer than 100 selected example columns so far.
low_label = [label for label, cols in intra_labels.items() if len(cols) < 100]

### Select inter table similarity columns

In [None]:
# Column -> its similar columns that live in a DIFFERENT table and carry a DIFFERENT CPA label.
# Column ids have the form '<column>_<class>_<table>_<ending>'.
inter_similarities = {}
for full_name, similar_raw in zip(similarities['col_name'], similarities['similar_cols']):
    col_name, class_name, table_name, ending = full_name.split('_')
    file_name = '_'.join([class_name, table_name, ending])

    # Only columns that have a CPA label are of interest.
    if col_name not in rel_lbls:
        continue

    # An empty 'similar_cols' cell is read by pandas as NaN (a float), which has no
    # .split(); treat it as "no similar columns" instead of crashing. The previously
    # sketched fallback to 'not_similar_cols' is intentionally not reinstated.
    if pd.isnull(similar_raw):
        similar_cols = []
    else:
        # Only the first 10 listed columns are considered.
        similar_cols = similar_raw.split('; ')[:10]

    label = rel_lbls[col_name]  # loop-invariant for the candidates below
    matches = []
    for col in similar_cols:
        col_name2, class_name2, table_name2, ending2 = col.split('_')
        file_name_2 = '_'.join([class_name2, table_name2, ending2])

        # Candidate must be labelled differently, come from another selected table,
        # and have a density of at least 70% (i.e. not be in `ignore`).
        if (col_name2 in rel_lbls
                and rel_lbls[col_name2] != label
                and file_name_2 in file_names
                and file_name != file_name_2
                and col not in ignore):
            matches.append(col)

    inter_similarities[full_name] = matches

In [None]:
# One group per source column that has at least one inter-table match:
# [source column, match 1, match 2, ...].
inter_sims = [
    [source, *matches]
    for source, matches in inter_similarities.items()
    if matches
]

In [None]:
# Selecting inter similarities:
# from every inter-table similarity group keep only the columns whose schema.org class is
# still low on selected columns AND whose CPA label is still low on examples, and that
# were not already picked in the intra similarity phase.
select_test = []
for cols in inter_sims:
    candidates = []
    for col in cols:
        col_name, class_name, table_name, ending = col.split('_')
        table = '_'.join([class_name, table_name, ending])

        # Only under-represented class/label combinations are of interest.
        if class_name not in low_class or rel_lbls[col_name] not in low_label:
            continue

        # Skip columns already selected for this table in the intra similarity phase
        # (a table without any selection yet cannot contain the column).
        if col_name not in selected_cols_tables.get(table, {}):
            candidates.append(col)
    select_test.append(candidates)

# A group is only useful if at least two of its columns survived the filter.
s = [group for group in select_test if len(group) > 1]

# Select at most 3500 distinct columns for each CPA label.
# The same column can occur in many groups; without de-duplication its repeats would use
# up the label's quota, leaving fewer than 3500 distinct columns. A column id maps to
# exactly one label, so a single set of already-taken columns is sufficient.
sel_labels = {}
taken_cols = set()
for sim in s:
    for col in sim:
        label = rel_lbls[col.split('_')[0]]
        label_cols = sel_labels.setdefault(label, [])

        if col not in taken_cols and len(label_cols) < 3500:
            label_cols.append(col)
            taken_cols.add(col)

In [None]:
# Select columns which are included in sel_labels.
# The per-label lists are turned into sets once, so each membership test is O(1) instead
# of a linear scan over a list of up to 3500 columns.
cols_by_label = {label: set(cols) for label, cols in sel_labels.items()}

selected_test_2 = [
    [
        col for col in cols
        if col in cols_by_label.get(rel_lbls[col.split('_')[0]], ())
    ]
    for cols in inter_sims
]

# Filter out groups in which fewer than two columns remain (nothing left to be similar to).
s_2 = [group for group in selected_test_2 if len(group) > 1]

In [None]:
# Extend a deep copy of the intra-phase registry with the inter-table selections,
# leaving selected_cols_tables itself untouched.
selected_cols = copy.deepcopy(selected_cols_tables)

for group in s_2:
    for col in group:
        # Column ids have the form '<column>_<class>_<table>_<ending>'.
        parts = col.split('_')
        tab = '_'.join((parts[1], parts[2], parts[3]))

        selected_cols.setdefault(tab, {})[parts[0]] = 'Inter Similarity'
        

## 2. Select Missing values columns

In [None]:
# Tables whose overall density is below 70%.
# An explicit copy is taken because a 'low_cols' column is added to this frame later;
# assigning into a filtered slice of annotated_tables triggers pandas'
# SettingWithCopyWarning.
low_dens_70 = annotated_tables.loc[annotated_tables['overall_table_density'] < 70].copy()
low_dens_70

In [None]:
def _count_low_density_cols(all_cols_repr):
    """Count the columns of one table whose density lies strictly between 10 and 70.

    `all_cols_repr` is the text of a dict mapping column name -> density.
    """
    # NOTE(review): eval() executes arbitrary code read from the CSV; ast.literal_eval
    # would be safer if the cells are plain dict literals -- confirm before switching.
    densities = eval(all_cols_repr)  # parsed once per table instead of up to 2n+1 times
    return sum(1 for density in densities.values() if 10 < density < 70)


# .assign returns a new frame, so nothing is written into a slice of annotated_tables
# (no SettingWithCopyWarning) and re-running the cell is harmless.
low_dens_70 = low_dens_70.assign(
    low_cols=low_dens_70['all_cols'].apply(_count_low_density_cols)
)

In [None]:
# Number of low-density tables that have at least three low-density columns.
low_dens_70.loc[low_dens_70['low_cols'] >= 3, 'file_name'].count()

In [None]:
# Per class: number of tables that have at least three columns to be annotated.
has_enough_low_cols = low_dens_70['low_cols'] >= 3
low_dens_70[has_enough_low_cols].groupby(['class'])['low_cols'].count()

In [None]:
# Total number of low-density columns over the tables that have at least three of them.
low_dens_70.loc[low_dens_70['low_cols'] >= 3, 'low_cols'].sum()

In [None]:
# Cap the Product, Recipe and Event tables: within each of these classes only the first
# 1000 qualifying tables (in frame order) are kept; every later one is marked for removal.
# NOTE(review): the previous comment here spoke of limiting Product tables to 1500, while
# the code slices at 1000 for all three classes -- confirm which number is intended.
qualifying_tables = low_dens_70.loc[low_dens_70['low_cols'] >= 3]

prods = qualifying_tables.loc[qualifying_tables['class'] == 'Product'][1000:]
recipe = qualifying_tables.loc[qualifying_tables['class'] == 'Recipe'][1000:]
event = qualifying_tables.loc[qualifying_tables['class'] == 'Event'][1000:]

remove_tables = []
for overflow in (prods, recipe, event):
    remove_tables.extend(overflow['file_name'].tolist())

In [None]:
# From the low-density tables that have at least 3 low-density columns, select the labelled
# columns whose density lies strictly between 10 and 70 and add them to the registry of
# selected columns as 'Missing values'. Columns already selected in an earlier phase keep
# their selection type. (A per-label cap via a `count_missing_labels` counter, limit 1000,
# was sketched here earlier and is not active.)
candidate_tables = low_dens_70.loc[low_dens_70['low_cols'] >= 3]

for idx, row in candidate_tables.iterrows():
    file_name = row['file_name']

    # Tables beyond the per-class cap are skipped entirely.
    if file_name in remove_tables:
        continue

    # An entry is created even if no column ends up being selected for this table.
    table_selection = selected_cols.setdefault(file_name, {})

    # NOTE(review): eval() executes arbitrary code read from the CSV -- confirm the
    # 'all_cols' cells are trusted.
    annotated_cols = eval(row['all_cols'])

    for column, density in annotated_cols.items():
        if column not in rel_lbls:
            continue
        if 10 < density < 70 and column not in table_selection:
            table_selection[column] = 'Missing values'

In [None]:
# Drop tables for which no column was selected. The keys are collected first because a
# dict must not change size while it is being iterated.
remove_tabs = [tab for tab, cols in selected_cols.items() if not cols]
for tab in remove_tabs:
    del selected_cols[tab]

In [None]:
# Save selected columns to file: one row per table that has at least one selected column,
# with the {column: selection type} dict stored in 'selected_cols'.
# An explicit copy is taken because adding a column to a filtered slice of
# annotated_tables triggers pandas' SettingWithCopyWarning.
selection = annotated_tables.loc[
    annotated_tables['file_name'].isin(list(selected_cols))
].copy()
selection['selected_cols'] = [selected_cols[name] for name in selection['file_name']]
selection.to_csv('output-data/cpa-datasets/selected_1.csv', index=False)