In [None]:
import os
import pandas as pd
import numpy as np
from collections import Counter
from skmultilearn.model_selection import iterative_train_test_split
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix
import multiprocessing

## Read the created dataset and attach column values

In [None]:
# Load the CTA dataset CSV and display it.
cta_dataset_path = 'output-data/cta-datasets/dataset_cta.csv'
dataset = pd.read_csv(cta_dataset_path)
dataset

In [None]:
# One "<column_name>|<file_name>" key per dataset row.
column_names = dataset['column_name']
file_names = dataset['file_name']
col_table = (column_names + '|' + file_names).tolist()

### Add value columns

In [None]:
# Existing English tables: one table file name per line of the text file.
# Reading inside a `with` block guarantees the file handle is closed,
# which the original bare `open(...)` never did.
with open("output-data/english_table_names.txt", 'r') as existing:
    existing_english_tables = [line.replace('\n', '') for line in existing.readlines()]

In [None]:
def get_values(col_table_name):
    """Return the values of one column of one table as a list.

    Parameters
    ----------
    col_table_name : str
        Key of the form ``"<column_name>|<file_name>"``.

    Returns
    -------
    list
        The values of the requested column (``Series.tolist()``).

    Raises
    ------
    ValueError
        If ``col_table_name`` contains no ``'|'`` separator.
    KeyError
        If the table has no column called ``column_name``.
    """
    # Split on the LAST separator only: the file name is the final component
    # of the key, so a column name that itself contains '|' no longer breaks
    # the two-way unpacking.
    column_name, file_name = col_table_name.rsplit('|', 1)

    # Tables listed in `existing_english_tables` live in one directory,
    # all other tables in another.
    if file_name in existing_english_tables:
        table_dir = 'output-data/expanded-tables/'
    else:
        table_dir = 'output-data/new-english-tables/'

    # Tables are read as gzip-compressed, line-delimited JSON.
    df = pd.read_json(table_dir + file_name, compression='gzip', lines=True)

    return df[column_name].tolist()

In [None]:
# Fetch the values of every (column, table) key in parallel with 25 workers.
# The context manager shuts the worker processes down even when a worker
# raises, so a failed run does not leave stray processes behind.
with multiprocessing.Pool(processes=25) as pool:
    values = pool.map(get_values, col_table)

In [None]:
# Attach the fetched column values as the 'values' column and display the frame.
dataset = dataset.assign(values=values)
dataset

## Create training, validation and test split

In [None]:
# One row per table: all of its column type labels joined with commas.
labels_per_table = dataset.groupby(['file_name'])['type_label'].apply(','.join)
grouped_by_table = labels_per_table.reset_index()
grouped_by_table

In [None]:
# Table file names as a 2-D (n_tables, 1) array: the inputs of the split.
X = grouped_by_table[["file_name"]].to_numpy()

### One-hot encoding of CTA labels

In [None]:
# Distinct CTA type labels found in the dataset, and how many there are.
all_labels = pd.unique(dataset['type_label'])
len(all_labels)

In [None]:
# Encoded labels: one row per table, one column per distinct label.
y = np.zeros((len(grouped_by_table), len(all_labels)))

In [None]:
# Fill the indicator matrix: y[table, label] is 1 when the table has at least
# one column carrying that label, otherwise 0.
for index, row in grouped_by_table.iterrows():
    labels_in_table = set(row['type_label'].split(','))
    for position, label in enumerate(all_labels):
        y[index, position] = 1 if label in labels_in_table else 0

In [None]:
# Table-level iterative stratified split with test_size = 0.2.
# Seed NumPy's global RNG first so the split, and therefore the CSV files
# exported from it, can be reproduced on a fresh kernel.
# NOTE(review): assumes the stratifier draws from np.random; confirm against
# the installed scikit-multilearn version.
np.random.seed(42)
X_train, y_train, X_test, y_test = iterative_train_test_split(X, y, test_size = 0.2)
print('Training set length: '+str(len(X_train)) +', Testing set length: '+ str(len(X_test)))

In [None]:
# Occurrences of each label in the training and testing parts of the split.
split_targets = {'train': y_train, 'test': y_test}
label_counts = {
    split_name: Counter(
        str(combination)
        for row in get_combination_wise_output_matrix(targets, order=1)
        for combination in row
    )
    for split_name, targets in split_targets.items()
}
pd.DataFrame(label_counts).T.fillna(0.0)

### Split testing set into validation and testing

In [None]:
# Split the held-out tables in half: one half for validation, one for testing.
# Seed NumPy's global RNG first so this split is repeatable for the same input.
# NOTE(review): assumes the stratifier draws from np.random; confirm against
# the installed scikit-multilearn version.
np.random.seed(42)
X_val, y_val, X_test, y_test = iterative_train_test_split(X_test, y_test, test_size = 0.5)
print('Validation set length: '+str(len(X_val)) +', Testing set length: '+ str(len(X_test)))

In [None]:
# Occurrences of each label in the validation and testing sets.
split_targets = {'val': y_val, 'test': y_test}
label_counts = {
    split_name: Counter(
        str(combination)
        for row in get_combination_wise_output_matrix(targets, order=1)
        for combination in row
    )
    for split_name, targets in split_targets.items()
}
pd.DataFrame(label_counts).T.fillna(0.0)

## Statistics for each set

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One dict per table row, keyed by column name.
grouped_by_table_dict = grouped_by_table.to_dict(orient='records')

In [None]:
# Map each table file name to its comma-joined type labels.
file_to_label = {
    record['file_name']: record['type_label']
    for record in grouped_by_table_dict
}

### Training set

In [None]:
# Tally, per label, how many labelled columns the training tables contain.
label_and_number_train = {}
for table_entry in X_train:
    table_name = table_entry[0]
    for label in file_to_label[table_name].split(','):
        label_and_number_train[label] = label_and_number_train.get(label, 0) + 1
print('Number of unique relation labels in training set: '+ str(len(label_and_number_train)))

In [None]:
# Display the raw per-label column counts of the training set.
label_and_number_train.values()

In [None]:
# Smallest, largest and summed per-label column counts of the training set.
train_counts = list(label_and_number_train.values())
train_min, train_max, train_total = min(train_counts), max(train_counts), sum(train_counts)
print('Minimum column count per label is: ' + str(train_min))
print('Maximum column count per label is: ' + str(train_max))
print('Total column count is: ' + str(train_total))

In [None]:
# Histogram of the per-label column counts in the training set.
fig, ax = plt.subplots(figsize=(15,5))
ax.hist(label_and_number_train.values(), bins=10)
ax.set_ylabel('Label Count')
ax.set_xlabel('Number of columns')

### Validation set

In [None]:
# Tally, per label, how many labelled columns the validation tables contain.
label_and_number_val = {}
for table_entry in X_val:
    table_name = table_entry[0]
    for label in file_to_label[table_name].split(','):
        label_and_number_val[label] = label_and_number_val.get(label, 0) + 1
print('Number of unique relation labels in validation set: '+ str(len(label_and_number_val)))

In [None]:
# Display the raw per-label column counts of the validation set.
label_and_number_val.values()

In [None]:
# Smallest, largest and summed per-label column counts of the validation set.
val_counts = list(label_and_number_val.values())
val_min, val_max, val_total = min(val_counts), max(val_counts), sum(val_counts)
print('Minimum column count per label is: ' + str(val_min))
print('Maximum column count per label is: ' + str(val_max))
print('Total column count is: ' + str(val_total))

In [None]:
# Histogram of the per-label column counts in the validation set.
fig, ax = plt.subplots(figsize=(15,5))
ax.hist(label_and_number_val.values(), bins=10)
ax.set_ylabel('Label Count')
ax.set_xlabel('Number of columns')

### Testing set

In [None]:
# Tally, per label, how many labelled columns the testing tables contain.
label_and_number_test = {}
for table_entry in X_test:
    table_name = table_entry[0]
    for label in file_to_label[table_name].split(','):
        label_and_number_test[label] = label_and_number_test.get(label, 0) + 1
print('Number of unique relation labels in testing set: '+ str(len(label_and_number_test)))

In [None]:
# Display the raw per-label column counts of the testing set.
label_and_number_test.values()

In [None]:
# Smallest, largest and summed per-label column counts of the testing set.
test_counts = list(label_and_number_test.values())
test_min, test_max, test_total = min(test_counts), max(test_counts), sum(test_counts)
print('Minimum column count per label is: ' + str(test_min))
print('Maximum column count per label is: ' + str(test_max))
print('Total column count is: ' + str(test_total))

In [None]:
# Histogram of the per-label column counts in the testing set.
fig, ax = plt.subplots(figsize=(15,5))
ax.hist(label_and_number_test.values(), bins=10)
ax.set_ylabel('Label Count')
ax.set_xlabel('Number of columns')

## Prepare csv file for each set

In [None]:
# Flatten each (n_tables, 1) split array into a plain list of table file names.
training_tables = X_train[:, 0].tolist()
validation_tables = X_val[:, 0].tolist()
testing_tables = X_test[:, 0].tolist()

In [None]:
# Select the dataset rows (columns of tables) that belong to each split.
in_training = dataset['file_name'].isin(training_tables)
in_validation = dataset['file_name'].isin(validation_tables)
in_testing = dataset['file_name'].isin(testing_tables)

training_set = dataset[in_training]
validation_set = dataset[in_validation]
testing_set = dataset[in_testing]


In [None]:
# Keep, in the training and validation sets, only the rows whose type label
# also occurs in the testing set.
alltypes = testing_set['type_label'].unique().tolist()
keep_in_training = training_set['type_label'].isin(alltypes)
keep_in_validation = validation_set['type_label'].isin(alltypes)
training_set = training_set[keep_in_training]
validation_set = validation_set[keep_in_validation]

In [None]:
#Manual corrections:
#Remove some types that do not have at least 10 examples in test set
#Remove tables with less than 10% density

# training_set = training_set.loc[~training_set['type_label'].isin(['Map', 'PublicationEvent', 'VideoObject', 'AggregateRating'])]
# training_set = training_set.loc[training_set['density'] >= 10 ]

# validation_set = validation_set.loc[~validation_set['type_label'].isin(['Map', 'PublicationEvent', 'VideoObject', 'AggregateRating'])]
# validation_set = validation_set.loc[validation_set['density'] >= 10 ]

# testing_set = testing_set.loc[~testing_set['type_label'].isin(['Map', 'PublicationEvent', 'VideoObject', 'AggregateRating'])]
# testing_set = testing_set.loc[testing_set['density'] >= 10 ]

In [None]:
# Write the CTA label vocabulary, one "<index>\t<label>" line per label.
# The labels come from `alltypes` (the labels present in the testing set, which
# the training and validation sets were filtered with); the original looped
# over `all_types`, a name that is never defined, and raised NameError.
# Mode 'w' instead of 'a': re-running the cell must rewrite the vocabulary,
# not append a second copy with indices restarting at 0.
with open('output-data/cta-datasets/type_vocab.txt', 'w') as file:
    for i, label in enumerate(alltypes):
        file.write(str(i)+ '\t' + label +'\n')

In [None]:
# Persist each split as a gzip-compressed CSV without the index column.
split_outputs = (
    (training_set, 'output-data/cta-datasets/training_set_cta.csv.gz'),
    (validation_set, 'output-data/cta-datasets/validation_set_cta.csv.gz'),
    (testing_set, 'output-data/cta-datasets/testing_set_cta.csv.gz'),
)
for split_frame, output_path in split_outputs:
    split_frame.to_csv(output_path, index=False, compression='gzip')

### Create small subset of training set

In [None]:
# Reload the exported training set from disk.
training_set_path = 'output-data/cta-datasets/training_set_cta.csv.gz'
training_set = pd.read_csv(training_set_path, compression='gzip')

In [None]:
# One row per training table: all of its column type labels joined with commas.
labels_per_table = training_set.groupby(['file_name'])['type_label'].apply(','.join)
grouped_by_table = labels_per_table.reset_index()
grouped_by_table

In [None]:
# Training table file names as a 2-D (n_tables, 1) array.
X_train = grouped_by_table[["file_name"]].to_numpy()

In [None]:
# Read the label vocabulary back: each line is "<index>\t<label>", keep the label.
# The `with` block closes the file handle, which the original never did.
with open("output-data/cta-datasets/type_vocab.txt", 'r') as types_file:
    all_labels = [line.replace('\n', '').split('\t')[1] for line in types_file.readlines()]

In [None]:
# Encoded labels: one row per training table, one column per vocabulary label.
y = np.zeros((len(grouped_by_table), len(all_labels)))

In [None]:
# Fill the indicator matrix: y[table, label] is 1 when the table has at least
# one column carrying that label, otherwise 0.
for index, row in grouped_by_table.iterrows():
    labels_in_table = set(row['type_label'].split(','))
    for position, label in enumerate(all_labels):
        y[index, position] = 1 if label in labels_in_table else 0

In [None]:
# Carve a stratified subset (test_size = 0.25) out of the training tables.
# The part returned in the "test" position (X_test) is the one used further
# down as the small training set; X_rest holds the remaining tables.
# Seed NumPy's global RNG first so the subset is repeatable for the same input.
# NOTE(review): assumes the stratifier draws from np.random; confirm against
# the installed scikit-multilearn version.
np.random.seed(42)
X_rest, y_rest, X_test, y_test = iterative_train_test_split(X_train, y, test_size = 0.25)
# Report the sizes of the two parts the split produced. The original printed
# len(X_train), the size of the whole input, and called the subset "Testing".
print('Remaining set length: '+str(len(X_rest)) +', Small training set length: '+ str(len(X_test)))

In [None]:
# Occurrences of each label in the two parts of the training-set split.
split_targets = {'train': y_rest, 'test': y_test}
label_counts = {
    split_name: Counter(
        str(combination)
        for row in get_combination_wise_output_matrix(targets, order=1)
        for combination in row
    )
    for split_name, targets in split_targets.items()
}
pd.DataFrame(label_counts).T.fillna(0.0)

In [None]:
# Rows of the training set whose table fell into the small subset.
training_tables_small = X_test[:, 0].tolist()
in_small_subset = training_set['file_name'].isin(training_tables_small)
training_set_small = training_set[in_small_subset]
training_set_small

In [None]:
# Persist the small training subset as a gzip-compressed CSV without the index.
small_training_path = 'output-data/cta-datasets/training_set_small_cta.csv.gz'
training_set_small.to_csv(small_training_path, index=False, compression='gzip')