In [7]:
import pandas as pd
import numpy as np
import fasttext
import os

In [8]:
# Import all the functions defined in the functions.ipynb file
%run functions.ipynb

## Basic Execution
This first section allows you to choose the parameters for the experiment, train a text classifier and a random forest and output a prediction for a testing dataset. You can view the performance and the exact predictions made.

Step 1: Choose the parameters of your experiment. This is the only block you need to modify.

In [15]:
# Experiment configuration. Every value below is read by the later steps of this notebook.

# Main metadata file and its labels file. Both are loaded with pd.read_csv in Step 2
metadata_file = 'Datasets/public_dataset.csv'
labels_file = 'Datasets/labels_public_dataset.csv'

# Set specify_test_data = 0 to perform a training/test split on the metadata_file and labels_file
# Set specify_test_data = 1 to use the metadata_file and labels_file as training data and metadata_file_test and labels_file_test as test data
# Only the values 0 and 1 are handled by Step 2
specify_test_data = 0

# ONLY NECESSARY IF specify_test_data = 1:
# NOTE: these default to the same files as the training data above. Point them at a
# separate dataset, otherwise the model is evaluated on the data it was trained on
metadata_file_test = 'Datasets/public_dataset.csv'
labels_file_test = 'Datasets/labels_public_dataset.csv'

# ONLY NECESSARY IF specify_test_data = 0: How to perform the training/test split
# split_method = 0: by specified percentage. The size of the training data will be len(data)*perc_training
# split_method = 1: by number of cases. The size of the training data will be num_trainingsamples
# All three values are passed to train_test_split in Step 2
split_method = 1
perc_training = 0.9
num_trainingsamples = 800

# Threshold from which a prediction will be considered a predicted positive
# e.g. if decision_threshold = 0.3, all predictions 0.3 or higher will be assigned the '1' label (personal data)
# This is only used to calculate the precision, recall and f1 scores
decision_threshold = 0.5

# -----------------------------------------------------
# Choose which features you want to use to train the model. Comment out the lines you don't want
# The model in the paper includes all of these
# Both the training and the test dataframes are reduced to exactly these columns in Step 4
columns_to_use = [
    'NUM_ROWS',
    'AVG_ROW_LEN',
    'AVG_COL_LEN',
    'distinct_ratio',
    'perc_null',
    'is_id',
    'is_number',
    'is_character',
    'is_date',
    'is_large_blob',
    'embedding_pred' # Prediction made by the text classifier; Step 3 only runs when this entry is kept
]

# -----------------------------------------------------
# Choose which type of model you want to train. The default one, used to obtain the results in the paper, is 1 (Random Forest)
# The number is passed to train_model in Step 4

model_type = 1
# 1 - Random Forest
# 2 - Logistic Regression
# 3 - Gaussian Naive Bayes
# 4 - Ada Boost
# 5 - Gradient Boosting
# 6 - KNN classifier

# -----------------------------------------------------
# Choose the name of the file where the results will be output
# The path is handed to evaluate_prediction in Step 4
results_file = 'Results/results.csv'

Step 2: Read the data, create a train/test split and enrich it

In [16]:
def _load_nonempty_columns(metadata_path, labels_path):
    """Read one metadata/labels CSV pair and drop the rows without entries.

    Rows of the metadata whose NUM_DISTINCT is not greater than 0 are removed,
    and the labels are restricted to the row index that survives, so both
    frames keep describing the same rows.

    Returns:
        (metadata, labels) as two pandas DataFrames.
    """
    metadata = pd.read_csv(metadata_path)
    column_labels = pd.read_csv(labels_path)
    # OPTIONAL, but used for the results in the paper:
    # only analyze columns for which there were some entries
    metadata = metadata[metadata['NUM_DISTINCT'] > 0]
    # Make sure the data and labels match (the two files are paired by row index)
    column_labels = column_labels[column_labels.index.isin(metadata.index)]
    return metadata, column_labels


data, labels = _load_nonempty_columns(metadata_file, labels_file)

if specify_test_data == 0:
    # Split the single dataset into a training part and a test part
    data_train, data_test, labels_train, labels_test, msk = train_test_split(
        data, labels, split_method, perc_training, num_trainingsamples
    )
elif specify_test_data == 1:
    # The main files are the training set; the *_test files are the test set
    data_train, labels_train = data, labels
    data_test, labels_test = _load_nonempty_columns(metadata_file_test, labels_file_test)
else:
    # Fail here with a clear message instead of a NameError further down
    raise ValueError(
        f"specify_test_data must be 0 or 1, got {specify_test_data!r}"
    )

# Add the derived features to both sets (see enrich in functions.ipynb)
data_train = enrich(data_train)
data_test = enrich(data_test)

Step 3: Train a text classifier and add a column to the dataframes with its prediction

In [17]:
# The text classifier is only trained when its prediction is one of the selected features
if 'embedding_pred' in columns_to_use:
    # Fit on the training labels only; 'COLUMN_NAME' presumably names the text field used (confirm in functions.ipynb)
    embed_model = train_embedding_model(labels_train, 'COLUMN_NAME')

    # Add the classifier's prediction to the training and the test dataframes
    data_train = add_embedding_to_df(data_train, embed_model, 'COLUMN_NAME')
    data_test = add_embedding_to_df(data_test, embed_model, 'COLUMN_NAME')

Step 4: Remove the table and column names, and train the machine learning model. Look at the "Results" folder for more details.

In [18]:
# Keep the table/column names of the test set before they are dropped below;
# they are passed to evaluate_prediction together with the predictions
test_columns = labels_test[['TABLE_NAME', 'COLUMN_NAME']]

# Reduce both feature frames to the features chosen in the configuration
data_train = data_train[columns_to_use]
data_test = data_test[columns_to_use]

# Remove the identifying name columns from both label frames
labels_train = labels_train.drop(columns=['TABLE_NAME', 'COLUMN_NAME'])
labels_test = labels_test.drop(columns=['TABLE_NAME', 'COLUMN_NAME'])

In [19]:
# Train the model selected by model_type on the training set.
# NOTE(review): the meaning of the last argument (100) is defined by train_model in functions.ipynb - confirm there
ml_model = train_model(data_train, labels_train, model_type, 100)

# Score the test set and compute the importance table for the training features
predictions = predict(ml_model, data_test)
importances = calc_importances_df(ml_model, data_train)

# Compare the predictions with the test labels; results_file is passed on as the output path
precision, recall, f1, auc = evaluate_prediction(
    test_columns,
    predictions,
    labels_test,
    decision_threshold,
    results_file,
)
print('ROC AUC:', auc)

ROC AUC: 0.817772952853598


## Advanced options
In this section, steps 2, 3 and 4 have been merged into a single function called full_run (see functions.ipynb). It can be used to perform a sweep in order to assess the effect of a certain variable on the performance.

Even for the same sample size, different samples can yield different performances. In order to obtain a more reliable metric, the function samples num_tests times and trains a model each time, returning the mean precision, recall, f1 and ROC AUC.

In this example, a sweep is performed similar to those done in the paper. Models are trained with different training set sizes, and the results are exported to a .csv file.
In the paper, num_tests was 100. For a faster execution, reduce this number:

In [None]:
# Sweep over the absolute size of the training set.
# The sweep uses its own variable names so that the experiment configuration
# (split_method, num_trainingsamples) is left untouched for the other cells.
num_tests = 10  # number of repetitions full_run performs for each setting
sweep_split_method = 1  # 1 = size the training set by number of cases

# One row per setting; the DataFrame is built once after the loop
sweep_rows = []
for sweep_num_trainingsamples in [100, 200, 300, 400, 500, 600, 700, 800, 900]:
    precision, recall, f1, auc = full_run(
        specify_test_data,
        sweep_split_method,
        perc_training,
        sweep_num_trainingsamples,
        columns_to_use,
        model_type,
        results_file,
        num_tests,
    )
    sweep_rows.append([sweep_num_trainingsamples, precision, recall, f1, auc])

results_df = pd.DataFrame(
    sweep_rows,
    columns=['Training Samples', 'Precision', 'Recall', 'F1', 'AUC'],
)
results_df.to_csv('Results/sweep_training_samples.csv')

This example is similar to the previous one, but the training set size is determined as a percentage of the total set.

In [4]:
# Sweep over the size of the training set, given as a fraction of the whole dataset.
# The sweep uses its own variable names so that the experiment configuration
# (split_method, perc_training) is left untouched for the other cells.
num_tests = 10  # number of repetitions full_run performs for each setting
sweep_split_method = 0  # 0 = size the training set by percentage

# One row per setting; the DataFrame is built once after the loop
sweep_rows = []
for sweep_perc_training in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    precision, recall, f1, auc = full_run(
        specify_test_data,
        sweep_split_method,
        sweep_perc_training,
        num_trainingsamples,
        columns_to_use,
        model_type,
        results_file,
        num_tests,
    )
    sweep_rows.append([sweep_perc_training, precision, recall, f1, auc])

results_df = pd.DataFrame(
    sweep_rows,
    columns=['Percentage Training Set', 'Precision', 'Recall', 'F1', 'AUC'],
)
results_df.to_csv('Results/sweep_percentage_training_samples.csv')

In this third example, the training set size is fixed, and different machine learning models from the scikit-learn package are tested:

In [14]:
# Sweep over the model type with a fixed training set size.
# The sweep uses its own variable names so that the experiment configuration
# (split_method, num_trainingsamples, model_type) is left untouched for the other cells.
num_tests = 10  # number of repetitions full_run performs for each setting
sweep_split_method = 1  # 1 = size the training set by number of cases
sweep_num_trainingsamples = 800

# Model numbers as listed in the configuration cell:
# 1 Random Forest, 2 Logistic Regression, 3 Gaussian Naive Bayes,
# 4 Ada Boost, 5 Gradient Boosting, 6 KNN classifier
sweep_rows = []
for sweep_model_type in [1, 2, 3, 4, 5, 6]:
    precision, recall, f1, auc = full_run(
        specify_test_data,
        sweep_split_method,
        perc_training,
        sweep_num_trainingsamples,
        columns_to_use,
        sweep_model_type,
        results_file,
        num_tests,
    )
    sweep_rows.append([sweep_model_type, precision, recall, f1, auc])

results_df = pd.DataFrame(
    sweep_rows,
    columns=['Model Type', 'Precision', 'Recall', 'F1', 'AUC'],
)
# Written to its own file so the output of the percentage sweep is not overwritten
results_df.to_csv('Results/sweep_model_type.csv')