In [1]:
#modules here are updated everytime I run a cell, except those excluded by %aimport
%load_ext autoreload
%autoreload 2

import data_transformation_funcs as dt
import data_specific_funcs as ds

%aimport random 

In the next cell you provide the original data to be transformed, specifically:

### mandatory information: 

- **original_train_list**: a python list of train sequences, each sequence is a python list of strings
- **original_test_list**: same format as original_train_list but with test sequences
- **original_test_labels**: a python list of the test labels, each test label is a string
- **task** : a string that describes the task learned from the dataset, can be either "lm" for language model or "classif" for classification (binary or multiple, both are accepted)
- **categorical**: a boolean that is True if the labels of the dataset are categorical, False if they are numerical.

### conditional information:
if the task on the dataset is "classif", you will also have to provide:

- **original_train_labels** : a python list of the train labels, each label is a string

### optional information:
These lists are optional. If you do not provide them, the function will automatically carve them out of the
training set while respecting the valid_ratio property.

- **original_valid_list**: a python list of validation/dev sequences, each sequence is a python list of strings
- **original_valid_labels**: only if task="classif", a python list of the validation/dev labels, each label is a string

## Data for language modeling task with non categorical data and no provided validation set

In [2]:
# Folder holding the PAutomaC files and the id of the problem to load.
data_path = 'PAutomaC-competition_sets'
paut_id = '1'

# All three input files share the "<folder>/<id>" prefix.
paut_prefix = '{}/{}'.format(data_path, paut_id)

# Read the raw train, test and solution files as text.
with open(paut_prefix + '.pautomac.train', 'r') as f:
    original_train_text = f.read()
with open(paut_prefix + '.pautomac.test', 'r') as f:
    original_test_text = f.read()
with open(paut_prefix + '.pautomac_solution.txt', 'r') as f:
    original_solution_text = f.read()

# Parse the raw text: sequences become lists of strings, labels become strings.
original_train_list = ds.extract_examples_pautomac(original_train_text)
original_test_list = ds.extract_examples_pautomac(original_test_text)
original_test_labels = ds.extract_labels_pautomac(original_solution_text)

# Dataset description passed on to the transformation functions.
task_paut = "lm"  # either "lm" (language model) or "classif" (classification)
categorical_paut = False  # pautomac labels are scores, not categorical values

# Sanity check: number of training sequences and the first one.
print(len(original_train_list), original_train_list[0])


20000 ['5', '4', '1', '1', '5', '3', '4', '7', '4', '7', '5', '0']


In [3]:
# Fix the seed so the random mapping is the same on every run.
# Note: in a notebook the seed must be set in the same cell as the call that
# uses random, otherwise re-running this cell alone gives a different mapping.
random.seed(4242)

# Generate the transformed lists. No validation set is passed, so one is
# carved out of the training set. `dataset_key` is what allows decoding the
# transformed data back to the original.
train_list, valid_list, test_list, test_labels, dataset_key = dt.transform_lists(
    original_train_list,
    original_test_list,
    original_test_labels,
    categorical=categorical_paut,
    task=task_paut,
)

# Size and first element of each transformed split.
for split in (train_list, valid_list, test_list):
    print(len(split), split[0])
print(len(test_labels), test_labels[0])


18180 [8, 7, 3, 0, 1, 9]
1818 [8, 3, 2, 1, 7, 5, 6, 3, 0, 7, 5, 1, 7, 4, 3, 3, 4, 3, 5, 3, 3, 2, 4, 3, 5, 1, 3, 6, 6, 3, 9]
1000 [8, 1, 0, 5, 7, 2, 7, 5, 2, 4, 9]
1000 7.43792566036e-09


In [4]:
# Write the transformed lists as txt documents in the pautomac format.
# If no target folder is given, the datasets are written to the current directory.

# This number determines the prefix of the output files; file naming also
# follows the pautomac standard.
tashill_id = 1

paut_export_options = dict(
    data_id=tashill_id,
    categorical=categorical_paut,
    target_path="tashill_sets",
    task=task_paut,
)
dt.make_competition_sets(train_list, valid_list, test_list, test_labels, **paut_export_options)

Notice we generated an object called "dataset_key" in the cells before. This object serves to decode and retrieve the original dataset. It should not be given to participants.

In the next cell, we will use it to recover the original dataset. This function is not perfectly bijective because we carved out a validation set from the training set, and we also got rid of a few examples to allow for a perfect valid_ratio.

However, if you look at the generated file in the "PAutomaC-competition_sets" folder, you will notice that we perfectly recover the first 18180 training sequences, for example.

In [5]:
# Decode the transformed splits back to the original symbols using the key.
recovered_paut = dt.reverse_transform(
    train_list,
    valid_list,
    test_list,
    test_labels,
    dataset_key,
    categorical=categorical_paut,
    task=task_paut,
)
recovered_train_list, recovered_valid_list, recovered_list, recovered_test_labels = recovered_paut

# Sanity check: number of recovered training sequences and the first one.
print(len(recovered_train_list), recovered_train_list[0])

# Save the recovered training set next to the original input files.
recovered_train_path = '{}/{}.recovered_pautomac.train'.format(data_path, paut_id)
with open(recovered_train_path, 'w') as f:
    f.write(dt.generate_data_text(recovered_train_list))

18180 ['5', '4', '1', '1', '5', '3', '4', '7', '4', '7', '5', '0']


## Data for binary classification task with categorical data and a provided validation set

In [3]:
# Folder and file prefix of the classification dataset.
# (An earlier variant of this cell read the 'HeinzData' folder instead, with
#  id '16.16.LT.4.1.5' and files named <id>_Train.txt, <id>_Dev.txt, <id>_TestSR.txt.)
heinz_data_path = 'ENE_sets'
heinz_data_id = 'ENE'

# The train / valid / test files share the "<folder>/<id>" prefix.
heinz_prefix = heinz_data_path + '/' + heinz_data_id

# Read the three splits as raw text.
with open(heinz_prefix + '.train', 'r') as f:
    train_text = f.read()
with open(heinz_prefix + '.valid', 'r') as f:
    valid_text = f.read()
with open(heinz_prefix + '.test', 'r') as f:
    testSR_text = f.read()

# Parse each split into a list of sequences (lists of strings) and a list of
# labels (strings).
original_train_list, original_train_label = ds.extract_HeinzData(train_text)
original_valid_list, original_valid_label = ds.extract_HeinzData(valid_text)
original_test_list, original_test_label = ds.extract_HeinzData(testSR_text)

# Dataset description passed on to the transformation functions.
task_heinz = "classif"
categorical_heinz = True

# Sanity check: training set size, first sequence and first ten labels.
print(
    len(original_train_list), original_train_list[0], "\n",
    len(original_train_label), original_train_label[:10], "\n"
)


511538 ['M', 'A', 'D', 'P', 'S', 'L', 'Y', 'T', 'Y', 'P', 'S', 'P', 'L', 'Q', 'G', 'Y', 'E', 'N', 'L', 'A', 'P', 'L', 'G', 'T', 'E', 'V', 'S', 'P', 'D', 'G', 'K', 'S', 'L', 'L', 'N', 'P', 'E', 'T', 'G', 'I', 'K', 'S', 'K', 'S', 'Y', 'E', 'K', 'F', 'T', 'E', 'P', 'L', 'D', 'S', 'G', 'I', 'R', 'G', 'A', 'F', 'D', 'V', 'H', 'I', 'Y', 'H', 'F', 'Q', 'K', 'N', 'K', 'E', 'Q', 'A', 'K', 'F', 'A', 'R', 'E', 'L', 'W', 'E', 'R', 'I', 'R', 'R', 'E', 'F', 'P', 'E', 'L', 'R', 'I', 'Y', 'R', 'F', 'W', 'E', 'E', 'P', 'I', 'G', 'P', 'H', 'P', 'V', 'A', 'M', 'F', 'E', 'V', 'N', 'L', 'F', 'T', 'P', 'E', 'Q', 'F', 'G', 'A', 'F', 'I', 'P', 'W', 'L', 'V', 'I', 'N', 'R', 'G', 'P', 'L', 'S', 'A', 'L', 'V', 'H', 'P', 'N', 'T', 'V', 'D', 'E', 'K', 'G', 'E', 'L', 'L', 'D', 'E', 'E', 'R', 'D', 'H', 'T', 'Q', 'R', 'A', 'I', 'W', 'M', 'G', 'E', 'Q', 'L', 'P', 'L', 'D', 'L', 'S', 'L', 'V', 'K', 'R', 'L', 'K', 'Q', 'Q', 'K', 'A', 'A', 'H'] 
 511538 ['FALSE', 'FALSE', 'FALSE', 'FALSE', 'FALSE', 'FALSE', 'FALSE', 'FAL

In [11]:

# Print the vocabulary and its size for each split, in the order
# validation, train, test. After the loop `vocab` / `vocab_size` hold the
# values for the test set.
splits_to_inspect = (
    ("validation", original_valid_list),
    ("train", original_train_list),
    ("test", original_test_list),
)
for split_name, split_sequences in splits_to_inspect:
    vocab = dt.generate_vocab(split_sequences)
    vocab_size = len(vocab)
    print(split_name + " set: \n", vocab, vocab_size)


validation set: 
 ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y'] 23
train set: 
 ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] 25
test set: 
 ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] 24


In [12]:
# Fix the seed so the random mapping is the same on every run.
# Note: in a notebook the seed must be set in the same cell as the call that
# uses random, otherwise re-running this cell alone gives a different mapping.
random.seed(4242)

# Generate the transformed lists. For a classification task the train labels
# are required; here the validation set and its labels are provided as well.
transformed_heinz = dt.transform_lists(
    original_train_list,
    original_test_list,
    original_test_label,
    valid_list=original_valid_list,
    train_labels=original_train_label,
    valid_labels=original_valid_label,
    categorical=categorical_heinz,
    task=task_heinz,
)
(train_list, train_labels,
 valid_list, valid_labels,
 test_list, test_labels,
 dataset_key) = transformed_heinz

# Size and first sequence of each split, then the first ten test labels.
for split in (train_list, valid_list, test_list):
    print(len(split), split[0])
print(len(test_labels), test_labels[:10])


47750 [25, 24, 1, 12, 9, 12, 23, 22, 9, 15, 7, 7, 1, 11, 12, 10, 15, 12, 19, 21, 5, 5, 10, 15, 8, 23, 20, 10, 7, 22, 3, 19, 5, 19, 13, 12, 21, 22, 7, 19, 15, 12, 20, 22, 12, 15, 22, 23, 22, 12, 13, 23, 7, 5, 13, 8, 24, 15, 22, 23, 12, 5, 15, 13, 16, 1, 20, 10, 10, 15, 15, 10, 10, 23, 1, 23, 5, 13, 22, 23, 12, 12, 1, 8, 5, 13, 22, 8, 14, 16, 7, 12, 3, 7, 3, 13, 14, 22, 19, 23, 3, 14, 0, 10, 12, 19, 1, 19, 11, 9, 1, 13, 20, 13, 8, 5, 23, 7, 13, 22, 7, 11, 15, 21, 3, 9, 24, 20, 3, 13, 19, 21, 1, 1, 9, 26]
4775 [25, 24, 9, 20, 14, 22, 13, 13, 13, 13, 13, 22, 20, 10, 13, 1, 13, 22, 13, 13, 11, 13, 1, 11, 10, 1, 14, 10, 13, 11, 11, 19, 11, 7, 24, 3, 12, 7, 5, 19, 5, 13, 19, 20, 7, 23, 21, 15, 10, 12, 19, 10, 14, 5, 15, 3, 23, 7, 20, 15, 22, 1, 10, 19, 19, 13, 19, 10, 22, 22, 0, 12, 13, 9, 7, 11, 10, 22, 11, 1, 7, 23, 5, 14, 0, 10, 12, 1, 15, 3, 21, 20, 13, 22, 1, 1, 10, 15, 9, 3, 12, 20, 13, 22, 1, 22, 3, 8, 12, 22, 10, 22, 13, 23, 10, 20, 3, 10, 14, 10, 12, 5, 20, 14, 8, 0, 21, 13, 10, 10, 

In [13]:
# Show the "word_mapping" entry of the dataset key. The key is what decodes
# the transformed data, so it should not be given to participants.
word_mapping = dataset_key["word_mapping"]
print(word_mapping)

{'A': 13, 'B': 4, 'C': 0, 'D': 19, 'E': 12, 'F': 20, 'G': 10, 'H': 8, 'I': 5, 'K': 15, 'L': 22, 'M': 24, 'N': 7, 'O': 2, 'P': 11, 'Q': 9, 'R': 21, 'S': 1, 'T': 14, 'U': 17, 'V': 23, 'W': 16, 'X': 6, 'Y': 3, 'Z': 18}


In [14]:
# Write the transformed lists as txt documents in the pautomac format.
# If no target folder is given, the datasets are written to the current directory.
tashill_id = 3

heinz_export_options = dict(
    train_labels=train_labels,
    valid_labels=valid_labels,
    data_id=tashill_id,
    categorical=categorical_heinz,
    target_path="tashill_sets",
    task=task_heinz,
)
dt.make_competition_sets(train_list, valid_list, test_list, test_labels, **heinz_export_options)



We recover the original sequences as Python lists of strings.

In [25]:
# Decode every transformed split (sequences and labels) back to the original
# symbols using the dataset key.
recovered_heinz = dt.reverse_transform(
    train_list,
    valid_list,
    test_list,
    test_labels,
    dataset_key,
    train_labels=train_labels,
    valid_labels=valid_labels,
    categorical=categorical_heinz,
    task=task_heinz,
)
recovered_train_list, recovered_train_labels = recovered_heinz[0], recovered_heinz[1]
recovered_valid_list, recovered_valid_labels = recovered_heinz[2], recovered_heinz[3]
recovered_test_list, recovered_test_labels = recovered_heinz[4:]


We write these recovered sequences (in pautomac format) and their labels to the folder given by `heinz_data_path` ("ENE_sets" in this notebook).

In [26]:
# Sanity check: size and first recovered training sequence, then the first
# ten recovered training labels.
print(len(recovered_train_list), recovered_train_list[0])
print(len(recovered_train_labels), recovered_train_labels[:10])

# Save the recovered training sequences and labels next to the input files.
with open(heinz_data_path+'/'+heinz_data_id+'.recovered_heinz.train', 'w') as f:
    f.write(dt.generate_data_text(recovered_train_list))
with open(heinz_data_path+'/'+heinz_data_id+'.recovered_heinz.labels', 'w') as f:
    # Use the dataset's own flag rather than a hard-coded True, so this stays
    # consistent with the transform / reverse-transform calls if the dataset
    # (and therefore `categorical_heinz`) is changed.
    f.write(dt.generate_label_text(recovered_train_labels, categorical=categorical_heinz))

47750 ['M', 'A', 'D', 'P', 'S', 'L', 'Y', 'T', 'Y', 'P', 'S', 'P', 'L', 'Q', 'G', 'Y', 'E', 'N', 'L', 'A', 'P', 'L', 'G', 'T', 'E', 'V', 'S', 'P', 'D', 'G', 'K', 'S', 'L', 'L', 'N', 'P', 'E', 'T', 'G', 'I', 'K', 'S', 'K', 'S', 'Y', 'E', 'K', 'F', 'T', 'E', 'P', 'L', 'D', 'S', 'G', 'I', 'R', 'G', 'A', 'F', 'D', 'V', 'H', 'I', 'Y', 'H', 'F', 'Q', 'K', 'N', 'K', 'E', 'Q', 'A', 'K', 'F', 'A', 'R', 'E', 'L', 'W', 'E', 'R', 'I', 'R', 'R', 'E', 'F', 'P', 'E', 'L', 'R', 'I', 'Y', 'R', 'F', 'W', 'E', 'E', 'P', 'I', 'G', 'P', 'H', 'P', 'V', 'A', 'M', 'F', 'E', 'V', 'N', 'L', 'F', 'T', 'P', 'E', 'Q', 'F', 'G', 'A', 'F', 'I', 'P', 'W', 'L', 'V', 'I', 'N', 'R', 'G', 'P', 'L', 'S', 'A', 'L', 'V', 'H', 'P', 'N', 'T', 'V', 'D', 'E', 'K', 'G', 'E', 'L', 'L', 'D', 'E', 'E', 'R', 'D', 'H', 'T', 'Q', 'R', 'A', 'I', 'W', 'M', 'G', 'E', 'Q', 'L', 'P', 'L', 'D', 'L', 'S', 'L', 'V', 'K', 'R', 'L', 'K', 'Q', 'Q', 'K', 'A', 'A', 'H']
47750 ['FALSE', 'FALSE', 'FALSE', 'FALSE', 'FALSE', 'FALSE', 'FALSE', 'FALSE',