In [1]:
%pip install sentencepiece pandas

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import os
import pandas as pd
import numpy as np

In [3]:
# All locations are resolved relative to the notebook's working directory:
# the project root is its parent folder.
PROJECT_ROOT = os.path.join(os.getcwd(), '..')

# Tab-separated input dataset.
DATA_FOLDER = os.path.join(PROJECT_ROOT, 'data')
DATASET_FILE = os.path.join(DATA_FOLDER, 'raw', 'filtered.tsv')

# Path prefix handed to the tokenizer trainer; the trained model is later
# loaded from MODEL_PREFIX + '.model'.
MODEL_FOLDER = os.path.join(PROJECT_ROOT, 'models')
MODEL_PREFIX = os.path.join(MODEL_FOLDER, 'tokenizer')

# Vocabulary size requested from the tokenizer trainer.
VOCAB_SIZE = 10000

# Data Preprocessing

This notebook contains the code for preprocessing the data: a tokenizer model is trained on the corpus, the sentences are encoded with it, and the dataset is split into train, validation, and test sets.

In [4]:
# Read the tab-separated dataset into a DataFrame.
data = pd.read_csv(DATASET_FILE, sep='\t')

# Tokenizer corpus: every 'reference' sentence followed by every
# 'translation' sentence.
corpus = data['reference'].to_list()
corpus.extend(data['translation'].to_list())

"Sentences number: {}".format(len(corpus))

'Sentences number: 1155554'

In [5]:
from sentencepiece import SentencePieceTrainer

# Plain-text file the tokenizer is trained on: all corpus sentences joined by
# newlines.
corpus_path = os.path.join(DATA_FOLDER, 'interim', 'corpus.txt')

# Create the output folders up front so that neither the corpus dump nor the
# trainer fails when they do not exist yet.
os.makedirs(os.path.dirname(corpus_path), exist_ok=True)
os.makedirs(MODEL_FOLDER, exist_ok=True)

# Write explicitly as UTF-8: open() otherwise falls back to the locale's
# default encoding, while SentencePiece reads its input as UTF-8.
with open(corpus_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(corpus))

# Keyword arguments instead of one '--flag=value ...' string: the string form
# is split on spaces, so a path containing whitespace would be mis-parsed.
SentencePieceTrainer.train(
    input=corpus_path,
    model_prefix=MODEL_PREFIX,
    vocab_size=VOCAB_SIZE,
)

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=/shared/detoxification/notebooks/../data/interim/corpus.txt --model_prefix=/shared/detoxification/notebooks/../models/tokenizer --vocab_size=10000
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: /shared/detoxification/notebooks/../data/interim/corpus.txt
  input_format: 
  model_prefix: /shared/detoxification/notebooks/../models/tokenizer
  model_type: UNIGRAM
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_out

## Load Model

In [6]:
from sentencepiece import SentencePieceProcessor

# Model file: MODEL_PREFIX with the '.model' suffix appended.
model_path = MODEL_PREFIX + '.model'

sp = SentencePieceProcessor()
sp.load(model_path)

True

## Transform Dataset

In [7]:

def encode(s):
    """Encode one sentence into a list of token ids framed by BOS and EOS.

    Uses the module-level tokenizer ``sp``.

    Args:
        s: Sentence to encode.

    Returns:
        ``[bos_id, *token_ids, eos_id]`` as a Python list.
    """
    token_ids = [sp.bos_id()]
    token_ids.extend(sp.encode_as_ids(s))
    token_ids.append(sp.eos_id())
    return token_ids

# Replace both text columns by their token-id sequences.
for text_column in ('reference', 'translation'):
    data[text_column] = data[text_column].apply(encode)

In [8]:
# Preview the first rows; 'reference' and 'translation' now hold token-id
# lists instead of text.
data.head()

Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"[1, 154, 1229, 1304, 199, 24, 5760, 64, 49, 28...","[1, 44, 1229, 1304, 199, 5760, 6, 49, 28, 49, ...",0.785171,0.010309,0.014195,0.981983
1,1,"[1, 324, 8, 5, 23, 418, 706, 3, 2]","[1, 8, 5, 23, 3730, 632, 3, 2]",0.749687,0.071429,0.065473,0.999039
2,2,"[1, 257, 4, 37, 107, 2714, 22, 160, 4, 32, 65,...","[1, 219, 4, 37, 55, 2714, 22, 160, 3, 2]",0.919051,0.268293,0.213313,0.985068
3,3,"[1, 1722, 18, 4959, 4, 8, 5, 85, 94, 11, 3585,...","[1, 812, 4, 8, 33, 11, 1264, 47, 3, 2]",0.664333,0.309524,0.053362,0.994215
4,4,"[1, 9, 5, 85, 94, 1757, 11, 149, 49, 129, 3, 2]","[1, 9, 33, 1757, 11, 79, 49, 3, 2]",0.726639,0.181818,0.009402,0.999348


In [9]:
# Rows whose 'translation' side is the more toxic one get their two sides
# swapped, so that afterwards 'reference' always carries the higher (or equal)
# toxicity score. The swap is done column-wise with np.where instead of a
# Python loop over every row.
ref_is_less_toxic = (data['ref_tox'] < data['trn_tox']).to_numpy()

flipped_data = data.copy()
for first, second in (('reference', 'translation'), ('ref_tox', 'trn_tox')):
    first_values = data[first].to_numpy()
    second_values = data[second].to_numpy()
    # Where the mask is set take the value from the opposite column,
    # otherwise keep the row's own value.
    flipped_data[first] = np.where(ref_is_less_toxic, second_values, first_values)
    flipped_data[second] = np.where(ref_is_less_toxic, first_values, second_values)

# Number of swapped rows.
flips = int(ref_is_less_toxic.sum())

"Flips: {}".format(flips)

'Flips: 258635'

In [10]:
# Sanity check: no row may have a translation more toxic than its reference.
assert (flipped_data['ref_tox'] >= flipped_data['trn_tox']).all()

## Split Dataset

In [11]:
# One uint16 array of token ids per sentence.
reference_array = [
    np.array(token_ids, dtype=np.uint16)
    for token_ids in flipped_data['reference'].to_numpy()
]
translation_array = [
    np.array(token_ids, dtype=np.uint16)
    for token_ids in flipped_data['translation'].to_numpy()
]

# Toxicity scores as float32 vectors.
ref_tox = flipped_data['ref_tox'].astype(np.float32).to_numpy()
trn_tox = flipped_data['trn_tox'].astype(np.float32).to_numpy()

# All four containers must describe the same number of samples.
sample_counts = {len(reference_array), len(translation_array), ref_tox.shape[0], trn_tox.shape[0]}
assert len(sample_counts) == 1

In [12]:
# Fix the seed of NumPy's global random generator for reproducibility.
np.random.seed(1409)

In [13]:
# Share of all samples used for training.
TRAIN_FRACTION = 0.8
# Share of the held-out remainder that becomes the test set; the rest of the
# remainder is the validation set.
TEST_FRACTION = 0.5

# A dedicated, seeded generator makes this cell reproducible on its own:
# re-running it always yields the same split, no matter how much of NumPy's
# global random state was consumed beforehand. RandomState(1409) produces the
# same stream as np.random.seed(1409) followed by np.random.* calls.
split_rng = np.random.RandomState(1409)

n_samples = len(reference_array)

train_indices = split_rng.choice(n_samples, int(n_samples * TRAIN_FRACTION), replace=False)
# Everything not drawn for training is held out ...
holdout_indices = np.setdiff1d(np.arange(n_samples), train_indices)
# ... and divided between test and validation.
test_indices = split_rng.choice(holdout_indices, int(len(holdout_indices) * TEST_FRACTION), replace=False)
val_indices = np.setdiff1d(holdout_indices, test_indices)

# The three index sets must add up to the full dataset.
assert len(train_indices) + len(val_indices) + len(test_indices) == n_samples

## Save Dataset

In [14]:
import pickle


def save_split(name, indices):
    """Pickle one dataset split to ``<DATA_FOLDER>/interim/<name>.pkl``.

    The file holds a dict with the keys 'reference' and 'translation'
    (lists of token-id arrays) and 'ref_tox' and 'trn_tox' (score arrays),
    each restricted to the samples selected by ``indices``.

    Args:
        name: Split name, used as the file stem (e.g. 'train').
        indices: Integer positions of the samples belonging to the split.
    """
    payload = {
        'reference': [reference_array[i] for i in indices],
        'translation': [translation_array[i] for i in indices],
        'ref_tox': ref_tox[indices],
        'trn_tox': trn_tox[indices],
    }
    with open(os.path.join(DATA_FOLDER, 'interim', name + '.pkl'), 'wb') as f:
        pickle.dump(payload, f)


# Write train.pkl, val.pkl and test.pkl.
for split_name, split_indices in (
    ('train', train_indices),
    ('val', val_indices),
    ('test', test_indices),
):
    save_split(split_name, split_indices)
    

In [15]:
# (message template, file) pairs; sizes are reported in MB (bytes / 1024 / 1024).
size_report = (
    ('Size of .tsv file: {:.2f} MB', DATASET_FILE),
    ('Size of train.pkl file: {:.2f} MB', os.path.join(DATA_FOLDER, 'interim', 'train.pkl')),
    ('Size of val.pkl file: {:.2f} MB', os.path.join(DATA_FOLDER, 'interim', 'val.pkl')),
    ('Size of test.pkl file: {:.2f} MB', os.path.join(DATA_FOLDER, 'interim', 'test.pkl')),
)
for template, file_path in size_report:
    print(template.format(os.path.getsize(file_path) / 1024 / 1024))

Size of .tsv file: 103.27 MB
Size of train.pkl file: 58.37 MB
Size of val.pkl file: 7.28 MB
Size of test.pkl file: 7.30 MB


## Post Check

In [16]:
# Reload the three splits from disk to verify what was actually written.
interim_dir = os.path.join(DATA_FOLDER, 'interim')
train = pd.read_pickle(os.path.join(interim_dir, 'train.pkl'))
val = pd.read_pickle(os.path.join(interim_dir, 'val.pkl'))
test = pd.read_pickle(os.path.join(interim_dir, 'test.pkl'))

# Within each split all four fields must have the same number of entries.
for split in (train, val, test):
    assert len(split['reference']) == len(split['translation']) == split['ref_tox'].shape[0] == split['trn_tox'].shape[0]

# Report the number of samples per split.
for template, split in (
    ('Train size: {}', train),
    ('Val size: {}', val),
    ('Test size: {}', test),
):
    print(template.format(len(split['reference'])))

Train size: 462221
Val size: 57778
Test size: 57778
