In [1]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8


In [2]:
import pickle
import json
import os
import math
import unidecode
import tensorflow as tf
import pandas as pd
import numpy as np
from datetime import datetime

from collections import Counter
from math import ceil
from sklearn.model_selection import train_test_split

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
rutaDatos = "/content/drive/MyDrive/ProyectoRanking/parsinginstitoa/Datos"
iteration_save_path = "/content/drive/MyDrive/ProyectoRanking/parsinginstitoa/002_Model/institutional_affiliation_classification"
ruta = "/content/drive/MyDrive/ProyectoRanking/parsinginstitoa/002_Model"

In [5]:
# HuggingFace library to train a tokenizer
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordPiece
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer

### Combining the training data from 001 notebook and artificial data

In [6]:
# All training samples that have less than 50 different version of the affiliation text
# ---- Created in previous notebook
lower_than = pd.read_parquet(f"{iteration_save_path}/lower_than_50.parquet")

# All training samples that have more than 50 different version of the affiliation text
# ---- Created in previous notebook
more_than = pd.read_parquet("/content/drive/MyDrive/ProyectoRanking/parsinginstitoa/002_Model/institutional_affiliation_classification/more_than_50.parquet/part-00000-2af188cd-c760-4ae1-aa01-4a638a0d436a-c000.snappy.parquet")

print(lower_than.shape)
print(more_than.shape)

(7950, 2)
(12656, 2)


In [7]:
full_affs_data = pd.concat([more_than, lower_than],
                           axis=0).reset_index(drop=True)

In [8]:
full_affs_data.to_parquet(f"{ruta}/full_affs_data.parquet")

In [9]:
full_affs_data.shape

(20606, 2)

In [10]:
full_affs_data['text_len'] = full_affs_data['original_affiliation'].apply(len)

In [11]:
full_affs_data = full_affs_data[full_affs_data['text_len'] < 500][['original_affiliation','affiliation_id']].copy()

In [12]:
full_affs_data.shape

(20606, 2)

In [13]:
full_affs_data['affiliation_id'] = full_affs_data['affiliation_id'].astype('str')

### Processing and splitting the data

In [14]:
full_affs_data['processed_text'] = full_affs_data['original_affiliation'].apply(unidecode.unidecode)

In [15]:
train_data, val_data = train_test_split(full_affs_data, train_size=0.7, random_state=1)  #0.985
train_data = train_data.reset_index(drop=True).copy()
val_data = val_data.reset_index(drop=True).copy()

In [16]:
train_data.shape

(14424, 3)

In [17]:
val_data.shape

(6182, 3)

In [18]:
affs_list_train = train_data['processed_text'].tolist()
affs_list_val = val_data['processed_text'].tolist()

In [19]:
try:
    os.system(f"rm {ruta}/aff_text.txt")
    print("Done")
except:
    pass

Done


In [20]:
affs_list_train[:5]

['Institute of Economic and Social Development Buenos Aires Buenos Aires F.D.',
 'Hospital Pedro de Elizalde Dept. of Ophthalmology Buenos Aires F.D.',
 'Centro Regional Universitario Cordoba Argentina',
 'UTDT Department of Foreign Languages and Literatures, Buenos Aires, Buenos Aires F.D.',
 'Fundacion Huesped, Buenos Aires, Buenos Aires F.D., Argentina']

In [21]:
# save the affiliation text that will be used to train a tokenizer
with open(f"{ruta}/aff_text.txt", "w") as f:
    for aff in affs_list_train:
        f.write(f"{aff}\n")

In [22]:
try:
    os.system(f"rm {ruta}/basic_model_tokenizer")
    print("Done")
except:
    pass

Done


In [23]:
full_affs_data[['processed_text','affiliation_id']].to_parquet(f"{ruta}/full_affs_data_processed.parquet")

### Creating the tokenizer for the basic model

In [24]:
wordpiece_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

# NFD Unicode, lowercase, and getting rid of accents (to make sure text is as readable as possible)
wordpiece_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

# Splitting on whitespace
wordpiece_tokenizer.pre_tokenizer = Whitespace()

# Training a tokenizer on the training dataset
trainer = WordPieceTrainer(vocab_size=3816, special_tokens=["[UNK]"])
files = [f"{ruta}/aff_text.txt"]
wordpiece_tokenizer.train(files, trainer)

wordpiece_tokenizer.save(f"{ruta}/basic_model_tokenizer")

### Further processing of data with tokenizer

In [25]:
def max_len_and_pad(tok_sent):
    """
    Truncates sequences with length higher than max_len and also pads the sequence
    with zeroes up to the max_len.
    """
    max_len = 128
    tok_sent = tok_sent[:max_len]
    tok_sent = tok_sent + [0]*(max_len - len(tok_sent))
    return tok_sent

def create_affiliation_vocab(x):
    """
    Checks if affiliation is in vocab and if not, adds to the vocab.
    """
    if x not in affiliation_vocab.keys():
        affiliation_vocab[x]=len(affiliation_vocab)
    return [affiliation_vocab[x]]

In [26]:
# initializing an empty affiliation vocab
affiliation_vocab = {}

# tokenizing the training dataset
tokenized_output = []
for i in affs_list_train:
    tokenized_output.append(wordpiece_tokenizer.encode(i).ids)

train_data['original_affiliation_tok'] = tokenized_output

In [27]:
# tokenizing the validation dataset
tokenized_output = []
for i in affs_list_val:
    tokenized_output.append(wordpiece_tokenizer.encode(i).ids)

val_data['original_affiliation_tok'] = tokenized_output

In [28]:
# applying max length cutoff and padding
train_data['original_affiliation_model_input'] = train_data['original_affiliation_tok'].apply(max_len_and_pad)
val_data['original_affiliation_model_input'] = val_data['original_affiliation_tok'].apply(max_len_and_pad)

In [29]:
# creating the label affiliation vocab
train_data['label'] = train_data['affiliation_id'].apply(lambda x: create_affiliation_vocab(x))

In [30]:
len(affiliation_vocab)

331

In [31]:
val_data['label'] = val_data['affiliation_id'].apply(lambda x: [affiliation_vocab.get(x)])

In [32]:
print(train_data.shape)
print(val_data.shape)

(14424, 6)
(6182, 6)


In [33]:
train_data.to_parquet(f"{ruta}/train_data.parquet")
val_data.to_parquet(f"{ruta}/val_data.parquet")

In [34]:
# saving the affiliation vocab
with open(f"{ruta}/affiliation_vocab.pkl","wb") as f:
    pickle.dump(affiliation_vocab, f)

### Creating TFRecords from the training and validation datasets

In [35]:
train_data = pd.read_parquet(f"{ruta}/train_data.parquet")

In [36]:
val_data = pd.read_parquet(f"{ruta}/val_data.parquet")

In [37]:
print(train_data.shape)
val_data.shape

(14424, 6)


(6182, 6)

In [38]:
# saving the affiliation vocab
with open(f"{ruta}/affiliation_vocab.pkl","rb") as f:
    affiliation_vocab = pickle.load(f)

In [39]:
def create_tfrecords_dataset(data, iter_num, dataset_type='train'):
    """
    Creates a TF Dataset that can then be saved to a file to make it faster to read
    data during training and allow for transferring of data between compute instances.
    """
    ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(data['original_affiliation_model_input'].to_list()),
                              tf.data.Dataset.from_tensor_slices(data['label'].to_list())))

    serialized_features_dataset = ds.map(tf_serialize_example)

    filename = f"{ruta}/training_data/{dataset_type}/{str(iter_num).zfill(4)}.tfrecord"
    writer = tf.data.experimental.TFRecordWriter(filename)
    writer.write(serialized_features_dataset)

In [40]:
def tf_serialize_example(f0, f1):
    """
    Serialization function.
    """
    tf_string = tf.py_function(serialize_example, (f0, f1), tf.string)
    return tf.reshape(tf_string, ())

In [41]:
def serialize_example(features, label):
    """
    Takes in features and outputs them to a serialized string that can be written to
    a file using the TFRecord Writer.
    """
    features_list = tf.train.Int64List(value=features.numpy().tolist())
    label_list = tf.train.Int64List(value=label.numpy().tolist())

    features_feature = tf.train.Feature(int64_list = features_list)
    label_feature = tf.train.Feature(int64_list = label_list)

    features_for_example = {
        'features': features_feature,
        'label': label_feature
    }

    example_proto = tf.train.Example(features=tf.train.Features(feature=features_for_example))

    return example_proto.SerializeToString()

In [42]:
# Making sure data is in the correct format before going into TFRecord
train_data['original_affiliation_model_input'] = train_data['original_affiliation_model_input'] \
.apply(lambda x: np.asarray(x, dtype=np.int64))

val_data['original_affiliation_model_input'] = val_data['original_affiliation_model_input'] \
.apply(lambda x: np.asarray(x, dtype=np.int64))

In [43]:
os.system(f"mkdir -p {ruta}/training_data/train/")
os.system(f"mkdir -p {ruta}/training_data/val/")
print("Done")

Done


#### Creating the Train Dataset

In [44]:
train_data.shape
train_data.head(1)

Unnamed: 0,original_affiliation,affiliation_id,processed_text,original_affiliation_tok,original_affiliation_model_input,label
0,Institute of Economic and Social Development B...,https://openalex.org/I4210131459,Institute of Economic and Social Development B...,"[247, 75, 1348, 129, 328, 1355, 83, 84, 83, 84...","[247, 75, 1348, 129, 328, 1355, 83, 84, 83, 84...",[0]


In [45]:
%%time
for i in range(ceil(train_data.shape[0]/1000)): #500.000
    print(i)
    low = i*1000
    high = (i+1)*1000
    create_tfrecords_dataset(train_data.iloc[low:high,:], i, 'train')

0


Instructions for updating:
To write TFRecords to disk, use `tf.io.TFRecordWriter`. To save and load the contents of a dataset, use `tf.data.experimental.save` and `tf.data.experimental.load`


1
2
3
4
5
6
7
8
9
10
11
12
13
14
CPU times: user 7.25 s, sys: 669 ms, total: 7.92 s
Wall time: 14.7 s


#### Creating the Validation Dataset

In [46]:
%%time
for i in range(ceil(val_data.shape[0]/3000)):
    print(i)
    low = i*3000
    high = (i+1)*3000
    create_tfrecords_dataset(val_data.iloc[low:high,:], i, 'val')

0
1
2
CPU times: user 2.9 s, sys: 277 ms, total: 3.18 s
Wall time: 3.64 s


### Loading the Data

In [47]:
def _parse_function(example_proto):
    """
    Parses the TFRecord file.
    """
    feature_description = {
        'features': tf.io.FixedLenFeature((128,), tf.int64),
        'label': tf.io.FixedLenFeature((1,), tf.int64)
    }

    example = tf.io.parse_single_example(example_proto, feature_description)

    features = example['features']
    label = example['label'][0]

    return features, label

In [48]:
def get_dataset(path, data_type='train'):
    """
    Takes in a path to the TFRecords and returns a TF Dataset to be used for training.
    """
    tfrecords = [f"{path}{data_type}/{x}" for x in os.listdir(f"{path}{data_type}/") if x.endswith('tfrecord')]
    tfrecords.sort()


    raw_dataset = tf.data.TFRecordDataset(tfrecords, num_parallel_reads=AUTO)
    parsed_dataset = raw_dataset.map(_parse_function, num_parallel_calls=AUTO)

    parsed_dataset = parsed_dataset.apply(tf.data.experimental.dense_to_ragged_batch(512,drop_remainder=True))
    return parsed_dataset

In [49]:
train_data_path = f"{ruta}/training_data/"
AUTO = tf.data.experimental.AUTOTUNE
training_data = get_dataset(train_data_path, data_type='train')
validation_data = get_dataset(train_data_path, data_type='val')

Instructions for updating:
Use `tf.data.Dataset.ragged_batch` instead.


### Load Vocab

In [50]:
# Loading the affiliation (target) vocab
with open(f"{ruta}/affiliation_vocab.pkl","rb") as f:
    affiliation_vocab = pickle.load(f)

In [51]:
inverse_affiliation_vocab = {i:j for j,i in affiliation_vocab.items()}

In [52]:
print(len(affiliation_vocab))
print(inverse_affiliation_vocab)

331
{0: 'https://openalex.org/I4210131459', 1: 'https://openalex.org/I2802639098', 2: 'https://openalex.org/I4210136257', 3: 'https://openalex.org/I232641801', 4: 'https://openalex.org/I4210106546', 5: 'https://openalex.org/I4210089903', 6: 'https://openalex.org/I4210113468', 7: 'https://openalex.org/I4210143717', 8: 'https://openalex.org/I297007282', 9: 'https://openalex.org/I4210135618', 10: 'https://openalex.org/I147729237', 11: 'https://openalex.org/I15881080', 12: 'https://openalex.org/I113598218', 13: 'https://openalex.org/I4210125239', 14: 'https://openalex.org/I228664971', 15: 'https://openalex.org/I4210116832', 16: 'https://openalex.org/I4210129512', 17: 'https://openalex.org/I4210120277', 18: 'https://openalex.org/I4210159438', 19: 'https://openalex.org/I4210112003', 20: 'https://openalex.org/I4210163650', 21: 'https://openalex.org/I4210164210', 22: 'https://openalex.org/I4210133262', 23: 'https://openalex.org/I4210117969', 24: 'https://openalex.org/I4210126837', 25: 'https:/

### Creating Model

In [53]:
# Hyperparameters to tune
emb_size = 256
max_len = 128
num_layers = 6
num_heads = 8
dense_1 = 2048
dense_2 = 1024
learn_rate = 0.00004

In [54]:
def scheduler(epoch, curr_lr):
    """
    Setting up a exponentially decaying learning rate.
    """
    rampup_epochs = 2
    exp_decay = 0.17
    def lr(epoch, beg_lr, rampup_epochs, exp_decay):
        if epoch < rampup_epochs:
            return beg_lr
        else:
            return beg_lr * math.exp(-exp_decay * epoch)
    return lr(epoch, start_lr, rampup_epochs, exp_decay)

In [55]:
# Allow for use of multiple GPUs
mirrored_strategy = tf.distribute.MirroredStrategy()

with mirrored_strategy.scope():
    # Model Inputs
    tokenized_aff_string_ids = tf.keras.layers.Input((128,), dtype=tf.int64, name='tokenized_aff_string_input')

    # Embedding Layers  #3816
    tokenized_aff_string_emb_layer = tf.keras.layers.Embedding(input_dim=330,
                                                               output_dim=int(emb_size),
                                                               mask_zero=True,
                                                               trainable=True,
                                                               name="tokenized_aff_string_embedding")

    tokenized_aff_string_embs = tokenized_aff_string_emb_layer(tokenized_aff_string_ids)

    # First dense layer
    dense_output = tf.keras.layers.Dense(int(dense_1), activation='relu',
                                             kernel_regularizer='L2', name="dense_1")(tokenized_aff_string_embs)
    dense_output = tf.keras.layers.Dropout(0.20, name="dropout_1")(dense_output)
    dense_output = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layer_norm_1")(dense_output)
    pooled_output = tf.keras.layers.GlobalAveragePooling1D()(dense_output)

    # Second dense layer
    dense_output = tf.keras.layers.Dense(int(dense_2), activation='relu',
                                             kernel_regularizer='L2', name="dense_2")(pooled_output)
    dense_output = tf.keras.layers.Dropout(0.20, name="dropout_2")(dense_output)
    dense_output = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layer_norm_2")(dense_output)

    # Last dense layer
    final_output = tf.keras.layers.Dense(len(affiliation_vocab), activation='softmax', name='cls')(dense_output)

    model = tf.keras.Model(inputs=tokenized_aff_string_ids, outputs=final_output)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learn_rate, beta_1=0.9,
                                                     beta_2=0.99),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

    curr_date = datetime.now().strftime("%Y%m%d")

    filepath_1 = f"{ruta}/models/{curr_date}_{dense_1}d1_{dense_2}d2/" \


    filepath = filepath_1 + "model_epoch{epoch:02d}ckpt"

    # Adding in checkpointing
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss',
                                                          verbose=0, save_best_only=False,
                                                          save_weights_only=False, mode='auto',
                                                          save_freq='epoch')

    # Adding in early stopping
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.001, patience=4)

    start_lr = float(learn_rate)

    # Adding in a learning rate schedule to decrease learning rate in later epochs
    lr_schedule = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=1)

    callbacks = [model_checkpoint, early_stopping, lr_schedule]


In [56]:
model.summary()

### Training the Model

In [57]:
history = model.fit(training_data, epochs=20, validation_data=validation_data, verbose=1, callbacks=callbacks)


Epoch 1: LearningRateScheduler setting learning rate to 4e-05.
Epoch 1/20
     28/Unknown [1m15s[0m 226ms/step - loss: 24.2372 - sparse_categorical_accuracy: 0.0140

  self.gen.throw(typ, value, traceback)


[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 281ms/step - loss: 24.1925 - sparse_categorical_accuracy: 0.0145 - val_loss: 20.5823 - val_sparse_categorical_accuracy: 0.1553 - learning_rate: 4.0000e-05

Epoch 2: LearningRateScheduler setting learning rate to 4e-05.
Epoch 2/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 275ms/step - loss: 22.4183 - sparse_categorical_accuracy: 0.1211 - val_loss: 19.2873 - val_sparse_categorical_accuracy: 0.3169 - learning_rate: 4.0000e-05

Epoch 3: LearningRateScheduler setting learning rate to 2.8470812910504387e-05.
Epoch 3/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 275ms/step - loss: 21.2753 - sparse_categorical_accuracy: 0.2555 - val_loss: 18.6408 - val_sparse_categorical_accuracy: 0.3648 - learning_rate: 2.8471e-05

Epoch 4: LearningRateScheduler setting learning rate to 2.4019823152490636e-05.
Epoch 4/2

In [None]:
json.dump(str(history.history), open(f"{filepath_1}_25EPOCHS_HISTORY.json", 'w+'))

In [None]:
model.save(f"{filepath_1}basic_model.keras")