In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from src.utils.path_converter import path_converter

df = pd.read_csv(path_converter("/data/raw/dates_advanced_25k.csv"))

train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

train_df.to_csv(path_converter("/data/raw/train_dates.csv"), index=False)
val_df.to_csv(path_converter("/data/raw/val_dates.csv"), index=False)
test_df.to_csv(path_converter("/data/raw/test_dates.csv"), index=False)

In [2]:
from datasets import load_dataset
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")


def preprocess(example):
    model_inputs = tokenizer(
        example["input_text"], max_length=64, padding="max_length", truncation=True
    )
    labels = tokenizer(
        example["target_text"], max_length=16, padding="max_length", truncation=True
    )["input_ids"]
    model_inputs["labels"] = labels
    return model_inputs


dataset = load_dataset(
    "csv",
    data_files={
        "train": path_converter("/data/raw/train_dates.csv"),
        "val": path_converter("/data/raw/val_dates.csv"),
        "test": path_converter("/data/raw/test_dates.csv"),
    },
)

dataset = dataset.map(preprocess)

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Generating train split: 20250 examples [00:00, 533187.63 examples/s]
Generating val split: 2250 examples [00:00, 646736.84 examples/s]
Generating test split: 2500 examples [00:00, 783806.25 examples/s]
Map: 100%|██████████| 20250/20250 [00:02<00:00, 7306.36 examples/s]
Map: 100%|██████████| 2250/2250 [00:00<00:00, 7507.13 examples/s]
Map: 100%|██████████| 2500/2500 [00:00<00:00, 7042.69 examples/s]


In [4]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import TrainingArguments, Trainer

model = T5ForConditionalGeneration.from_pretrained("t5-small")

training_args = TrainingArguments(
    output_dir="date-normalizer",
    num_train_epochs=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=3e-4,
    logging_steps=200,
    use_cpu=True,
    no_cuda=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["val"],
)

trainer.train()


  return FileStore(store_uri, store_uri)


Step,Training Loss
200,0.5031
400,0.1465
600,0.1252
800,0.1092
1000,0.107
1200,0.0974
1400,0.0934
1600,0.0883
1800,0.0846
2000,0.0869


TrainOutput(global_step=2532, training_loss=0.13033619198188962, metrics={'train_runtime': 7010.56, 'train_samples_per_second': 11.554, 'train_steps_per_second': 0.361, 'total_flos': 1370335739904000.0, 'train_loss': 0.13033619198188962, 'epoch': 4.0})

In [5]:
def normalize_date(text: str):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=20, num_beams=5)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


print(normalize_date("2011-12-09 12:50:00"))


2011-12-09


In [6]:
df = pd.DataFrame(["2011-12-09 12:50:00"])
df.columns = ['date']

df = pd.to_datetime(df['date'])
print(df.dt.date)

0    2011-12-09
Name: date, dtype: object


In [7]:
import numpy as np


def evaluate_model(model, tokenizer, dataset):
    correct = 0
    total = len(dataset)
    i = 0

    for sample in dataset:
        i += 1
        inp = tokenizer(sample["input_text"], return_tensors="pt")
        output = model.generate(**inp, max_length=20)
        prediction = tokenizer.decode(output[0], skip_special_tokens=True)

        if prediction == sample["target_text"]:
            correct += 1
        print(correct / i)

    print("Accuracy:", correct / total)

In [8]:
evaluate_model(model, tokenizer, dataset["test"])

0.0
0.0
0.3333333333333333
0.25
0.4
0.3333333333333333
0.42857142857142855
0.5
0.5555555555555556
0.5
0.45454545454545453
0.5
0.5384615384615384
0.5714285714285714
0.6
0.625
0.6470588235294118
0.6666666666666666
0.6842105263157895
0.7
0.7142857142857143
0.7272727272727273
0.7391304347826086
0.7083333333333334
0.72
0.6923076923076923
0.7037037037037037
0.7142857142857143
0.7241379310344828
0.7333333333333333
0.7419354838709677
0.75
0.7575757575757576
0.7647058823529411
0.7714285714285715
0.75
0.7297297297297297
0.7105263157894737
0.717948717948718
0.725
0.7317073170731707
0.7380952380952381
0.7441860465116279
0.75
0.7555555555555555
0.7391304347826086
0.723404255319149
0.7291666666666666
0.7346938775510204
0.74
0.7450980392156863
0.75
0.7547169811320755
0.7592592592592593
0.7636363636363637
0.7678571428571429
0.7543859649122807
0.7586206896551724
0.7627118644067796
0.7666666666666667
0.7704918032786885
0.7741935483870968
0.7777777777777778
0.78125
0.7692307692307693
0.7727272727272727
0

KeyboardInterrupt: 

In [9]:
model.save_pretrained("model-small", safe_serialization=True)