In [None]:
# Add the parent of the working directory to sys.path so the fair_housing_guardrail package can be imported

import os
import sys

# Resolve the current working directory to an absolute, symlink-free path and
# take its parent; that parent is what gets exposed to the import system.
# NOTE(review): presumably the notebook is launched from a sub-folder of the
# repository, so the parent is where the package lives - confirm.
working_dir = os.path.realpath("./")
dir_path = os.path.dirname(working_dir)

# Insert only once so re-running this cell does not pile up duplicate entries.
# Position 1 leaves whatever is at sys.path[0] in front.
if dir_path not in sys.path:
    sys.path.insert(1, dir_path)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objects as go

from fair_housing_guardrail.data.json_dataset import JsonDataset
from fair_housing_guardrail.utils.helper import load_config, load_dataset, load_tokenizer
from fair_housing_guardrail.utils.fair_housing_classification import (
    FairHousingGuardrailClassification,
)

In [None]:
# Relative path to the training configuration file.
# NOTE(review): how this path is resolved depends on load_config - confirm the
# expected working directory.
CONFIG_FILE_PATH = "configs/train-config.yaml"
# Parsed configuration; later cells index it like a nested mapping
# (e.g. config["output_data"]["model_output_dir"]).
config = load_config(CONFIG_FILE_PATH)
# Tokenizer created by the project helper, called here with no arguments.
tokenizer = load_tokenizer()
# load_dataset returns a pair, unpacked here as the train and test datasets.
train_dataset, test_dataset = load_dataset(config, tokenizer)

In [4]:
# Build the classifier wrapper from the config, tokenizer and both datasets,
# then run training. train() hands back two loss histories, which are unpacked
# as (training, validation).
fh_classifier = FairHousingGuardrailClassification(
    config,
    tokenizer,
    train_dataset,
    test_dataset,
)
train_loss, eval_loss = fh_classifier.train()

# Echo each raw history under its heading (heading and data on separate lines).
print("Train Loss", train_loss, sep="\n")
print("Validation Loss", eval_loss, sep="\n")

# Plot loss
# Each history entry is a (loss, step) pair. Keep a separate step sequence per
# curve: sharing one `steps` variable would let the validation steps overwrite
# the training steps, so the training losses would be drawn against the wrong
# x-axis whenever the two histories are recorded at different steps or have
# different lengths.
_train_loss, train_steps = zip(*train_loss)
_eval_loss, eval_steps = zip(*eval_loss)

fig = go.Figure()
fig.add_trace(
    go.Scatter(x=train_steps, y=_train_loss, mode="lines+markers", name="Training Loss")
)
fig.add_trace(
    go.Scatter(x=eval_steps, y=_eval_loss, mode="lines+markers", name="Validation Loss")
)

fig.update_layout(title="Loss", xaxis_title="Training Steps", yaxis_title="Loss")
fig.show()

# Persist the training artefacts.
# The trainer saves first, called without arguments.
# NOTE(review): presumably this writes to the trainer's configured output
# directory - confirm.
fh_classifier.trainer.save_model()

# Model and tokenizer are then written to the directories named in the
# "output_data" section of the config.
output_paths = config["output_data"]
fh_classifier.model.save_pretrained(output_paths["model_output_dir"])
# Kept as the last expression so the notebook displays its return value.
fh_classifier.tokenizer.save_pretrained(output_paths["tokenizer_output_dir"])

{'loss': 0.4724, 'grad_norm': 8.26887035369873, 'learning_rate': 2.0000000000000003e-06, 'epoch': 4.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.6633723974227905, 'eval_accuracy': 0.5714285714285714, 'eval_runtime': 0.4879, 'eval_samples_per_second': 14.346, 'eval_steps_per_second': 4.099, 'epoch': 4.0}
{'loss': 0.5772, 'grad_norm': 4.045468807220459, 'learning_rate': 1.0000000000000002e-06, 'epoch': 4.5}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.6623252034187317, 'eval_accuracy': 0.5714285714285714, 'eval_runtime': 0.3952, 'eval_samples_per_second': 17.714, 'eval_steps_per_second': 5.061, 'epoch': 4.5}
{'loss': 0.7105, 'grad_norm': 6.000375270843506, 'learning_rate': 0.0, 'epoch': 5.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.6618289351463318, 'eval_accuracy': 0.5714285714285714, 'eval_runtime': 0.9013, 'eval_samples_per_second': 7.767, 'eval_steps_per_second': 2.219, 'epoch': 5.0}
{'train_runtime': 117.0595, 'train_samples_per_second': 0.256, 'train_steps_per_second': 0.085, 'train_loss': 0.6271948099136353, 'epoch': 5.0}
Train Loss
[(0.6347, 1), (0.6736, 2), (0.735, 3), (0.5158, 4), (0.6069, 5), (0.6754, 6), (0.6705, 7), (0.4724, 8), (0.5772, 9), (0.7105, 10)]
Validation Loss
[(0.6840260624885559, 1), (0.6792192459106445, 2), (0.6757868528366089, 3), (0.6723960041999817, 4), (0.6695025563240051, 5), (0.6669005751609802, 6), (0.6649256944656372, 7), (0.6633723974227905, 8), (0.6623252034187317, 9), (0.6618289351463318, 10)]


('../output/tokenizer/tokenizer_config.json',
 '../output/tokenizer/special_tokens_map.json',
 '../output/tokenizer/vocab.txt',
 '../output/tokenizer/added_tokens.json',
 '../output/tokenizer/tokenizer.json')