In [None]:
# Safe to re-run: the guard below performs the directory change only once per kernel.
import os
import sys

# Resolve the parent directory to an absolute path *before* changing into it.
# A relative ".." entry in sys.path is resolved against whatever the working
# directory is when an import consults it, so after the chdir it would no
# longer name this directory.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("..")
    os.chdir(PROJECT_ROOT)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Expose only one CUDA device to this process; the index is machine-specific.
os.environ["CUDA_VISIBLE_DEVICES"]="3"

In [None]:
import numpy as np
import pandas as pd
import logging
from sklearn import datasets

In [None]:
from be_great import GReaT
import logging


class CustomFormatter(logging.Formatter):
    """Log formatter that wraps every line in an ANSI colour chosen by level.

    Each level listed in ``FORMATS`` gets its own delegate
    ``logging.Formatter``; records with any other level number are rendered
    with the INFO layout so they keep the timestamp and source location.
    """

    # ANSI escape sequences used to colour the output.
    grey = "\x1b[39;20m"
    yellow = "\x1b[33;20m"
    red = "\x1b[31;20m"
    bold_red = "\x1b[31;1m"
    reset = "\x1b[0m"

    # Deliberately not called ``format``: that name belongs to the method
    # below, which would otherwise replace this string on the class.
    line_format = "%(asctime)s - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)"

    FORMATS = {
        logging.DEBUG: grey + line_format + reset,
        logging.INFO: grey + line_format + reset,
        logging.WARNING: yellow + line_format + reset,
        logging.ERROR: red + line_format + reset,
        logging.CRITICAL: bold_red + line_format + reset,
    }

    def __init__(self, *args, **kwargs):
        """Accept the usual ``logging.Formatter`` arguments and pre-build delegates."""
        super().__init__(*args, **kwargs)
        # Build the per-level formatters once instead of once per record.
        self._delegates = {
            level: logging.Formatter(fmt) for level, fmt in self.FORMATS.items()
        }
        # Used for level numbers that are not keys of FORMATS (custom levels).
        self._fallback = logging.Formatter(self.FORMATS.get(logging.INFO))

    def format(self, record):
        """Return ``record`` rendered with the layout registered for its level.

        Args:
            record: The ``logging.LogRecord`` to render.

        Returns:
            The formatted, colour-wrapped log line.
        """
        delegate = self._delegates.get(record.levelno, self._fallback)
        return delegate.format(record)


def set_logging_level(level=logging.INFO):
    """Set the root logger's level and attach a colourised console handler.

    The console handler is registered under a fixed name and reused on later
    calls, so calling this function repeatedly (for example by re-running a
    notebook cell) adjusts the level without duplicating every log line.

    Args:
        level: Logging level applied to both the root logger and the handler.

    Returns:
        The root logger.
    """
    handler_name = "custom_formatter_console"

    logger = logging.getLogger()
    logger.setLevel(level)

    # Look the handler up by name rather than by formatter class: re-running
    # the cell that defines CustomFormatter creates a new class object, which
    # would defeat an isinstance check.
    ch = next((h for h in logger.handlers if h.get_name() == handler_name), None)
    if ch is None:
        ch = logging.StreamHandler()
        ch.set_name(handler_name)
        logger.addHandler(ch)

    ch.setLevel(level)
    # Always install a fresh formatter so the current class definition is used.
    ch.setFormatter(CustomFormatter())

    return logger


In [None]:
# g and dataset select which file under datasets/gen/ is loaded; g is also
# reused later in the name of the generated CSV.
g = 6
dataset = 'iris'

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects,
# which can execute code. Only point this at trusted, locally produced files.
payload = np.load(f"datasets/gen/{dataset}-{g}.npy", allow_pickle=True).item()

# Keep the container and the table under separate names so `data` always
# refers to the table that is passed to great.fit().
data = payload['watermarked_data']

columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']

# Must be the last expression of the cell for the notebook to render it.
# Assumes `data` is a pandas DataFrame — TODO confirm.
data.head()

In [None]:
great = GReaT(
    # Name of the large language model used (see HuggingFace for more options)
    "distilgpt2",
    # Number of epochs to train
    epochs=400,
    # Save model weights every x steps
    save_steps=2000,
    # Log the loss and learning rate every x steps
    logging_steps=500,
    # Name of the directory where all intermediate steps are saved
    experiment_dir="trainer_iris",
    # Batch size
    batch_size=16,
    # Optional overrides, currently disabled:
    #   lr_scheduler_type="constant"  -> specify the learning rate scheduler
    #   learning_rate=5e-5            -> set the initial learning rate
)

In [None]:
# Fit the model on the table loaded above. The returned object is kept because
# its state.log_history is read in the following cells to plot the loss.
trainer = great.fit(data)

In [None]:
# Keep only the log records that carry a "loss" value. Selecting by key rather
# than dropping the last record by position also copes with an empty history
# and with any other record type mixed in (presumably the final entry is an
# end-of-training summary without a "loss" key — TODO confirm).
loss_hist = [entry for entry in trainer.state.log_history if "loss" in entry]

In [None]:
# Pull the two series out of the filtered log records; both lists are built
# from the same sequence, so they stay index-aligned for plotting.
loss = [record["loss"] for record in loss_hist]
epochs = [record["epoch"] for record in loss_hist]

In [None]:
import matplotlib.pyplot as plt

# Explicit figure/axes with labels so the plot is readable on its own.
fig, ax = plt.subplots()
ax.plot(epochs, loss)
ax.set_xlabel("epoch")
ax.set_ylabel("loss")
ax.set_title("Training loss")
# Render the figure without echoing the Line2D repr as cell output.
plt.show()

In [None]:
# Persist the fitted model under "iris"; the next cell reloads it from there
# via GReaT.load_from_dir.
great.save("iris")

In [None]:
# Rebind `great` to the model reloaded from the "iris" directory, so the
# sampling below uses the saved artefact rather than the in-memory object.
great = GReaT.load_from_dir("iris")

In [None]:
# Number of samples requested from great.sample() below
# (150 presumably mirrors the size of the Iris dataset — confirm).
n_samples = 150

In [None]:
# Draw n_samples synthetic samples from the reloaded model.
# NOTE(review): k is presumably the generation batch size — confirm against
# the be_great API.
samples = great.sample(n_samples, k=5)

In [None]:
# to_csv does not create missing directories, so make sure the target exists
# before writing (a failure here would come only after training and sampling).
os.makedirs("datasets/iris", exist_ok=True)
samples.to_csv(f"datasets/iris/iris_{g}_gen.csv",index=False)

# Preview as the cell's last expression so the notebook renders it as a table;
# this replaces the plain-text dump of the whole frame.
samples.head()