In [1]:
import random

from transformers import AutoTokenizer, TrainingArguments, AutoModelForCausalLM
from be_great.great_dataset import GReaTDataset
from sklearn.model_selection import train_test_split
from be_great.great_dataset import GReaTDataset, GReaTDataCollator
from be_great.great_trainer import GReaTTrainer

import pandas as pd
import pickle
import json
import os

import numpy as np

import matplotlib.pyplot as plt

from utils import *

DATA_PATH = 'data/processed_dataset'
path='iris-dataset'
# path = 'diabetes-readmissions-column-annotation'
# path = 'environment-impact-of-food-production'
# path = 'stackoverflow2016'
SAVE_PATH = 'rs/pretraining'
SPLIT_INFO_PATH = 'split_3sets.json'

TOTAL_EPOCHS = 500
CHECKPOINT_EPOCH = 25 # save after every checkpoint epoch
BATCH_SIZE = 32 # paper
LR = 5.e-5 # paper

MODEL_CONFIG = {
    "epochs": 1,
    "batch_size": BATCH_SIZE,
    "lr": LR,
    "verbose": True
}


tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
tokenizer.model_max_length = 512
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained('distilgpt2')

training_args = TrainingArguments(
            output_dir='test_code',
            save_strategy='no',
            num_train_epochs=MODEL_CONFIG['epochs'],
            per_device_train_batch_size=MODEL_CONFIG['batch_size'],
            per_device_eval_batch_size=MODEL_CONFIG['batch_size'],
            logging_strategy='epoch',
            do_eval=True,
            evaluation_strategy='epoch',
        )


path = os.path.join(DATA_PATH, path)
df = get_df(path)

print('\t - Split')
df, df_val = train_test_split(df, test_size=0.3, random_state=121)

print('\t - Create training set')
# train set
great_ds_train = GReaTDataset.from_pandas(df)
great_ds_train.set_tokenizer(tokenizer)

print('\t - Create validation set')
# val set
great_ds_val = GReaTDataset.from_pandas(df_val)
great_ds_val.set_tokenizer(tokenizer)

  from .autonotebook import tqdm as notebook_tqdm


	 - Split
	 - Create training set
	 - Create validation set


In [2]:
from be_great.great import CustomGReaT

finetune_model = CustomGReaT('rs/finetune_val/iris-dataset/checkpoint-136')




In [3]:
import torch

finetune_model.init_column_info(df)
df_syn = finetune_model.sample(len(df_val), device=torch.device('cpu'))
df_syn

95it [00:04, 19.13it/s]               


Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.0,4.3,1.3,Iris-setosa
1,5.4,3.2,5.5,1.2,Iris-setosa
2,6.0,2.4,4.7,1.5,Iris-setosa
3,6.8,3.0,5.3,0.2,Iris-versicolor
4,4.5,3.9,5.1,1.4,Iris-virginica
5,6.9,3.0,5.7,1.5,Iris-versicolor
6,7.3,3.2,5.4,1.4,Iris-versicolor
7,6.1,3.0,5.1,0.2,Iris-virginica
8,5.6,3.0,2.3,1.8,Iris-setosa
9,6.1,2.0,1.6,2.5,Iris-setosa


In [4]:
metadata = get_metadata(path)
metadata

{'primary_key': 'Id',
 'columns': {'Id': {'sdtype': 'id', 'subtype': 'integer'},
  'SepalLengthCm': {'sdtype': 'numerical', 'subtype': 'float'},
  'SepalWidthCm': {'sdtype': 'numerical', 'subtype': 'float'},
  'PetalLengthCm': {'sdtype': 'numerical', 'subtype': 'float'},
  'PetalWidthCm': {'sdtype': 'numerical', 'subtype': 'float'},
  'Species': {'sdtype': 'categorical'}}}

In [7]:
metadata = get_metadata(path)

filtered_metadata = filter_metdata(metadata, df_syn.columns)
ft_report = scoring(df_val[df_val.columns[:-1]], df_syn[df_syn.columns[:-1]], filtered_metadata)
# st_report = scoring(real_data, st_syn_data, filtered_metadata)

ValueError: The metadata does not match the data. The following columns are missing in the real/synthetic data or in the metadata: Species

In [6]:
df_syn.columns

Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [12]:
from transformers import EarlyStoppingCallback

great_trainer = GReaTTrainer(
    model,
    training_args,
    train_dataset=great_ds_train,
    eval_dataset=great_ds_val,
    tokenizer=tokenizer,
    data_collator=GReaTDataCollator(tokenizer),
    callbacks = [EarlyStoppingCallback(early_stopping_patience=1)]
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [4]:
tokenizer.model_max_length

1024

In [13]:

print('\t - Training')
# Start training
great_trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


	 - Training


Epoch,Training Loss,Validation Loss
1,5.1182,4.505304
2,4.3737,3.828388
3,3.9483,3.38508
4,3.3813,3.035224
5,3.0437,2.822397
6,2.8343,2.60089
7,2.5496,2.430202
8,2.4802,2.358957
9,2.3363,2.29156
10,2.2297,2.170893


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=24, training_loss=3.049200793107351, metrics={'train_runtime': 61.8383, 'train_samples_per_second': 11.32, 'train_steps_per_second': 0.647, 'total_flos': 5421397229568.0, 'train_loss': 3.049200793107351, 'epoch': 12.0})

In [6]:
model = AutoModelForCausalLM.from_pretrained('test_code/checkpoint-24')

In [8]:
model.state_dict()

OrderedDict([('transformer.wte.weight',
              tensor([[-0.1452, -0.0452,  0.0042,  ..., -0.1528,  0.0180,  0.0985],
                      [ 0.0567, -0.0721,  0.0241,  ...,  0.0596, -0.0039,  0.0471],
                      [-0.1113,  0.0393,  0.1948,  ...,  0.0414, -0.1135, -0.1461],
                      ...,
                      [-0.0710, -0.0173,  0.0176,  ...,  0.0834,  0.1340, -0.0746],
                      [ 0.1993,  0.0201,  0.0152,  ..., -0.0830,  0.0750, -0.0295],
                      [ 0.0345,  0.0639,  0.0300,  ...,  0.0294,  0.0941,  0.0637]])),
             ('transformer.wpe.weight',
              tensor([[-1.8336e-02, -1.9774e-01,  4.5633e-03,  ..., -4.2501e-02,
                        2.8477e-02,  5.4382e-02],
                      [ 2.4498e-02, -5.3674e-02, -9.5113e-02,  ...,  3.4056e-02,
                        9.6513e-03,  1.7008e-05],
                      [ 4.8749e-03, -8.4548e-02,  5.4577e-02,  ...,  1.9744e-02,
                        1.9432e-02, -2.1456

In [9]:
model = AutoModelForCausalLM.from_pretrained('distilgpt2')



In [14]:
great_trainer.save_model('test_weights.pt')

In [15]:
model = AutoModelForCausalLM.from_pretrained('test_weights.pt')

In [None]:
great_trainer.load

## Finetune

In [1]:
# from ctgan.synthesizers.tvae import CustomTVAE
import random

from transformers import AutoTokenizer, TrainingArguments, AutoModelForCausalLM, EarlyStoppingCallback
from be_great.great_dataset import GReaTDataset
from sklearn.model_selection import train_test_split
from be_great.great_dataset import GReaTDataset, GReaTDataCollator
from be_great.great_trainer import GReaTTrainer

import pandas as pd
import pickle
import json
import os

import numpy as np

import matplotlib.pyplot as plt

from utils import *

############# CONFIG #############

DATA_PATH= 'data/processed_dataset'
PRETRAIN_PATH = 'rs/pretraining/weights.pt'
PRETRAIN_PATH = 'test_weights.pt'
SAVE_PATH = 'rs/test_finetuning'
SPLIT_INFO_PATH = 'split_3sets.json'

TOTAL_EPOCHS = 500
# CHECKPOINT_EPOCH = 25 # save after every checkpoint epoch
BATCH_SIZE = 32 # paper
LR = 5.e-5 # paper
# EMBEDDING_DIM = 128
# ENCODERS_DIMS = (512, 256, 256, 128)
# DECODER_DIMS = (128, 256, 256, 512)

############# END CONFIG #############

MODEL_CONFIG = {
    # "input_dim": get_max_input_dim(DATA_PATH),
    "epochs": TOTAL_EPOCHS,
    "batch_size": BATCH_SIZE,
    "lr": LR,
    # "embedding_dim": EMBEDDING_DIM,
    # "compress_dims": ENCODERS_DIMS,
    # "decompress_dims": DECODER_DIMS,
    "verbose": True
}

tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
tokenizer.model_max_length = 512
tokenizer.pad_token = tokenizer.eos_token

training_hist = []

# list_data_paths = os.listdir(data_path)
split_info = json.load(open(SPLIT_INFO_PATH, 'r'))

list_data_paths = split_info['pretrain_paths']
list_data_paths
    
    

  from .autonotebook import tqdm as notebook_tqdm


['crime-economic-factors-datasets',
 'list-of-countries-by-wealth-equality',
 'top-100-greatest-hollywood-actors-of-all-time',
 'heart-data',
 'zoo-animals',
 'campusplacementinternship',
 'indian-chess-grandmasters',
 'richestrappers',
 'proyeksi-jumlah-penduduk-indonesia-jenis-kelamin',
 'clouds',
 'diet-of-elderly-people',
 'iphone-purchase-records',
 'early-stage-diabetes-risk-prediction-dataset',
 'svalbard-climate-19102017',
 'advertising-dataset',
 'the-sinking-of-mv-sewol-south-korea',
 'fruits',
 'global-politcs-and-governance-data-apr-2020',
 '50ulke',
 'prostate-cancer-survival-data',
 'mall-customer-cluster',
 'singapore-train-station-coordinates',
 'forest-fire-prediction',
 'userdata',
 'morocco-covid-19-evolution-dataset',
 'world-press-index-20192021',
 'pokemon-dataset-with-stats',
 'lab-tat-dataset',
 'airtel-youtube-video-dataset',
 'gradient-descent',
 'lisbon-house-prices',
 'pakistan-cities',
 'blackjack-decision-matrix',
 'housing-simple-regression',
 'fifacsv',


In [6]:
path = 'list-of-countries-by-wealth-equality'
    
dataset_save_path = os.path.join(SAVE_PATH, path)
path = os.path.join(DATA_PATH, path)
df = get_df(path)
n_rows, n_cols = len(df), len(df.columns)
    
print(f'path: {path} | dataset: {path} | n_cols: {n_cols}, n_rows: {n_rows}')

print('\t - Split')
df, df_val = train_test_split(df, test_size=0.3, random_state=121)

print('\t - Create training set')
# train set
great_ds_train = GReaTDataset.from_pandas(df)
great_ds_train.set_tokenizer(tokenizer)

print('\t - Create validation set')
# val set
great_ds_val = GReaTDataset.from_pandas(df_val)
great_ds_val.set_tokenizer(tokenizer)

if 10 < n_cols <= 20:
    MODEL_CONFIG['batch_size'] = 16
    MODEL_CONFIG['batch_size'] = 16

if 20 < n_cols <= 30:
    MODEL_CONFIG['batch_size'] = 8
    MODEL_CONFIG['batch_size'] = 8
    
if n_cols > 30:
    MODEL_CONFIG['batch_size'] = 2
    MODEL_CONFIG['batch_size'] = 2
    
model = AutoModelForCausalLM.from_pretrained(PRETRAIN_PATH)

training_args = TrainingArguments(
            output_dir=dataset_save_path,
            save_strategy='epoch',
            num_train_epochs=MODEL_CONFIG['epochs'],
            per_device_train_batch_size=MODEL_CONFIG['batch_size'],
            per_device_eval_batch_size=MODEL_CONFIG['batch_size'],
            logging_strategy='epoch',
            do_eval=True,
            evaluation_strategy='epoch',
            metric_for_best_model = 'eval_loss',
            save_total_limit=1,
            load_best_model_at_end=True
        )
    
great_trainer = GReaTTrainer(
    model,
    training_args,
    train_dataset=great_ds_train,
    eval_dataset=great_ds_val,
    tokenizer=tokenizer,
    data_collator=GReaTDataCollator(tokenizer),
    callbacks = [EarlyStoppingCallback(early_stopping_patience=1)]
)

print('\t - Training')
# Start training
great_trainer.train()

ds_name = os.path.basename(path)

print('\t - Update training history')
training_hist = merge_training_hist(get_training_hist(great_trainer), ds_name, training_hist)

print('\t -> Finished')

MODEL_CONFIG['batch_size'] = BATCH_SIZE
    
save_training_history(training_hist, SAVE_PATH)
    

path: data/processed_dataset/list-of-countries-by-wealth-equality | dataset: data/processed_dataset/list-of-countries-by-wealth-equality | n_cols: 8, n_rows: 181
	 - Split
	 - Create training set
	 - Create validation set


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


	 - Training


Epoch,Training Loss,Validation Loss
1,3.0238,2.345644
2,2.1212,1.905011
3,1.7282,1.587621
4,1.4393,1.374492
5,1.2753,1.238905
6,1.1755,1.124764
7,1.1051,1.026505
8,1.0272,0.965091
9,0.9813,0.899746
10,0.9168,0.859946


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


	 - Update training history
	 -> Finished


In [7]:
training_hist

Unnamed: 0,loss,grad_norm,learning_rate,epoch,step,eval_loss,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss,dataset
0,3.6924,14.257809,0.000050,1.0,2,,,,,,,,,,crime-economic-factors-datasets
1,,,,1.0,2,2.557843,0.5009,29.946,1.996,,,,,,crime-economic-factors-datasets
2,2.5445,9.419679,0.000050,2.0,4,,,,,,,,,,crime-economic-factors-datasets
3,,,,2.0,4,2.190998,0.5113,29.336,1.956,,,,,,crime-economic-factors-datasets
4,2.1705,8.770673,0.000050,3.0,6,,,,,,,,,,crime-economic-factors-datasets
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,0.7344,1.683156,0.000048,19.0,76,,,,,,,,,,list-of-countries-by-wealth-equality
74,,,,19.0,76,0.718325,3.0814,17.849,0.649,,,,,,list-of-countries-by-wealth-equality
75,0.7349,1.644449,0.000048,20.0,80,,,,,,,,,,list-of-countries-by-wealth-equality
76,,,,20.0,80,0.724119,3.4132,16.114,0.586,,,,,,list-of-countries-by-wealth-equality


In [None]:
training_hist_ds2 = training_hist.copy()

In [5]:
training_hist_ds1 = training_hist.copy()
training_hist_ds1

Unnamed: 0,loss,grad_norm,learning_rate,epoch,step,eval_loss,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss,dataset
0,3.6924,14.257809,5e-05,1.0,2,,,,,,,,,,crime-economic-factors-datasets
1,,,,1.0,2,2.557843,0.5009,29.946,1.996,,,,,,crime-economic-factors-datasets
2,2.5445,9.419679,5e-05,2.0,4,,,,,,,,,,crime-economic-factors-datasets
3,,,,2.0,4,2.190998,0.5113,29.336,1.956,,,,,,crime-economic-factors-datasets
4,2.1705,8.770673,5e-05,3.0,6,,,,,,,,,,crime-economic-factors-datasets
5,,,,3.0,6,1.751189,0.5097,29.429,1.962,,,,,,crime-economic-factors-datasets
6,1.8599,8.590057,5e-05,4.0,8,,,,,,,,,,crime-economic-factors-datasets
7,,,,4.0,8,1.514761,0.514,29.182,1.945,,,,,,crime-economic-factors-datasets
8,1.5554,6.336949,5e-05,5.0,10,,,,,,,,,,crime-economic-factors-datasets
9,,,,5.0,10,1.285887,0.5038,29.774,1.985,,,,,,crime-economic-factors-datasets
