# Prepare train/test datasets

In [2]:
import pandas as pd
import os.path
import numpy as np
import itertools

In [3]:
# Root directory of the afterglow runs and the simulation whose collated table is loaded.
# NOTE(review): AFGRUNDIR is a machine-specific absolute path; adjust it before
# running this notebook on another machine.
AFGRUNDIR = "/media/vsevolod/T7/work/prj_kn_afterglow/"
sim = {"name": "SFHoTim276_13_14_0025_150mstg_B0_HLLC"}
collated_file_path = os.path.join(AFGRUNDIR, sim["name"], "collated.csv")

assert os.path.isfile(collated_file_path), "Collated file not found"
df = pd.read_csv(collated_file_path, index_col=0)
print(f"File loaded: {collated_file_path}")
# DataFrame.info() prints its report itself and returns None, so it is called on
# its own line; nesting it inside the f-string appended a literal "None" to the message.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 6480000 entries, 0 to 6479999
Data columns (total 9 columns):
 #   Column     Dtype  
---  ------     -----  
 0   eps_e      float64
 1   eps_b      float64
 2   eps_t      float64
 3   p          float64
 4   theta_obs  float64
 5   n_ism      float64
 6   freq       float64
 7   time       float64
 8   flux       float64
dtypes: float64(9)
memory usage: 494.4 MB
None
File loaded: /media/vsevolod/T7/work/prj_kn_afterglow/SFHoTim276_13_14_0025_150mstg_B0_HLLC/collated.csv None


In [4]:
# Name of the DataFrame column that is treated as the target quantity.
target = "flux"

In [5]:
def _visualize_df(df:pd.DataFrame, name:str):
    print(f"\t> Visualizing {name} Shape: {df.shape}")

    display(df.head(2))

    print(f"\t Duplicated_rows: {df.duplicated().sum()}")

    # check df properties
    def analyze_df(df : pd.DataFrame)->pd.DataFrame:
        res = pd.DataFrame({
            "is_unique": df.nunique() == len(df),
            "unique": df.nunique(),
            "with_nan":df.isna().any(),
            "percent_nan":round((df.isnull().sum()/len(df))*100,4),
            "min":df.min(),
            "max":df.max(),
            "mean":df.mean(),
            "dtype":df.dtypes
        })
        return res
    print(f"\t> Numeric features: {df.select_dtypes(exclude='object').shape[1]} \n"
          f"{df.select_dtypes(exclude='object').keys()}")
    print(f"\t> Object features: {df.select_dtypes(exclude='number').shape[1]} \n"
          f"{df.select_dtypes(exclude='number').keys()}")
    print(f"\t Analyzing {name} Summary:")
    metadata = analyze_df(df=df)
    return metadata
# Summarise the loaded table; the returned per-column statistics are kept in
# `metadata` and shown as a table.
metadata = _visualize_df(df=df, name=sim["name"])
display(metadata)

	> Visualizing SFHoTim276_13_14_0025_150mstg_B0_HLLC Shape: (6480000, 9)


Unnamed: 0,eps_e,eps_b,eps_t,p,theta_obs,n_ism,freq,time,flux
0,0.001,0.001,0.01,2.2,0.0,0.001,2400000000.0,100000.0,7.278929e-11
1,0.001,0.001,0.01,2.2,0.0,0.001,2400000000.0,106332.657164,8.460537e-11


	 Duplicated_rows: 0
	> Numeric features: 9 
Index(['eps_e', 'eps_b', 'eps_t', 'p', 'theta_obs', 'n_ism', 'freq', 'time',
       'flux'],
      dtype='object')
	> Object features: 0 
Index([], dtype='object')
	 Analyzing SFHoTim276_13_14_0025_150mstg_B0_HLLC Summary:


Unnamed: 0,is_unique,unique,with_nan,percent_nan,min,max,mean,dtype
eps_e,False,5,False,0.0,0.001,0.5,0.1322,float64
eps_b,False,5,False,0.0,0.001,0.5,0.1322,float64
eps_t,False,4,False,0.0,0.01,1.0,0.4025,float64
p,False,4,False,0.0,2.2,2.8,2.5,float64
theta_obs,False,3,False,0.0,0.0,1.570796,0.7853982,float64
n_ism,False,6,False,0.0,0.001,1.0,0.2768333,float64
freq,False,6,False,0.0,2400000000.0,93000000000.0,31233330000.0,float64
time,False,150,False,0.0,100000.0,940444900.0,105263900.0,float64
flux,True,6480000,False,0.0,4.115669e-13,101.5367,0.1489473,float64


# Select and transform features

In [6]:
# Set target
# NOTE: assigning a scalar to a new column broadcasts it, so every row of
# `metadata` gets the value "flux" in its "target" column.
metadata["target"] = "flux"

In [7]:
# Report how many light curves the table holds: the product of the number of
# unique values of every column other than "flux" and "time".
n_curves = np.prod([
    metadata.loc[column, "unique"]
    for column in df.columns
    if column not in ("flux", "time")
])
n_times = metadata.loc["time", "unique"]
print(f"total number of light curves: {n_curves} times: {n_times}")

total number of light curves: 43200 times: 150


In [8]:
# Toy example: build one row per time value, each row being the full parameter
# vector followed by that time.
unique_times = np.array([10, 20, 30])
physical_parameters = np.array([1,2,3,4,5])
all_data_input = np.column_stack((
    np.tile(physical_parameters, (len(unique_times), 1)),
    unique_times,
))
print(all_data_input.shape)
print(all_data_input)

(3, 6)
[[ 1  2  3  4  5 10]
 [ 1  2  3  4  5 20]
 [ 1  2  3  4  5 30]]


In [10]:
# features_names = [col for col in list(df.columns) if col not in ["flux", "time"]]
# def prepare_scaling_metadata(df:pd.DataFrame, features_names:list[str]):
#     metadata = {}
    

In [56]:
# Prepare data in numpy arrays
def LcCollatedDataFrameToNumpyArray(df:pd.DataFrame, metadata:pd.DataFrame,target="flux",time="time"):
    """Convert the collated long-format table into per-light-curve numpy arrays.

    Every column of ``df`` other than ``target`` and ``time`` is treated as a
    feature; rows sharing the same feature values form one light curve.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format table with feature columns, a ``time`` column and a ``target`` column.
    metadata : pd.DataFrame
        Per-column summary indexed by column name; only its ``"unique"`` column
        (number of unique values per column of ``df``) is read.
    target : str
        Name of the column holding the light-curve values.
    time : str
        Name of the column holding the time values.

    Returns
    -------
    tuple
        ``(lcs, pars, times)``: ``lcs`` has one row per group of identical feature
        values, ``pars`` holds the matching feature values row by row, and ``times``
        holds the unique values of the ``time`` column in order of first appearance.

    Raises
    ------
    AssertionError
        If the number of unique times differs from the length of a light curve, or
        if the number of groups differs from the product of the unique feature counts.
    """
    features_names = [col for col in list(df.columns) if col not in [target, time]]
    print(f"Target name: '{target}' features_names: {features_names}")

    n_curves = np.prod([metadata["unique"][key] for key in features_names])
    # Look the time column up by the `time` argument rather than a hard-coded
    # "time", so a non-default column name works.
    n_times = metadata["unique"][time]
    print(f"Total number of light curves: {n_curves} times: {n_times}")

    # Iterate over the groups directly: `grouped.groups` stores index *labels*, so
    # feeding them to `.iloc` is only correct for a default 0..N-1 index.
    par_rows, lc_rows = [], []
    for key, curve in df.groupby(features_names)[target]:
        par_rows.append(np.array(key))
        lc_rows.append(curve.to_numpy())
    pars = np.vstack(par_rows)
    lcs = np.vstack(lc_rows)
    times = np.asarray(df[time].unique())
    assert len(times) == len(lcs[0]), "number of unique times differs from light-curve length"
    assert n_curves == len(lcs), "number of groups differs from the expected number of light curves"
    return (lcs, pars, times)
# Convert the loaded table into arrays (default target/time column names) and report their shapes.
lcs, pars, times = LcCollatedDataFrameToNumpyArray(df, metadata)  
print(f"lcs={lcs.shape}, pars={pars.shape}, times={times.shape}")

Target name: 'flux' features_names: ['eps_e', 'eps_b', 'eps_t', 'p', 'theta_obs', 'n_ism', 'freq']
Total number of light curves: 43200 times: 150
lcs=(43200, 150), pars=(43200, 7), times=(150,)


In [45]:
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn import preprocessing

In [66]:
# Round-trip check of the light-curve scaling: log10, min-max normalisation,
# then the inverse of both steps. Extremes are printed at each stage.
lcs_log = np.log10(lcs)
print(lcs.min(), lcs.max())
print(lcs_log.min(), lcs_log.max())
print(lcs_log.mean(), np.median(lcs_log))

# forward: map the log-values linearly using their global min and max
lcs_log_norm = (lcs_log - lcs_log.min()) / (lcs_log.max() - lcs_log.min())
print(lcs_log_norm.min(), lcs_log_norm.max())

# backward: undo the linear map, then undo the logarithm
lcs_log_rec = lcs_log.min() + lcs_log_norm * (lcs_log.max() - lcs_log.min())
lcs_rec = 10. ** lcs_log_rec
print(lcs_log_rec.min(), lcs_log_rec.max())
print(lcs_rec.min(), lcs_rec.max())

4.1156688662493273e-13 101.5366661484814
-12.3855595744698 2.0066228996921094
-3.6362396384942115 -3.5715946045498184
0.0 1.0
-12.3855595744698 2.0066228996921094
4.1156688662493283e-13 101.5366661484814


In [55]:
# Earlier experiment with log-scaling selected parameter columns (kept for reference):
# log_pars = pars.copy()
# log_pars[:,0:4] = np.log10(pars[:,0:4])
# log_pars[:,6:8] = np.log10(pars[:,6:8])

# Range of every parameter column before scaling
for i in range(pars.shape[1]):
    print(f"i={i}, min={pars[:, i].min()} max={pars[:, i].max()}")

# pars[6:8] = np.log10(pars[6:8])
# Fit a min-max scaler on the parameters and inspect what it learned
scaler = preprocessing.MinMaxScaler().fit(pars)
normalized = scaler.transform(pars)
print(f"scaler.data_max_={scaler.data_max_}")
print(f"scaler.data_min_={scaler.data_min_}")
print(f"scaler.data_range_={scaler.data_range_}")
print(normalized.shape)
# last row before and after scaling
print(pars[-1], normalized[-1], sep="\n")

i=0, min=0.001 max=0.5
i=1, min=0.001 max=0.5
i=2, min=0.01 max=1.0
i=3, min=2.2 max=2.8
i=4, min=0.0 max=1.5707963267948966
i=5, min=0.001 max=1.0
i=6, min=2400000000.0 max=93000000000.0
scaler.data_max_=[5.00000000e-01 5.00000000e-01 1.00000000e+00 2.80000000e+00
 1.57079633e+00 1.00000000e+00 9.30000000e+10]
scaler.data_min_=[1.0e-03 1.0e-03 1.0e-02 2.2e+00 0.0e+00 1.0e-03 2.4e+09]
scaler.data_range_=[4.99000000e-01 4.99000000e-01 9.90000000e-01 6.00000000e-01
 1.57079633e+00 9.99000000e-01 9.06000000e+10]
(43200, 7)
[5.00000000e-01 5.00000000e-01 1.00000000e+00 2.80000000e+00
 1.57079633e+00 1.00000000e+00 9.30000000e+10]
[1. 1. 1. 1. 1. 1. 1.]


In [68]:

class LightCurveDataset(Dataset):
    """
    LightCurve dataset
    Dispatches a lightcurve to the appropriate index

    Stores the raw light curves and parameters together with their scaled
    versions: parameters are min-max scaled per column with a fitted
    ``MinMaxScaler`` (kept in ``self.scaler``), light curves are log10-transformed
    and min-max scaled with the global extremes ``self.lc_min`` / ``self.lc_max``.
    """
    def __init__(self, pars:np.ndarray, lcs:np.ndarray, times:np.ndarray):
        """
        Parameters
        ----------
        pars : np.ndarray
            Parameters, one row per light curve.
        lcs : np.ndarray
            Light curves, one row per parameter set.
        times : np.ndarray
            Time grid; stored as-is.
        """
        self.pars = np.array(pars)
        self.lcs = np.array(lcs)
        assert self.pars.shape[0] == self.lcs.shape[0], "size mismatch between lcs and pars"
        self.times = times
        self.len = len(self.lcs)

        # preprocess parameters
        self.scaler = preprocessing.MinMaxScaler()
        self.scaler.fit(self.pars)
        self.pars_normed = self.scaler.transform(self.pars)
        # inverse transform
        # inverse = scaler.inverse_transform(normalized)

        # preprocess lcs
        self._transform_lcs(self.lcs)

    def __getitem__(self, index):
        """ returns (lc[log10, normalized], params[normalized], lc[physical], params[physical]) """
        return (self.lcs_log_norm[index], self.pars_normed[index], self.lcs[index], self.pars[index])

    def __len__(self):
        """ number of light curves in the dataset """
        return len(self.lcs)

    def _transform_lcs(self, lcs):
        """ log10-transform ``lcs`` and min-max scale the result; stores
        ``lc_min``, ``lc_max`` and ``lcs_log_norm`` on the instance """
        log_lcs = np.log10(lcs)
        self.lc_min = log_lcs.min()
        self.lc_max = log_lcs.max()
        # Scale with this dataset's own extremes (the same ones used by
        # inverse_transform_lc_log), not with a notebook-level global array.
        self.lcs_log_norm = (log_lcs - self.lc_min) / (self.lc_max - self.lc_min)

    def inverse_transform_lc_log(self, lcs_log_normed):
        """ undo the scaling of ``_transform_lcs``: rescale with
        ``lc_min``/``lc_max`` and raise 10 to the result """
        return np.power(10, lcs_log_normed * (self.lc_max - self.lc_min) + self.lc_min)

    def get_dataloader(self, batch_size=32, test_split=0.2):
        """
        Randomly split the dataset and wrap both parts in DataLoaders.

        Parameters
        ----------
        batch_size : int
            Batch size of both loaders.
        test_split : float
            Fraction of the samples assigned to the test loader
            (``floor(test_split * len(self))`` samples).

        Returns
        -------
        tuple
            ``(train_loader, test_loader)``

        Notes
        -----
        The split is drawn with ``np.random.shuffle``, i.e. from NumPy's global
        random state; seed it beforehand for a reproducible split.
        """
        dataset_size = len(self)
        indices = list(range(dataset_size))
        split = int(np.floor(test_split * dataset_size))
        np.random.shuffle(indices)
        # the first `split` shuffled indices form the test set, the rest the train set
        train_indices, test_indices = indices[split:], indices[:split]

        # Creating PT data samplers and loaders:
        train_sampler = SubsetRandomSampler(train_indices)
        test_sampler = SubsetRandomSampler(test_indices)

        train_loader = DataLoader(self, batch_size=batch_size,
                                  sampler=train_sampler, drop_last=False)
        test_loader = DataLoader(self, batch_size=batch_size,
                                 sampler=test_sampler, drop_last=False)

        return (train_loader, test_loader)
# Wrap the parameter, light-curve and time arrays in the dataset class.
dataset = LightCurveDataset(pars, lcs, times)

In [70]:
import torch.optim as optim
from model_cvae import CVAE
# Model sized from the data: image_size is the number of samples per light
# curve, c the number of parameters per curve.
# TODO (carried over from the original cell): check these CVAE settings!
model = CVAE(
    image_size=lcs.shape[1],
    hidden_dim=200,
    z_dim=20,
    c=pars.shape[1],
)
train_loader, test_loader = dataset.get_dataloader(batch_size=32, test_split=0.2)
# TODO (carried over from the original cell): CHECK the optimizer settings
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [71]:
import torch
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
if device.type == "cuda":
    # release cached GPU memory held by torch's allocator
    torch.cuda.empty_cache()

cuda


In [None]:
import torch.nn as nn
import datetime

class EarlyStopping:
    """Early stops the training if validation loss doesn't
    improve after a given patience."""
    def __init__(self, patience=7, min_delta=0, verbose=False):
        """
        Attributes
        ----------
        patience  : int
            How long to wait after last time validation loss improved.
            Default: 7
        min_delta : float
            Minimum decrease of the monitored value to qualify as
            improvement. This number should be non-negative.
            Default: 0
        verbose   : bool
            If True, prints a message for each validation loss improvement.
            Default: False
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.Inf was removed in NumPy 2.0; np.inf works in every version.
        self.val_loss_min = np.inf
        self.min_delta = min_delta

    def __call__(self, val_loss):
        """Record one validation loss and update ``counter`` / ``early_stop``.

        Parameters
        ----------
        val_loss : float or single-element tensor
            Validation loss of the current epoch. It is converted with
            ``float()``, so plain numbers and 0-d tensors are both accepted.
        """
        current_loss = float(val_loss)

        # An epoch counts as an improvement only if the loss *decreased* by more
        # than min_delta; a plateau or a worse loss both consume patience.
        improved = (self.best_score is None
                    or current_loss < self.best_score - self.min_delta)

        if improved:
            if self.verbose and self.best_score is not None:
                print(f'Validation loss improved ({self.best_score:.6f} --> {current_loss:.6f})')
            self.best_score = current_loss
            self.val_loss_min = min(self.val_loss_min, current_loss)
            self.counter = 0
        else:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} / {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
            
class Trainer:
    """Holds a model, its optimizer and the bookkeeping needed for training.

    NOTE(review): the training loop in ``train`` currently only prints the epoch
    header and the beta value; no optimisation step is performed yet.
    """
    def __init__(self, model:"CVAE", optimizer, batch_size, scheduler=None, cond_l=False, cond_p=False,
                 beta='step', print_every=50, device='cpu'):
        """
        Parameters
        ----------
        model : CVAE
            Model to train; wrapped in ``nn.DataParallel`` when more than one
            CUDA device is visible, then moved to ``device``.
        optimizer
            Optimizer instance, stored as ``self.opt``.
        batch_size : int
            Stored as ``self.batch_size``.
        scheduler
            Optional scheduler, stored as ``self.sch``.
        cond_l, cond_p
            Stored as-is; not used by the code visible in this class.
        beta : 'step' or value convertible to float
            ``'step'`` selects the step schedule of ``_beta_scheduler``; any
            other value is returned by it as a constant ``float(beta)``.
        print_every : int
            Stored as ``self.print_every``.
        device
            Device the model is moved to.
        """
        self.device = device
        self.model = model
        if torch.cuda.device_count() > 1:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            self.model = nn.DataParallel(self.model)
        self.model.to(self.device)
        print('Is model in cuda? ', next(self.model.parameters()).is_cuda)
        self.opt = optimizer
        self.sch = scheduler
        self.cond_l = cond_l
        self.cond_p = cond_p
        self.batch_size = batch_size
        # per-quantity histories, one list per tracked loss term
        self.train_loss = {'KL_latent': [], 'BCE': [], 'Loss': [],
                           'MSE': [], 'KL_output': [], 'tMSE': [],
                           'wMSE': []}
        self.test_loss = {'KL_latent': [], 'BCE': [], 'Loss': [],
                          'MSE': [], 'KL_output': [], 'tMSE': [],
                          'wMSE': []}
        self.num_steps = 0
        self.print_every = print_every
        self.beta = beta

    def train(self, train_loader, test_loader, epochs, data_ex, save=True, early_stop=False):
        """Run the (currently skeletal) training loop for ``epochs`` epochs.

        Parameters
        ----------
        train_loader, test_loader
            Data loaders; not consumed yet by the loop below.
        epochs : int
            Number of epochs; epochs are numbered from 1.
        data_ex, save
            Accepted but not used yet.
        early_stop : bool
            If True an ``EarlyStopping`` helper is created (not consulted yet).
        """
        # hold samples, real and generated, for initial plotting
        if early_stop:
            early_stopping = EarlyStopping(patience=10, min_delta=.01, verbose=True)

        # train for n number of epochs
        time_start = datetime.datetime.now()
        for epoch in range(1, epochs + 1):
            e_time = datetime.datetime.now()
            print('##'*20)
            print("\nEpoch {}".format(epoch))
            print("beta: %.2f" % self._beta_scheduler(epoch))

    def _beta_scheduler(self, epoch, beta0=0., step=50, gamma=0.1):
        """Scheduler for the beta value.

        With ``self.beta == 'step'`` the schedule is a step function that adds
        ``gamma`` to ``beta0`` once every ``step`` epochs:
        ``beta0 + gamma * (epoch // step)``. Otherwise ``self.beta`` is returned
        as a constant float.

        Parameters
        ----------
        epoch : int
            epoch value
        beta0 : float
            starting beta value
        step  : int
            epoch step for update
        gamma : float
            additive increment applied at every step

        Returns
        -------
        beta
            beta value
        """

        if self.beta == 'step':
            return beta0 + gamma * (epoch // step)
        else:
            return float(self.beta)
        
    