# Prepare train/test datasets

In [5]:
import pandas as pd
import os.path
import numpy as np

In [6]:
AFGRUNDIR = "/media/vsevolod/T7/work/prj_kn_afterglow/"
sim = {}; sim["name"] = "SFHoTim276_13_14_0025_150mstg_B0_HLLC"
collated_file_path = AFGRUNDIR + sim["name"] + '/' + "collated.csv"

assert os.path.isfile(collated_file_path), "Collated file not found"
df = pd.read_csv(collated_file_path, index_col=0)
print(f"File loaded: {collated_file_path} {print(df.info(memory_usage='deep'))}")

<class 'pandas.core.frame.DataFrame'>
Index: 6480000 entries, 0 to 6479999
Data columns (total 9 columns):
 #   Column     Dtype  
---  ------     -----  
 0   eps_e      float64
 1   eps_b      float64
 2   eps_t      float64
 3   p          float64
 4   theta_obs  float64
 5   n_ism      float64
 6   freq       float64
 7   time       float64
 8   flux       float64
dtypes: float64(9)
memory usage: 494.4 MB
None
File loaded: /media/vsevolod/T7/work/prj_kn_afterglow/SFHoTim276_13_14_0025_150mstg_B0_HLLC/collated.csv None


In [7]:
target = "flux"

In [8]:
def _visualize_df(df:pd.DataFrame, name:str):
    print(f"\t> Visualizing {name} Shape: {df.shape}")

    display(df.head(2))

    print(f"\t Duplicated_rows: {df.duplicated().sum()}")

    # check df properties
    def analyze_df(df : pd.DataFrame)->pd.DataFrame:
        res = pd.DataFrame({
            "is_unique": df.nunique() == len(df),
            "unique": df.nunique(),
            "with_nan":df.isna().any(),
            "percent_nan":round((df.isnull().sum()/len(df))*100,4),
            "min":df.min(),
            "max":df.max(),
            "mean":df.mean(),
            "dtype":df.dtypes
        })
        return res
    print(f"\t> Numeric features: {df.select_dtypes(exclude='object').shape[1]} \n"
          f"{df.select_dtypes(exclude='object').keys()}")
    print(f"\t> Object features: {df.select_dtypes(exclude='number').shape[1]} \n"
          f"{df.select_dtypes(exclude='number').keys()}")
    print(f"\t Analyzing {name} Summary:")
    metadata = analyze_df(df=df)
    return metadata
metadata = _visualize_df(df=df, name=sim["name"])
display(metadata)

	> Visualizing SFHoTim276_13_14_0025_150mstg_B0_HLLC Shape: (6480000, 9)


Unnamed: 0,eps_e,eps_b,eps_t,p,theta_obs,n_ism,freq,time,flux
0,0.001,0.001,0.01,2.2,0.0,0.001,2400000000.0,100000.0,7.278929e-11
1,0.001,0.001,0.01,2.2,0.0,0.001,2400000000.0,106332.657164,8.460537e-11


	 Duplicated_rows: 0
	> Numeric features: 9 
Index(['eps_e', 'eps_b', 'eps_t', 'p', 'theta_obs', 'n_ism', 'freq', 'time',
       'flux'],
      dtype='object')
	> Object features: 0 
Index([], dtype='object')
	 Analyzing SFHoTim276_13_14_0025_150mstg_B0_HLLC Summary:


Unnamed: 0,is_unique,unique,with_nan,percent_nan,min,max,mean,dtype
eps_e,False,5,False,0.0,0.001,0.5,0.1322,float64
eps_b,False,5,False,0.0,0.001,0.5,0.1322,float64
eps_t,False,4,False,0.0,0.01,1.0,0.4025,float64
p,False,4,False,0.0,2.2,2.8,2.5,float64
theta_obs,False,3,False,0.0,0.0,1.570796,0.7853982,float64
n_ism,False,6,False,0.0,0.001,1.0,0.2768333,float64
freq,False,6,False,0.0,2400000000.0,93000000000.0,31233330000.0,float64
time,False,150,False,0.0,100000.0,940444900.0,105263900.0,float64
flux,True,6480000,False,0.0,4.115669e-13,101.5367,0.1489473,float64


# Select and tansform features

In [9]:
# Set target
metadata["target"] = "flux"

In [10]:
# Print total number of lightcurves
n_curves = np.prod([metadata["unique"][key] for key in df.columns if key not in ["flux","time"]])
n_times = metadata["unique"]["time"]
print(f"total number of light curves: {n_curves} times: {n_times}")

total number of light curves: 43200 times: 150


In [11]:
unique_times = np.array([10, 20, 30])
physical_parameters = np.array([1,2,3,4,5])
all_data_input = np.hstack((
    np.repeat(physical_parameters.reshape(1, -1), len(unique_times), axis=0),
    unique_times.reshape(-1, 1)
))
print(all_data_input.shape)
print(all_data_input)

(3, 6)
[[ 1  2  3  4  5 10]
 [ 1  2  3  4  5 20]
 [ 1  2  3  4  5 30]]


In [12]:
print(metadata["target"])

eps_e        flux
eps_b        flux
eps_t        flux
p            flux
theta_obs    flux
n_ism        flux
freq         flux
time         flux
flux         flux
Name: target, dtype: object


In [25]:
grouped=df.groupby(["eps_e","eps_t","eps_b","n_ism","theta_obs","freq"])
print(df.unique())

AttributeError: 'DataFrame' object has no attribute 'unique'

In [13]:
def LcCollatedDataFrameToNumpyArray(df:pd.DataFrame, metadata:pd.DataFrame,target="flux",time="time"):

    features_names = [col for col in list(df.columns) if col not in [target,time]]
    print(f"Target name: '{target}' features_names: {features_names}")
    
    n_curves = np.prod([metadata["unique"][key] for key in df.columns if key not in ["flux","time"]])
    n_times = metadata["unique"]["time"]
    print(f"Total number of light curves: {n_curves} times: {n_times}")
    
    lcs = []#np.empty(n_curves)
    pars = []#np.empty(n_curves)

    grouped = df.groupby(features_names)
    
    
    
    tmp = {}
    for (i, f) in enumerate(features_names):
        unique_vals = df[f].unique()
        tmp[f] = unique_vals
    
    
    
    
        for (j, val) in enumerate(df[f].unique()):
            print("f={} val={}".format(f, val))
            
            
        
    
LcCollatedDataFrameToNumpyArray(df, metadata)    

Target name: 'flux' features_names: ['eps_e', 'eps_b', 'eps_t', 'p', 'theta_obs', 'n_ism', 'freq']
Total number of light curves: 43200 times: 150
f=eps_e val=0.001
f=eps_e val=0.01
f=eps_e val=0.05
f=eps_e val=0.1
f=eps_e val=0.5


0

In [14]:
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.utils.data.sampler import SubsetRandomSampler

In [None]:
# X_train = [i_train_example, ] Y_train = [ flux[t] ]

In [None]:
class LightCurveDataset(Dataset):
    def __init__(self,pars:np.ndarray, lcs:np.ndarray, times:np.ndarray):
        self.pars = np.array(pars)
        self.lcs = np.array(lcs)
        assert self.pars.shape[0] == self.lcs.shape[0], "size mismatch between lcs and pars"
        self.times = times
        self.len = len(lcs)
        
    def __getitem__(self, index):
        """ returns image/lc, vars(params) """
        return (self.lcs[index], self.pars[index])
    
    def __len__(self):
        return len(self.lcs)
    
    def get_dataloader(self, batch_size=32, test_split=0.2):
        dataset_size = len(self)
        indices = list(range(dataset_size))
        split = int(np.floor(test_split * dataset_size))
        np.random.shuffle(indices)
        train_indices, test_indices = indices[split:], indices[:split]

        # Creating PT data samplers and loaders:
        train_sampler = SubsetRandomSampler(train_indices)
        test_sampler = SubsetRandomSampler(test_indices)

        train_loader = DataLoader(self, batch_size=batch_size,
                                  sampler=train_sampler, drop_last=False)
        test_loader = DataLoader(self, batch_size=batch_size,
                                 sampler=test_sampler, drop_last=False)
        
        return train_loader, test_loader