In [1]:
import numpy as np
import h5py as h5
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn import metrics

In [2]:
# Load the three pickled splits. In every frame the last column is taken as
# the target and all preceding columns as the features; both are converted to
# NumPy arrays.
train_data = pd.read_pickle("./train_data.pkl")
X_train = train_data.iloc[:, :-1].to_numpy()
y_train = train_data.iloc[:, -1].to_numpy()

validation_data = pd.read_pickle("./validation_data.pkl")
X_valid = validation_data.iloc[:, :-1].to_numpy()
y_valid = validation_data.iloc[:, -1].to_numpy()

test_data = pd.read_pickle("./test_data.pkl")
X_test = test_data.iloc[:, :-1].to_numpy()
y_test = test_data.iloc[:, -1].to_numpy()

In [3]:
# Read the raw value matrix, the column labels and the row labels of the
# 'input' and 'output' groups. `[()]` pulls each dataset fully into memory so
# the arrays remain usable after the file is closed.
with h5.File('../qlk_jetexp_nn_training_database_minimal.h5', "r") as f:
    input_group = f['input']
    inputs = input_group['block0_values'][()]
    input_names = input_group['block0_items'][()]
    # Row number from 0 to len(inputs).
    index_inp = input_group['axis1'][()]

    output_group = f['output']
    outputs = output_group['block0_values'][()]
    output_names = output_group['block0_items'][()]
    # Row number from 0 to len(inputs), with some rows missing.
    index_out = output_group['axis1'][()]

In [4]:
# Row labels present in BOTH the input and the output tables (sorted, unique).
good_idx = np.intersect1d(ar1=index_inp, ar2=index_out)

In [5]:
# Share of input rows that also have a matching output row, in percent.
percent_good = 100 * (len(good_idx) / len(index_inp))
print(f'Percentage of good inputs: {percent_good}')

Percentage of good inputs: 66.15013647273015


In [6]:
# Wrap the raw arrays in labelled frames and keep only the rows whose label
# appears in both tables, so df_in and df_out end up row-aligned.
df_in = pd.DataFrame(data=inputs, index=index_inp, columns=input_names).loc[good_idx]

df_out = pd.DataFrame(data=outputs, index=index_out, columns=output_names).loc[good_idx]

In [7]:
# Peek at the first five output rows (column labels as stored in the file).
df_out.head(n=5)

Unnamed: 0,b'R0',b'efeETG_GB',b'efeTEM_GB',b'efiITG_GB',b'efeITG_GB_div_efiITG_GB',b'eflITG_GB_div_efiITG_GB',b'efhITG_GB_div_efiITG_GB',b'pfeITG_GB_div_efiITG_GB',b'pfiITG_GB_div_efiITG_GB',b'pflITG_GB_div_efiITG_GB',...,b'dfhTEM_GB_div_efeTEM_GB',b'vceTEM_GB_div_efeTEM_GB',b'vciTEM_GB_div_efeTEM_GB',b'vclTEM_GB_div_efeTEM_GB',b'vchTEM_GB_div_efeTEM_GB',b'vtlTEM_GB_div_efeTEM_GB',b'vthTEM_GB_div_efeTEM_GB',b'vriTEM_GB_div_efeTEM_GB',b'vrlTEM_GB_div_efeTEM_GB',b'vrhTEM_GB_div_efeTEM_GB'
0,2.903508,0.0,0.0,0.0,,,,,,,...,,,,,,,,,,
1,2.903508,0.0,0.0,0.0,,,,,,,...,,,,,,,,,,
2,2.903508,0.0,0.0,0.0,,,,,,,...,,,,,,,,,,
3,2.903508,0.0,0.0,0.0,,,,,,,...,,,,,,,,,,
4,2.903508,0.0,0.0,0.0,,,,,,,...,,,,,,,,,,


In [8]:
def _normalise_column_name(name):
    """Return ``name`` as a lower-case ``str``.

    ``bytes`` labels are decoded as UTF-8 first. Labels that are already
    ``str`` are only lower-cased, so applying this twice is harmless.
    """
    if isinstance(name, bytes):
        name = name.decode("utf-8")
    return str(name).lower()


# Normalise the output column labels to lower-case text. The frame is still
# modified in place (same object as before), but because the helper accepts
# both bytes and str the cell can now be re-run without raising
# AttributeError on the already-decoded labels.
df_out.rename(columns=_normalise_column_name, inplace=True)

In [9]:
# Peek at the first five output rows again to check the column labels.
df_out.head(n=5)

Unnamed: 0,r0,efeetg_gb,efetem_gb,efiitg_gb,efeitg_gb_div_efiitg_gb,eflitg_gb_div_efiitg_gb,efhitg_gb_div_efiitg_gb,pfeitg_gb_div_efiitg_gb,pfiitg_gb_div_efiitg_gb,pflitg_gb_div_efiitg_gb,...,dfhtem_gb_div_efetem_gb,vcetem_gb_div_efetem_gb,vcitem_gb_div_efetem_gb,vcltem_gb_div_efetem_gb,vchtem_gb_div_efetem_gb,vtltem_gb_div_efetem_gb,vthtem_gb_div_efetem_gb,vritem_gb_div_efetem_gb,vrltem_gb_div_efetem_gb,vrhtem_gb_div_efetem_gb
0,2.903508,0.0,0.0,0.0,,,,,,,...,,,,,,,,,,
1,2.903508,0.0,0.0,0.0,,,,,,,...,,,,,,,,,,
2,2.903508,0.0,0.0,0.0,,,,,,,...,,,,,,,,,,
3,2.903508,0.0,0.0,0.0,,,,,,,...,,,,,,,,,,
4,2.903508,0.0,0.0,0.0,,,,,,,...,,,,,,,,,,


In [10]:
# Output columns selected as targets, in alphabetical order. The names must
# match the lower-cased column labels of df_out.
keys_used = [
    'dfeitg_gb_div_efiitg_gb',
    'dfetem_gb_div_efetem_gb',
    'dfiitg_gb_div_efiitg_gb',
    'dfitem_gb_div_efetem_gb',
    'efeetg_gb',
    'efeitg_gb_div_efiitg_gb',
    'efetem_gb',
    'efiitg_gb',
    'efitem_gb_div_efetem_gb',
    'pfeitg_gb_div_efiitg_gb',
    'pfetem_gb_div_efetem_gb',
    'pfiitg_gb_div_efiitg_gb',
    'pfitem_gb_div_efetem_gb',
    'vceitg_gb_div_efiitg_gb',
    'vcetem_gb_div_efetem_gb',
    'vciitg_gb_div_efiitg_gb',
    'vcitem_gb_div_efetem_gb',
    'vfiitg_gb_div_efiitg_gb',
    'vfitem_gb_div_efetem_gb',
    'vriitg_gb_div_efiitg_gb',
    'vritem_gb_div_efetem_gb',
    'vteitg_gb_div_efiitg_gb',
    'vtiitg_gb_div_efiitg_gb',
]


# Feature columns are exactly the columns of the input frame, as a plain list.
train_columns = df_in.columns.tolist()

In [11]:
# Display the feature-column labels taken from df_in.
train_columns

[b'Ane',
 b'Ate',
 b'Autor',
 b'Machtor',
 b'x',
 b'Zeff',
 b'gammaE',
 b'q',
 b'smag',
 b'alpha',
 b'Ani1',
 b'Ati0',
 b'normni1',
 b'Ti_Te0',
 b'logNustar']

In [12]:
# Restrict the output frame to the selected target columns (all rows kept).
df_used = df_out.loc[:, keys_used]

df_used.head(n=5)

Unnamed: 0,dfeitg_gb_div_efiitg_gb,dfetem_gb_div_efetem_gb,dfiitg_gb_div_efiitg_gb,dfitem_gb_div_efetem_gb,efeetg_gb,efeitg_gb_div_efiitg_gb,efetem_gb,efiitg_gb,efitem_gb_div_efetem_gb,pfeitg_gb_div_efiitg_gb,...,vceitg_gb_div_efiitg_gb,vcetem_gb_div_efetem_gb,vciitg_gb_div_efiitg_gb,vcitem_gb_div_efetem_gb,vfiitg_gb_div_efiitg_gb,vfitem_gb_div_efetem_gb,vriitg_gb_div_efiitg_gb,vritem_gb_div_efetem_gb,vteitg_gb_div_efiitg_gb,vtiitg_gb_div_efiitg_gb
0,,,,,0.0,,0.0,0.0,,,...,,,,,,,,,,
1,,,,,0.0,,0.0,0.0,,,...,,,,,,,,,,
2,,,,,0.0,,0.0,0.0,,,...,,,,,,,,,,
3,,,,,0.0,,0.0,0.0,,,...,,,,,,,,,,
4,,,,,0.0,,0.0,0.0,,,...,,,,,,,,,,


In [13]:
# First split: 80 % of the rows for training, 20 % held back.
X_train, X_temp, y_train, y_temp = train_test_split(
    df_in,
    df_used,
    test_size=0.2,
    random_state=42,
)

# Second split: the held-back rows are halved into validation and test sets,
# i.e. 10 % of all rows each.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [14]:
# Put features and targets of each split side by side in one frame.
train_df, valid_df, test_df = (
    pd.concat(pair, axis=1)
    for pair in ([X_train, y_train], [X_valid, y_valid], [X_test, y_test])
)

# Persist the three combined frames next to the notebook.
for frame, destination in (
    (train_df, "./QLKNN_train_data.pkl"),
    (valid_df, "./QLKNN_validation_data.pkl"),
    (test_df, "./QLKNN_test_data.pkl"),
):
    frame.to_pickle(destination)

In [15]:
# Read the three combined frames back from the pickles written by this notebook.
train_data, validation_data, test_data = (
    pd.read_pickle(source)
    for source in (
        "./QLKNN_train_data.pkl",
        "./QLKNN_validation_data.pkl",
        "./QLKNN_test_data.pkl",
    )
)

In [16]:
# Draw a small subset of the training frame for quick experiments.
# - random_state is fixed (same seed as the train/valid/test split) so a
#   Restart & Run All picks the same rows every time.
# - the size is capped at the frame length so a short frame does not make
#   sample() raise.
small = train_data.sample(n=min(100, len(train_data)), random_state=42)

# The feature frame is the same for every target, so build it once.
X_train = small[train_columns]

# One single-column target per iteration; the column name is printed as a
# check that every entry of keys_used exists in the frame.
for target in keys_used:
    y_train = small[target]
    print(y_train.name)

dfeitg_gb_div_efiitg_gb
dfetem_gb_div_efetem_gb
dfiitg_gb_div_efiitg_gb
dfitem_gb_div_efetem_gb
efeetg_gb
efeitg_gb_div_efiitg_gb
efetem_gb
efiitg_gb
efitem_gb_div_efetem_gb
pfeitg_gb_div_efiitg_gb
pfetem_gb_div_efetem_gb
pfiitg_gb_div_efiitg_gb
pfitem_gb_div_efetem_gb
vceitg_gb_div_efiitg_gb
vcetem_gb_div_efetem_gb
vciitg_gb_div_efiitg_gb
vcitem_gb_div_efetem_gb
vfiitg_gb_div_efiitg_gb
vfitem_gb_div_efetem_gb
vriitg_gb_div_efiitg_gb
vritem_gb_div_efetem_gb
vteitg_gb_div_efiitg_gb
vtiitg_gb_div_efiitg_gb
