In [None]:
import os
from urllib.request import urlretrieve

from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns

# Switch on seaborn's default plot styling; it applies to the matplotlib
# figures drawn in the cells below.
sns.set()

### Retrieving and loading the dataset

In [None]:
# Fetch the three dataset archives unless they are already present locally.
# NOTE: the variable name `simultaion_name` (sic) is kept as-is because the
# loading cell further down reads it.
simultaion_name = 'qgs_gm_pr_v2'
files_to_download = [f'{simultaion_name}_matrices.npz', f'{simultaion_name}_features.npz', f'{simultaion_name}_true_features.npz']
for filename in files_to_download:
    if os.path.exists(filename):
        continue  # already downloaded on a previous run
    print(f'Downloading {filename}... ', end='', flush=True)
    # Download to a temporary name first: if the transfer is interrupted, no
    # truncated file is left under the final name, so the existence check
    # above cannot mistake a partial download for a complete one.
    partial_path = f'{filename}.part'
    try:
        urlretrieve(f'https://kascade-sim-data.s3.eu-central-1.amazonaws.com/{filename}', partial_path)
        os.replace(partial_path, filename)
    finally:
        # Only true when the download failed before the rename.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print('Done!')

In [None]:
# Read one array out of each .npz archive.
# np.load on an .npz returns an NpzFile that keeps the underlying file open;
# using it as a context manager closes the handle as soon as the array has
# been read into memory.
with np.load(f'{simultaion_name}_matrices.npz') as archive:
    matrices = archive['matrices']
with np.load(f'{simultaion_name}_features.npz') as archive:
    features = archive['features']
with np.load(f'{simultaion_name}_true_features.npz') as archive:
    true_features = archive['true_features']

In [None]:
# Shapes of the three loaded arrays, displayed together as a single tuple.
tuple(array.shape for array in (matrices, features, true_features))

### Data exploration

For all the details, refer to the [KCDC manual](http://kcdc.ikp.kit.edu/static/pdf/kcdc_mainpage/kcdc-Manual.pdf) (starting from page 45).

In [None]:
# Labels for the columns of `features`; position i labels column i in the
# histogram loop below.
f = [
    'part_type',
    'E',
    'Xc',
    'Yc',
    'core_dist',
    'Ze',
    'Az',
    'Ne',
    'Nmu',
    'Age',
]

# Labels for the columns of `true_features`, used the same way.
tf = [
    'E',
    'part_type',
    'Xc',
    'Yc',
    'Ze',
    'Az',
    'Ne',
    'Np',
    'Nmu',
    'Nh',
]

In [None]:
# Peek at the first entry of `features`.
# NOTE(review): presumably one value per label in `f`, in the same order — confirm.
features[0]

In [None]:
# Draw a separate histogram figure for every labelled column of `features`.
for column, label in enumerate(f):
    plt.hist(features[:, column], label=label)
    plt.legend()
    plt.show()

In [None]:
# Peek at the first entry of `true_features`.
# NOTE(review): presumably one value per label in `tf`, in the same order — confirm.
true_features[0]

In [None]:
# Draw a separate histogram figure for each labelled column of
# `true_features`, leaving out the two columns listed below.
skipped_columns = (7, 9)  # these fields were excluded from simulations
for column, label in enumerate(tf):
    if column not in skipped_columns:
        plt.hist(true_features[:, column], label=label)
        plt.legend()
        plt.show()

#### Matrices
For each recorded event, there are three 16x16 matrices:
1. arrival times per station (ns)
2. e/$\gamma$ energy deposit per station (MeV)
3. $\mu$ energy deposit per station (MeV)

#### Features
A list of the reconstructed properties of each event:
1. **Particle** - particle type (gamma quantum, i.e. photon, or proton)
2. **Energy** - first order energy (logarithm of eV)
3. **Core Position X** - location of the reconstructed shower core x-position (meters)
4. **Core Position Y** - location of the reconstructed shower core y-position (meters)
5. **Core Distance** - L2 (Euclidean) norm of the two core-position coordinates above, $\sqrt{X_c^2 + Y_c^2}$ (meters)
6. **Zenith Angle** - reconstructed zenith angle with respect to the vertical (degrees)
7. **Azimuth Angle** - reconstructed azimuth angle with respect to the north (degrees)
8. **Electron Number** - reconstructed number of electrons (logarithm of the number)
9. **Muon Number** - reconstructed number of muons (logarithm of the number)
10. **Age** - shower shape parameter

In [None]:
# Sum the index-0 slice of the last axis over the first axis and plot it.
# NOTE(review): presumably the first axis is events and index 0 is arrival times — confirm.
channel0_sum = matrices[..., 0].sum(axis=0)
sns.heatmap(channel0_sum)

In [None]:
# Sum the index-1 slice of the last axis over the first axis and plot it.
# NOTE(review): presumably the first axis is events and index 1 is the e/gamma deposit — confirm.
channel1_sum = matrices[..., 1].sum(axis=0)
sns.heatmap(channel1_sum)

In [None]:
# Sum the index-2 slice of the last axis over the first axis and plot it.
# NOTE(review): presumably the first axis is events and index 2 is the muon deposit — confirm.
channel2_sum = matrices[..., 2].sum(axis=0)
sns.heatmap(channel2_sum)

In [None]:
# Plot the index-1 slice of the last axis for the entry at position 3.
# NOTE(review): presumably a single event's e/gamma deposit matrix — confirm.
entry_3 = matrices[3]
sns.heatmap(entry_3[..., 1])

## True data

Refer to the [KCDC Simulation Manual](https://kcdc.iap.kit.edu/static/pdf/kcdc_mainpage/kcdc-Simulation-Manual.pdf), page 71.

### Fields
1. **TrEP** - energy
2. **TrPP** - particle type
3. **TrXc** - Core X
4. **TrYc** - Core Y
5. **TrZe** - Zenith
6. **TrAz** - Azimuth
7. **TrNe** - Number of electrons
8. **TrNg** - Number of photons
9. **TrNm** - Number of muons
10. **TrNh** - Number of hadrons