# Distribution of property values in simulation data

- distributions of labeled events
- accuracy of estimation of line orientation

## Imports
(section not in the report)

In [1]:
import sys
import os
import subprocess
import re
import numpy as np
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as psql
import getpass
import matplotlib as mpl
import argparse
import glob
import traceback
import hashlib
import math
import collections
import functools
import itertools

mpl.rcParams['figure.dpi'] = 80

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Root directory of the pattern-recognition application whose modules are imported below.
app_base_dir = '/home/spbproc/euso-spb-patt-reco-v1'
# Append only when missing, so re-running the cell does not add duplicate sys.path entries.
if app_base_dir not in sys.path:
    sys.path.append(app_base_dir)

import event_processing_v3
import event_processing_v4
import postgresql_v3_event_storage
import dataset_query_functions_v3

import tool.acqconv
# NOTE(review): star imports -- helpers called later without a module prefix
# (print_columns_dict, add_classification_columns,
# extend_simu_signal_by_signals_pathname) presumably come from these two
# modules; confirm.
from data_analysis_utils import *
from data_analysis_utils_dataframes import *
# import supervised_classification as supc
from utility_funtions import key_vals2val_keys

Welcome to JupyROOT 6.13/02


In [3]:
# import sklearn.preprocessing
# import sklearn.feature_selection
# import sklearn.ensemble 
# # import sklearn.neural_network
# import sklearn.discriminant_analysis
# import sklearn.model_selection
# import sklearn.metrics
# import sklearn.pipeline
import physt
from sklearn.externals import joblib

## Data selection

In [4]:
# Output directory of this notebook and its nested figures directory.
# exist_ok=True keeps the cell safe to re-run.
data_snippets_dir = 'ver4_simu_distributions_visible_events_distributions_w_simu_signal'
figures_dir = os.path.join(data_snippets_dir, 'figures')
os.makedirs(data_snippets_dir, exist_ok=True)
os.makedirs(figures_dir, exist_ok=True)

In [5]:
# Storage provider for the simu-signal dataset, configured from
# config_simu_signal.ini and built with no_connection=True.
event_processing_cls = event_processing_v4.EventProcessingV4
event_v3_storage_provider_simu_signal = dataset_query_functions_v3.build_event_v3_storage_provider(
    table_names_version='ver4',
    event_processing_class=event_processing_cls,
    event_storage_class=postgresql_v3_event_storage.PostgreSqlEventV3StorageProvider,
    event_storage_provider_config_file=os.path.join(app_base_dir, 'config_simu_signal.ini'),
    no_connection=True,
)

# Query-building helper bound to the simu-signal provider.
query_functions_simu_signal = dataset_query_functions_v3.Ver3DatasetQueryFunctions(
    event_v3_storage_provider_simu_signal
)

In [6]:
# Storage provider for the simulated (flatmap) dataset, configured from
# config_simu_w_flatmap.ini; its `connection` is used for the reads below.
event_processing_cls = event_processing_v4.EventProcessingV4
event_v3_storage_provider_simu = dataset_query_functions_v3.build_event_v3_storage_provider(
    table_names_version='ver4',
    event_processing_class=event_processing_cls,
    event_storage_class=postgresql_v3_event_storage.PostgreSqlEventV3StorageProvider,
    event_storage_provider_config_file=os.path.join(app_base_dir, 'config_simu_w_flatmap.ini'),
)

# Query-building helper bound to the simu provider.
query_functions_simu = dataset_query_functions_v3.Ver3DatasetQueryFunctions(
    event_v3_storage_provider_simu
)

### Selected columns

Unlike a machine learning approach that is trained directly on pixels and learns to identify important features as part of the learning (for example, a convolutional neural network), this approach depends on a set of preselected features. Its possible advantage is that there is no need to discover the features during training and, after the feature extraction, the training is faster.

One of the sources of possible bias in the analysis might be initial selection of features that are analyzed by feature elimination methods.

For this experiment selected features include:
- number of triggered pixels (`trg_count_nonzero`),
- some properties describing the background frames and background frames projection,
- similarly for all frames of an event
- information about line orientations in projections of a shower
- information about the precision of the estimated orientation of a shower
- ...

In [7]:
# Regex selectors for the columns loaded for every simulated dataset.
# An entry is either a (table regex, column regex) pair or a single regex
# string; how each form is matched is decided by
# get_columns_for_classification_dict__by_excluding, which consumes this list.
# Patterns are raw strings so that `\d` is a regex digit class rather than an
# invalid string escape sequence (a SyntaxWarning on recent Python versions).
common_included_columns_re_list = [
    (r'^$', r'source_file_(acquisition|trigger)(_full)?|global_gtu|packet_id|gtu_in_packet|event_id|num_gtu'),
    # (r'^trg(_box_per_gtu|_pmt_per_gtu|_ec_per_gtu)?$', r'^(count_nonzero|min|max|sum|mean)$'),

    (r'^bg(_x_y)?$', r'^(mean_gz|mean|max|min|count_nonzero|sum|size)$'),

    (r'^orig(_x_y)?$', r'^(count_nonzero|max|mean|mean_gz|sum|size)$'),

    r'(proc\d|trg|alt\d)_(x|gtu)_(x|y)_hough_peak_thr[0-3]+_max_(peak|sum)_clu_major_line_(phi|rho)',
    r'(proc\d|trg|alt\d)_(x|gtu)_(x|y)_hough_peak_thr[0-3]+_major_line_(phi|rho)',
    r'(proc\d|trg|alt\d)_(x|gtu)_(x|y)_hough_peak_thr[0-3]+_line_clusters_((max_(peak|size|sum|area)_clu_(height|width|size))|count|sizes_max|clu_(widths|heights|areas)_max)',

    # (r'(proc\d|trg|alt\d)_(gtu|x)_[yx]_clusters', (r'^(count|sizes_max|sizes_min|clu_areas_max|max_(size|peak)_clu_(width|height|size))$')),
    # (r'^proc\d_(x|gtu)_[yx]_hough_peak_thr3', r'major_line_coord_.*'),
]

In [8]:
# Regex selectors for the simu-signal columns: Hough-transform major-line
# phi/rho, both overall and for the max-peak / max-sum clusters.
# Raw strings keep `\d` a regex digit class instead of an invalid string escape.
simu_signal_included_columns_re_list = [
    r'(proc\d|trg|alt\d)_(x|gtu)_(x|y)_hough_peak_thr[0-3]+_max_(peak|sum)_clu_major_line_(phi|rho)',
    r'(proc\d|trg|alt\d)_(x|gtu)_(x|y)_hough_peak_thr[0-3]+_major_line_(phi|rho)',
]

#### List of columns of simu data tables used for analysis

In [9]:
# Exclude every column ('^.+$') with no default exclusions, and pass the
# common regex list as the included columns.
common_columns_for_analysis_dict = (
    query_functions_simu.get_columns_for_classification_dict__by_excluding(
        included_columns_re_list=common_included_columns_re_list,
        default_excluded_columns_re_list=[],
        excluded_columns_re_list=('^.+$',),
    )
)

# Show the selected columns.
print_columns_dict(common_columns_for_analysis_dict)

spb_processing_v4_simu_flatmap.event
	- event_id
	- source_file_acquisition_full
	- source_file_trigger_full
	- source_file_acquisition
	- source_file_trigger
	- global_gtu
	- packet_id
	- gtu_in_packet
	- num_gtu

spb_processing_v4_simu_flatmap.event_orig_x_y
	- count_nonzero
	- sum
	- max
	- mean
	- mean_gz
	- size

spb_processing_v4_simu_flatmap.event_bg_x_y
	- count_nonzero
	- sum
	- min
	- max
	- mean
	- mean_gz
	- size

spb_processing_v4_simu_flatmap.event_bg
	- count_nonzero
	- sum
	- min
	- max
	- mean
	- mean_gz
	- size

spb_processing_v4_simu_flatmap.event_orig
	- count_nonzero
	- sum
	- max
	- mean
	- mean_gz
	- size

spb_processing_v4_simu_flatmap.event_trg_x_y_hough_peak_thr1
	- major_line_phi
	- major_line_rho
	- max_peak_clu_major_line_phi
	- max_peak_clu_major_line_rho
	- max_sum_clu_major_line_phi
	- max_sum_clu_major_line_rho
	- line_clusters_count
	- line_clusters_sizes_max
	- line_clusters_clu_widths_max
	- line_clusters_clu_heights_max
	- line_clusters_clu_areas_ma

In [10]:
# Dataframe column names derived from the common columns dict.
common_df_columns = query_functions_simu.get_dataframe_columns_from_dict(
    common_columns_for_analysis_dict
)

In [11]:
# Same selection scheme as for the common columns, applied to the simu-signal
# query functions: exclude everything, then include by regex list.
simu_signal_columns_for_analysis_dict = (
    query_functions_simu_signal.get_columns_for_classification_dict__by_excluding(
        included_columns_re_list=simu_signal_included_columns_re_list,
        default_excluded_columns_re_list=[],
        excluded_columns_re_list=('^.+$',),
    )
)

# Show the selected columns.
print_columns_dict(simu_signal_columns_for_analysis_dict)

spb_processing_v4_simu_signal.event_trg_x_y_hough_peak_thr1
	- major_line_phi
	- major_line_rho
	- max_peak_clu_major_line_phi
	- max_peak_clu_major_line_rho
	- max_sum_clu_major_line_phi
	- max_sum_clu_major_line_rho

spb_processing_v4_simu_signal.event_trg_x_y_hough_peak_thr2
	- major_line_phi
	- major_line_rho
	- max_peak_clu_major_line_phi
	- max_peak_clu_major_line_rho
	- max_sum_clu_major_line_phi
	- max_sum_clu_major_line_rho

spb_processing_v4_simu_signal.event_trg_x_y_hough_peak_thr3
	- major_line_phi
	- major_line_rho
	- max_peak_clu_major_line_phi
	- max_peak_clu_major_line_rho
	- max_sum_clu_major_line_phi
	- max_sum_clu_major_line_rho

spb_processing_v4_simu_signal.event_trg_gtu_x_hough_peak_thr1
	- major_line_phi
	- major_line_rho
	- max_peak_clu_major_line_phi
	- max_peak_clu_major_line_rho
	- max_sum_clu_major_line_phi
	- max_sum_clu_major_line_rho

spb_processing_v4_simu_signal.event_trg_gtu_x_hough_peak_thr2
	- major_line_phi
	- major_line_rho
	- max_peak_clu_major_li

In [12]:
# Dataframe column names derived from the simu-signal columns dict.
# NOTE(review): the dict was built with query_functions_simu_signal, but the
# names are derived through query_functions_simu -- confirm this is intended.
simu_signal_df_columns = query_functions_simu.get_dataframe_columns_from_dict(
    simu_signal_columns_for_analysis_dict
)

### Data selection queries

#### Simu visible events (base)

All positive samples for the training are simulated shower tracks with background from the flight data (see notebook ver4_flatmap_visible_events). Events considered as positive samples have to contain a track signal (see the ver4_test_selection_visualization__simu_signal notebook) and have to be considered visible (see the ver4_flatmap_simu_visible_events notebook). 

Visibility of the event is decided by a rule that **there should be at least two frames of the event which  contain a signal pixel that is greater or equal to maximum background intensity in the frame**.

Additionally, there is a rule that the first trigger of a visible event should be in GTU $42\pm10$.

In [55]:
# not in the report

# Builds the SQL query that selects the visible simulated events, joined with
# the simu-event tables and with the simu-signal tables of the other schema.

signal_event_storage = query_functions_simu_signal.event_storage
signal_schema = signal_event_storage.database_schema_name
signal_data_table_name = signal_event_storage.data_table_name

# SELECT parts and the tables they need, for the common and the signal columns.
common_select_clause_str, common_tables_list = query_functions_simu.get_query_clauses__select(
    common_columns_for_analysis_dict
)
simu_signal_select_clause_str, simu_signal_tables_list = (
    query_functions_simu_signal.get_query_clauses__select(
        simu_signal_columns_for_analysis_dict, col_prefix='signal_'
    )
)

# Replace the base table of every signal join with the signal data table.
simu_signal_tables_list = [
    (joined_table, signal_data_table_name, on_column)
    for joined_table, _, on_column in simu_signal_tables_list
]

# WHERE part: gtu_in_packet window and number of frames with signal >= background.
simu_where_clauses_str, simu_tables_list = query_functions_simu.get_query_clauses__where_simu(
    gtu_in_packet_distacne=(40, 10),
    num_frames_signals_ge_bg__ge=2,
    num_frames_signals_ge_bg__le=999,
    simu_event_relation_table_name='{database_schema_name}.simu_event_uniq_relation',
    simu_event_table_name='{database_schema_name}.simu_event_uniq',
    simu_event_additional_table_name='{database_schema_name}.simu_event_uniq_additional',
)

# Columns taken directly from the simu_event_uniq table; the doubled braces
# keep the schema placeholder in the result for later substitution.
simu_event_attrs = [
    'simu2npy_pathname', 'edetector_numphotons', 'edetector_numcellhits', 'edetector_numfee',
    'eptttrigger_fnumtrigg',
    'etruth_trueenergy', 'etruth_truetheta', 'etruth_truephi', 'etruth_truephilocal',
    'egeometry_pos_z',
    'etruth_trueshowermaxpos_x', 'etruth_trueshowermaxpos_y', 'etruth_trueshowermaxpos_z',
]
simu_event_select_clause_str = ', '.join(
    '{{database_schema_name}}.simu_event_uniq.{attr}'.format(attr=attr)
    for attr in simu_event_attrs
)

# Columns taken from the simu_event_uniq_additional table.
simu_event_additional_attrs = [
    'num_frames_counts_gt_bg', 'num_frames_signals_gt_bg', 'num_frames_signals_ge_bg',
]
simu_event_additional_select_clause_str = ', '.join(
    '{{database_schema_name}}.simu_event_uniq_additional.{}'.format(attr)
    for attr in simu_event_additional_attrs
)

joined_select_clause_str = ', '.join([
    common_select_clause_str,
    simu_event_select_clause_str,
    simu_event_additional_select_clause_str,
    simu_signal_select_clause_str,
])

# Join entries are (joined table, base table, join column) triples.
base_simu_tables_list = [
    ('{database_schema_name}.simu_event_uniq_relation', '{data_table_name}', 'event_id'),
    ('{database_schema_name}.simu_event_uniq', '{database_schema_name}.simu_event_uniq_relation', 'simu_event_uniq_id'),
    ('{database_schema_name}.simu_event_uniq_additional', '{database_schema_name}.simu_event_uniq_relation', 'relation_id'),
]

joined_simu_tables_list = base_simu_tables_list + simu_tables_list

# Link into the signal schema through its relation table, then its event table.
signal_relation_table = signal_schema + '.simu_event_uniq_relation_simu_flatmap'
joined_simu_signal_tables_list = [
    (signal_relation_table, '{database_schema_name}.simu_event_uniq', 'simu_event_uniq_id'),
    (signal_schema + '.event', signal_relation_table, 'event_id'),
] + simu_signal_tables_list

joined_tables_list = common_tables_list + joined_simu_tables_list + joined_simu_signal_tables_list

join_clauses_str = query_functions_simu.get_query_clauses__join(joined_tables_list)

source_data_type_num = 3001

simu_events_selection_query = query_functions_simu.get_events_selection_query_plain(
    source_data_type_num=source_data_type_num,
    base_select='',
    select_additional=joined_select_clause_str,
    join_additional=join_clauses_str,
    where_additional=simu_where_clauses_str,
    order_by='{data_table_name}.event_id',
    offset=0,
    limit=350000,
)

# # etruth_truephilocal ?

print(simu_events_selection_query)

# '''
# SELECT 
# sfe.event_id, seu.simu_event_uniq_id, ssseursf.event_id, sfe.source_file_acquisition, seu.simu2npy_pathname, sse.source_file_acquisition  
# FROM spb_processing_v4_simu_flatmap.event AS sfe 
# JOIN spb_processing_v4_simu_flatmap.simu_event_uniq_relation AS seur ON seur.event_id = sfe.event_id  
# JOIN spb_processing_v4_simu_signal.simu_event_uniq_relation_simu_flatmap AS ssseursf ON ssseursf.simu_event_uniq_id = seur.simu_event_uniq_id 
# JOIN spb_processing_v4_simu_flatmap.simu_event_uniq AS seu ON seu.simu_event_uniq_id = seur.simu_event_uniq_id 
# JOIN spb_processing_v4_simu_signal.event AS sse ON sse.event_id = ssseursf.event_id  
# ORDER BY sfe.event_id, seu.simu_event_uniq_id ASC  
# LIMIT 50 
# ''';


    SELECT 
        
        spb_processing_v4_simu_flatmap.event.event_id, spb_processing_v4_simu_flatmap.event.source_file_acquisition_full, spb_processing_v4_simu_flatmap.event.source_file_trigger_full, spb_processing_v4_simu_flatmap.event.source_file_acquisition, spb_processing_v4_simu_flatmap.event.source_file_trigger, spb_processing_v4_simu_flatmap.event.global_gtu, spb_processing_v4_simu_flatmap.event.packet_id, spb_processing_v4_simu_flatmap.event.gtu_in_packet, spb_processing_v4_simu_flatmap.event.num_gtu, spb_processing_v4_simu_flatmap.event_orig_x_y.count_nonzero AS orig_x_y_count_nonzero, spb_processing_v4_simu_flatmap.event_orig_x_y.sum AS orig_x_y_sum, spb_processing_v4_simu_flatmap.event_orig_x_y.max AS orig_x_y_max, spb_processing_v4_simu_flatmap.event_orig_x_y.mean AS orig_x_y_mean, spb_processing_v4_simu_flatmap.event_orig_x_y.mean_gz AS orig_x_y_mean_gz, spb_processing_v4_simu_flatmap.event_orig_x_y.size AS orig_x_y_size, spb_processing_v4_simu_flatmap.event_bg_x_y.

In [14]:
# Run the visible-events query on the simu provider's connection.
simu_df = pd.read_sql(
    sql=simu_events_selection_query,
    con=event_v3_storage_provider_simu.connection,
)

In [15]:
simu_df.head()

Unnamed: 0,event_id,source_file_acquisition_full,source_file_trigger_full,source_file_acquisition,source_file_trigger,global_gtu,packet_id,gtu_in_packet,num_gtu,orig_x_y_count_nonzero,...,proc1_gtu_y_hough_peak_thr2_max_peak_clu_major_line_phi,proc1_gtu_y_hough_peak_thr2_max_peak_clu_major_line_rho,proc1_gtu_y_hough_peak_thr2_max_sum_clu_major_line_phi,proc1_gtu_y_hough_peak_thr2_max_sum_clu_major_line_rho,proc1_gtu_y_hough_peak_thr3_major_line_phi,proc1_gtu_y_hough_peak_thr3_major_line_rho,proc1_gtu_y_hough_peak_thr3_max_peak_clu_major_line_phi,proc1_gtu_y_hough_peak_thr3_max_peak_clu_major_line_rho,proc1_gtu_y_hough_peak_thr3_max_sum_clu_major_line_phi,proc1_gtu_y_hough_peak_thr3_max_sum_clu_major_line_rho
0,11464,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,169,1,41,11,2290,...,0.057631,17.9287,0.057631,17.9287,6.21298,12.7209,6.21298,12.7209,6.21298,12.7209
1,11465,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,163,1,35,24,2290,...,5.96849,6.95928,5.96849,6.95928,5.98494,7.39136,5.98494,7.39136,5.98494,7.39136
2,11486,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,169,1,41,11,2290,...,0.057631,17.9287,0.057631,17.9287,6.21298,12.7209,6.21298,12.7209,6.21298,12.7209
3,11487,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,169,1,41,11,2290,...,0.057631,17.9287,0.057631,17.9287,6.21298,12.7209,6.21298,12.7209,6.21298,12.7209
4,11494,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,166,1,38,10,2289,...,6.01707,3.58116,6.01707,3.58116,6.00267,3.12955,6.00267,3.12955,6.00267,3.12955


#### Simu noise events

Simu noise events are events that are caused by a trigger well outside of GTU of shower injection into a packet. 

It is not ideal to use these events as samples of the dataset because of the way the background of these events is added to the signal. Simply, if there are fewer packets providing the background than simulated signal tracks, then the same event might be repeated multiple times in the dataset. 
Besides the repetition of a background packet, the background of a simulated event is created by repeating a sequence of background frames, so this might cause multiple events in the same packet. How often this situation happens has not been tested. It is not expected to be very typical.

A better method of constructing these events would improve the validity of this analysis.

In [17]:
# not in the report

# Query for the simu noise events. joined_select_clause_str and
# join_clauses_str are not rebuilt in this cell; the values already in scope
# are reused, only the WHERE part differs.
# (Earlier variant of the filter: ' AND abs(gtu_in_packet-42) >= 20 ')

# OPTIMIZATION: rows with NULL are filtered out here, but they should be analyzed as well.
simu_noise_where_clauses_str = (
    '\n'
    '    AND abs({data_table_name}.gtu_in_packet-42) >= 20 \n'
    '    AND {database_schema_name}.event_trg_gtu_y_hough_peak_thr1.major_line_phi IS NOT NULL \n'
    '    AND {database_schema_name}.event_trg_gtu_x_hough_peak_thr2.major_line_phi IS NOT NULL \n'
    '    AND {database_schema_name}.event_trg_x_y_hough_peak_thr1.major_line_phi IS NOT NULL\n'
)

source_data_type_num = 3001

noise_simu_events_selection_query = query_functions_simu.get_events_selection_query_plain(
    source_data_type_num=source_data_type_num,
    base_select='',
    select_additional=joined_select_clause_str,
    join_additional=join_clauses_str,
    where_additional=simu_noise_where_clauses_str,
    order_by='{data_table_name}.event_id',
    offset=0,
    limit=350000,
)

# print(noise_simu_events_selection_query)

In [18]:
# Run the noise-events query on the simu provider's connection.
noise_simu_df = pd.read_sql(
    sql=noise_simu_events_selection_query,
    con=event_v3_storage_provider_simu.connection,
)

In [19]:
noise_simu_df.head()

Unnamed: 0,event_id,source_file_acquisition_full,source_file_trigger_full,source_file_acquisition,source_file_trigger,global_gtu,packet_id,gtu_in_packet,num_gtu,orig_x_y_count_nonzero,...,proc1_gtu_y_hough_peak_thr2_max_peak_clu_major_line_phi,proc1_gtu_y_hough_peak_thr2_max_peak_clu_major_line_rho,proc1_gtu_y_hough_peak_thr2_max_sum_clu_major_line_phi,proc1_gtu_y_hough_peak_thr2_max_sum_clu_major_line_rho,proc1_gtu_y_hough_peak_thr3_major_line_phi,proc1_gtu_y_hough_peak_thr3_major_line_rho,proc1_gtu_y_hough_peak_thr3_max_peak_clu_major_line_phi,proc1_gtu_y_hough_peak_thr3_max_peak_clu_major_line_rho,proc1_gtu_y_hough_peak_thr3_max_sum_clu_major_line_phi,proc1_gtu_y_hough_peak_thr3_max_sum_clu_major_line_rho
0,11479,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,137,1,9,10,2290,...,6.23038,13.2187,6.23038,13.2187,6.23038,13.2187,6.23038,13.2187,6.23038,13.2187
1,11500,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,201,1,73,10,2290,...,6.23038,13.2187,6.23038,13.2187,6.23038,13.2187,6.23038,13.2187,6.23038,13.2187
2,11507,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,140,1,12,10,2290,...,6.11457,3.73295,6.11457,3.73295,6.12523,3.72091,6.12523,3.72091,6.12523,3.72091
3,11516,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,233,1,105,10,2290,...,6.23038,13.2187,6.23038,13.2187,6.23038,13.2187,6.23038,13.2187,6.23038,13.2187
4,11533,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,204,1,76,10,2290,...,6.11457,3.73295,6.11457,3.73295,6.12523,3.72091,6.12523,3.72091,6.12523,3.72091


In [20]:
# not in the report

# Query for all simulated events: the same joins as the visible-events query,
# but with every filter argument of get_query_clauses__where_simu set to None
# and only the basic event columns selected.

# The WHERE builder and the SELECT builder each return their own list of
# tables to join. They are kept under separate names: binding both to one name
# made the SELECT list overwrite the WHERE list, so the join list below held
# the SELECT tables twice and never the WHERE tables.
all_simu_where_clauses_str, all_simu_where_tables_list = \
    query_functions_simu.get_query_clauses__where_simu(
        gtu_in_packet_distacne=None,
        num_frames_signals_ge_bg__ge=None, num_frames_signals_ge_bg__le=None,
        simu_event_relation_table_name='{database_schema_name}.simu_event_uniq_relation',
        simu_event_table_name='{database_schema_name}.simu_event_uniq',
        simu_event_additional_table_name='{database_schema_name}.simu_event_uniq_additional'
    )

all_simu_select_clause_str, all_simu_select_tables_list = \
    query_functions_simu.get_query_clauses__select(query_functions_simu.get_columns_for_classification_dict__by_excluding(
        excluded_columns_re_list=('^.+$',),
        default_excluded_columns_re_list=[],
        included_columns_re_list=[
            ('^$', 'source_file_(acquisition|trigger)(_full)?|global_gtu|packet_id|gtu_in_packet|event_id|num_gtu'),
        ]
    ))

# Backward-compatible alias: this name previously ended up bound to the SELECT tables.
all_simu_tables_list = all_simu_select_tables_list

all_simu_joined_select_clause_str = ', '.join([
    all_simu_select_clause_str,
    simu_event_select_clause_str, simu_event_additional_select_clause_str, simu_signal_select_clause_str
])

# Same composition as in the visible-events query:
# SELECT tables + base simu tables + WHERE tables + simu-signal tables.
# NOTE(review): with all filters None the WHERE list is presumably empty or
# already covered by base_simu_tables_list -- confirm the generated SQL.
all_simu_joined_tables_list = (
    all_simu_select_tables_list
    + base_simu_tables_list
    + all_simu_where_tables_list
    + joined_simu_signal_tables_list
)

# (SELECT ss_se.simu2npy_pathname, e.max FROM spb_processing_v4_simu_signal.event_orig AS e JOIN spb_processing_v4_simu_signal.simu_event_relation_simu_flatmap AS ss_sersf ON ss_sersf.event_id = e.event_id  JOIN spb_processing_v4_simu_flatmap.simu_event AS ss_se ON ss_se.simu_event_id = ss_sersf.simu_event_id) AS simu_signal

all_simu_join_clauses_str = query_functions_simu.get_query_clauses__join(all_simu_joined_tables_list)

source_data_type_num = 3001

all_simu_events_selection_query = query_functions_simu.get_events_selection_query_plain(
    source_data_type_num=source_data_type_num,
    select_additional=all_simu_joined_select_clause_str,
    join_additional=all_simu_join_clauses_str,
    where_additional=all_simu_where_clauses_str,
    order_by='{data_table_name}.event_id',
    offset=0,
    limit=350000,
    base_select='')

print(all_simu_events_selection_query)


    SELECT 
        
        spb_processing_v4_simu_flatmap.event.event_id, spb_processing_v4_simu_flatmap.event.source_file_acquisition_full, spb_processing_v4_simu_flatmap.event.source_file_trigger_full, spb_processing_v4_simu_flatmap.event.source_file_acquisition, spb_processing_v4_simu_flatmap.event.source_file_trigger, spb_processing_v4_simu_flatmap.event.global_gtu, spb_processing_v4_simu_flatmap.event.packet_id, spb_processing_v4_simu_flatmap.event.gtu_in_packet, spb_processing_v4_simu_flatmap.event.num_gtu, spb_processing_v4_simu_flatmap.simu_event_uniq.simu2npy_pathname, spb_processing_v4_simu_flatmap.simu_event_uniq.edetector_numphotons, spb_processing_v4_simu_flatmap.simu_event_uniq.edetector_numcellhits, spb_processing_v4_simu_flatmap.simu_event_uniq.edetector_numfee, spb_processing_v4_simu_flatmap.simu_event_uniq.eptttrigger_fnumtrigg, spb_processing_v4_simu_flatmap.simu_event_uniq.etruth_trueenergy, spb_processing_v4_simu_flatmap.simu_event_uniq.etruth_truetheta, spb_pro

In [21]:
# Run the all-simu-events query on the simu provider's connection.
all_simu_df = pd.read_sql(
    sql=all_simu_events_selection_query,
    con=event_v3_storage_provider_simu.connection,
)

In [22]:
# The return value is not assigned; presumably the helper extends all_simu_df
# in place -- TODO confirm. The trailing semicolon keeps the notebook from
# echoing the returned value.
extend_simu_signal_by_signals_pathname(all_simu_df);

### Closing connections
(not in the report)

In [23]:
# All reads above used this connection. The simu-signal provider was built
# with no_connection=True, so only this one is closed here.
event_v3_storage_provider_simu.connection.close()

### Combined simulations dataset

Simu dataframes are combined because they have same columns. 
Then within this dataset events are classified into four groups based on 
- **Query classification information** - Primary classification based on the original data selection query - original intention of the data selection.
- **Simu signal classification information** - Secondary classification is addition of labeled simu signal events. The events are loaded from tables prepared in ver4_test_selection_visualization__simu_signal notebook.

The groups are the following:
- **simu noise** - data selected by query intended to select visible events but simu signal is classified as noisy simu data
- **simu track** - data selected by query intended to select visible events and simu signal is classified as a signal - <br> *these events will be used as positive samples for machine learning algorithms*
- **noise track** - data selected by query intended to select noise events but simu signal is classified as a shower
- **noise noise** - data selected by query intended to select noise events and contains simu signal classified as noisy simu data (could be used as a part of negative samples dataset, although it is not ideal)
- **simu unclassified**, **noise unclassified** - data without any labelling for simu signal data, generally should consist of short tracks or noisy tracks, in-between easily recognizable tracks and noise.
- **simu noise underflow**, **simu noise overflow**, **simu track underflow**, **simu track overflow** - data selected by query intended to select visible events but no simu signal is present (ideally should be empty)
- **noise noise underflow**, **noise noise overflow**, **noise track underflow**, **noise track overflow**   - data selected by query intended to select noise events and no simu signal is present - <br> *these events will be used as negative samples but with a low priority*

In [25]:
simu_df.shape

(35607, 1149)

In [26]:
noise_simu_df.shape

(128793, 1149)

In [24]:
# Stack the visible-event and noise-event dataframes row-wise.
# The original row labels are kept (ignore_index is not set), so index values
# may repeat in the combined frame.
combined_simu_df = pd.concat(
    [
        simu_df,
        noise_simu_df,
    ]
)

In [None]:
# flight_columns_list = list(lbl_noise_flight_df.columns.values)
# combined_flight_df = pd.concat([unl_noise_flight_df[flight_columns_list], lbl_noise_flight_df[flight_columns_list], unl_flight_df[flight_columns_list]])

In [27]:
# Row counts of the two source dataframes and of their concatenation.
print('\n'.join(
    'len({}) = {}'.format(frame_label, len(frame))
    for frame_label, frame in [
        ('simu_df', simu_df),
        ('noise_simu_df', noise_simu_df),
        ('combined_simu_df', combined_simu_df),
    ]
))

len(simu_df) = 35607
len(noise_simu_df) = 128793
len(combined_simu_df) = 164400


#### $R_{max}$ property of simulated showers

In [28]:
# R_max column: sqrt(x^2 + y^2) of the true shower-maximum position
# (etruth_trueshowermaxpos_z is not used).
shower_max_x = combined_simu_df['etruth_trueshowermaxpos_x']
shower_max_y = combined_simu_df['etruth_trueshowermaxpos_y']
combined_simu_df['calc_etruth_trueshower_rmax'] = np.hypot(shower_max_x, shower_max_y)

#### Query classification information
Primary classification based on the original data selection query - original intention of the data selection.

In [29]:
# Label every row by the query that selected it. The 'noise' assignment comes
# last, so it wins for an event_id present in both source dataframes.
# NOTE: a later cell rebinds noise_simu_df to a classification subset; run
# this cell while the name still refers to the noise-query dataframe.
combined_event_ids = combined_simu_df['event_id']
from_visible_query = combined_event_ids.isin(simu_df['event_id'])
from_noise_query = combined_event_ids.isin(noise_simu_df['event_id'])

combined_simu_df['cond_selection_query'] = 'undefined'
combined_simu_df.loc[from_visible_query, 'cond_selection_query'] = 'simu'
combined_simu_df.loc[from_noise_query, 'cond_selection_query'] = 'noise'

In [30]:
# if('simu_df' in locals()): del simu_df
# if('noise_simu_df' in locals()): del noise_simu_df
# if('unl_noise_flight_df' in locals()): del unl_noise_flight_df
# if('lbl_noise_flight_df' in locals()): del lbl_noise_flight_df
# if('unl_flight_df' in locals()): del unl_flight_df

#### Simu signal classification information
Secondary classification is addition of labeled simu signal events.
The events are loaded from tables prepared in ver4_test_selection_visualization__simu_signal notebook.

In [33]:
# [['event_id', 'source_file_acquisition', 'global_gtu', 'packet_id', 'gtu_in_packet', 'num_gtu', 'source_file_acquisition_full']]

# Labelled simu-signal tables (visible tracks, noisy events).
simu_signal_data_snippets_dir = 'ver4_simu_signal_data_snippets'
simu_signal_visible_tracks_table_path, simu_signal_noisy_events_table_path = (
    os.path.join(simu_signal_data_snippets_dir, table_file_name)
    for table_file_name in ('visible_tracks_table.tsv', 'noisy_events_table.tsv')
)

# Add the simu-signal classification and unpack the per-class subsets.
# NOTE: noise_simu_df is rebound here; before this cell it held the dataframe
# loaded by the noise-events query.
(
    combined_simu_df, unclassified_simu_df,
    track_simu_df, track_underflow_simu_df, track_overflow_simu_df,
    noise_simu_df, noise_underflow_simu_df, noise_overflow_simu_df,
    simu_signal_track_events_df, simu_signal_noisy_events_df,
) = add_classification_columns(
    combined_simu_df,
    simu_signal_visible_tracks_table_path,
    simu_signal_noisy_events_table_path,
    ret_simu_signal=True,
    ret_under_over_track=True,
    ret_split_noise=True,
    simu_track_class='track',
    simu_noise_class='noise',
    simu_track_underflow_class='track_underflow',
    simu_track_overflow_class='track_overflow',
    simu_noise_underflow_class='noise_underflow',
    simu_noise_overflow_class='noise_overflow',
    simu_events_file_pathname_dir=data_snippets_dir,
)

In [34]:
# all_combined_simu_df, unclassified_simu_df, \
# track_simu_df, track_underflow_simu_df, track_overflow_simu_df, \
# noise_simu_df, noise_underflow_simu_df, noise_overflow_simu_df, \
# simu_signal_track_events_df, simu_signal_noisy_events_df = \
#     add_classification_columns(
#         combined_simu_df, 
#         simu_signal_visible_tracks_table_path, simu_signal_noisy_events_table_path,
#         ret_simu_signal=True, ret_under_over_track=True, ret_split_noise=True,
#         simu_track_class='track', simu_noise_class='noise',
#         simu_track_underflow_class='track_underflow', simu_track_overflow_class='track_overflow',
#         simu_noise_underflow_class='noise_underflow', simu_noise_overflow_class='noise_overflow',
#         simu_events_file_pathname_dir=data_snippets_dir)

##### Combined label - joining query and labeled simu class

In [35]:
# Combined label '<query>_<simu signal class>[_<underflow|overflow>]'.
# Rows matching none of the combinations keep the value 'undefined'.
combined_simu_df['cond_selection_combined'] = 'undefined'

for query_label, signal_label, sync_suffix in itertools.product(
        ['simu', 'noise'], ['noise', 'track'], ['', 'underflow', 'overflow']):
    # An empty suffix leaves the plain class name ('noise' / 'track').
    simple_label = signal_label + '_' + sync_suffix if sync_suffix else signal_label
    matches_both = (
        (combined_simu_df['cond_selection_query'] == query_label)
        & (combined_simu_df['cond_selection_simple'] == simple_label)
    )
    combined_simu_df.loc[matches_both, 'cond_selection_combined'] = query_label + '_' + simple_label

##### Size of the  subsets

###### Simu signal labels

In [36]:
# Sizes of the labelled simu-signal tables, the combined dataframe and every
# classification subset, followed by the summed subset sizes and three ratios.
section_rule = '-'*50
n_combined = len(combined_simu_df)
n_subsets_total = sum(
    len(subset_df) for subset_df in (
        unclassified_simu_df,
        track_simu_df, track_underflow_simu_df, track_overflow_simu_df,
        noise_simu_df, noise_underflow_simu_df, noise_overflow_simu_df,
    )
)

print('len(simu_signal_track_events_df)', len(simu_signal_track_events_df))
print('len(simu_signal_noisy_events_df)', len(simu_signal_noisy_events_df))
print(section_rule)
print('len(combined_simu_df)           ', n_combined)
print(section_rule)
print('len(unclassified_simu_df)       ', len(unclassified_simu_df))
print('len(track_simu_df)              ', len(track_simu_df))
print('len(track_underflow_simu_df)    ', len(track_underflow_simu_df))
print('len(track_overflow_simu_df)     ', len(track_overflow_simu_df))
print('len(noise_simu_df)              ', len(noise_simu_df))
print('len(noise_underflow_simu_df)    ', len(noise_underflow_simu_df))
print('len(noise_overflow_simu_df)     ', len(noise_overflow_simu_df))
print(section_rule)
# Summed subset sizes, printed for comparison with len(combined_simu_df) above.
print('                                   ', n_subsets_total)
print(section_rule)
print('len(track_simu_df)/len(combined_simu_df)        = ', len(track_simu_df)/n_combined)
print('len(unclassified_simu_df)/len(combined_simu_df) = ', len(unclassified_simu_df)/n_combined)
print('len(noise_simu_df)/len(combined_simu_df)        = ', len(noise_simu_df)/n_combined)

len(simu_signal_track_events_df) 14866
len(simu_signal_noisy_events_df) 59279
--------------------------------------------------
len(combined_simu_df)            164400
--------------------------------------------------
len(unclassified_simu_df)        9288
len(track_simu_df)               35038
len(track_underflow_simu_df)     5983
len(track_overflow_simu_df)      16956
len(noise_simu_df)               1608
len(noise_underflow_simu_df)     24689
len(noise_overflow_simu_df)      70838
--------------------------------------------------
                                    164400
--------------------------------------------------
len(track_simu_df)/len(combined_simu_df)        =  0.2131265206812652
len(unclassified_simu_df)/len(combined_simu_df) =  0.056496350364963505
len(noise_simu_df)/len(combined_simu_df)        =  0.00978102189781022


###### Selection query and simu signal labels

In [37]:
# Print the number of events for every (selection query, simu-signal label) pair.
simu_signal_labels = [
    base_label + ('_' + sync_suffix if sync_suffix else '')
    for base_label in ('noise', 'track')
    for sync_suffix in ('', 'underflow', 'overflow')
]

for query_label, simu_signal_label in itertools.product(('simu', 'noise'), simu_signal_labels):
    pair_mask = (
        (combined_simu_df['cond_selection_query'] == query_label) &
        (combined_simu_df['cond_selection_simple'] == simu_signal_label)
    )
    pair_caption = '{} - {}'.format(query_label, simu_signal_label)
    print('{:<30} {}'.format(pair_caption, np.count_nonzero(pair_mask)))

simu - noise                   4
simu - noise_underflow         0
simu - noise_overflow          0
simu - track                   34521
simu - track_underflow         0
simu - track_overflow          0
noise - noise                  1604
noise - noise_underflow        24689
noise - noise_overflow         70838
noise - track                  517
noise - track_underflow        5983
noise - track_overflow         16956


##### Example of track underflow subset

In [38]:
# Show the first rows of the track-underflow subset, highest 'gtu_in_packet' first.
track_underflow_simu_df.sort_values('gtu_in_packet', ascending=False).head()

Unnamed: 0,event_id,source_file_acquisition_full,source_file_trigger_full,source_file_acquisition,source_file_trigger,global_gtu,packet_id,gtu_in_packet,num_gtu,orig_x_y_count_nonzero,...,proc1_gtu_y_hough_peak_thr3_major_line_rho,proc1_gtu_y_hough_peak_thr3_max_peak_clu_major_line_phi,proc1_gtu_y_hough_peak_thr3_max_peak_clu_major_line_rho,proc1_gtu_y_hough_peak_thr3_max_sum_clu_major_line_phi,proc1_gtu_y_hough_peak_thr3_max_sum_clu_major_line_rho,calc_etruth_trueshower_rmax,cond_selection_query,simu2npy_signals_pathname,simu2npy_signals_pathname_short,cond_selection_simple
45187,2109098,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_21000000.00/energy_1.41e+12/thousnd21E3.2...,posz_21000000.00/energy_1.41e+12/thousnd21E3.2...,150,1,22,11,2290,...,2.27909,2.33426,2.27909,2.33426,2.27909,206398.3,noise,/mnt/data_sgbc1/SPBDATA_processed/spb_simu/pos...,posz_21000000.00/energy_1.41e+12/thousnd21E3.2...,track_underflow
25116,189885,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_24000000.00/energy_5.95e+12/simu.2017-07-...,posz_24000000.00/energy_5.95e+12/simu.2017-07-...,150,1,22,10,2288,...,1.11742,1.54134,1.11742,1.54134,1.11742,19494760.0,noise,/mnt/data_sgbc1/SPBDATA_processed/spb_simu/pos...,posz_24000000.00/energy_5.95e+12/simu.2017-07-...,track_underflow
21799,154316,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_24000000.00/energy_5.95e+12/simu.2017-07-...,posz_24000000.00/energy_5.95e+12/simu.2017-07-...,150,1,22,9,2287,...,22.7209,6.12523,22.7209,6.12523,22.7209,13082310.0,noise,/mnt/data_sgbc1/SPBDATA_processed/spb_simu/pos...,posz_24000000.00/energy_5.95e+12/simu.2017-07-...,track_underflow
3711,36818,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.26e+12/thousnd27E2/n...,posz_27000000.00/energy_1.26e+12/thousnd27E2/l...,150,1,22,10,2290,...,26.28,0.41234,26.28,0.41234,26.28,5740878.0,noise,/mnt/data_sgbc1/SPBDATA_processed/spb_simu/pos...,posz_27000000.00/energy_1.26e+12/thousnd27E2/s...,track_underflow
93171,2411359,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_18000000.00/energy_5.25e+12/simu.2017-07-...,posz_18000000.00/energy_5.25e+12/simu.2017-07-...,150,1,22,10,2290,...,15.0605,5.78592,15.0605,5.78592,15.0605,2049091.0,noise,/mnt/data_sgbc1/SPBDATA_processed/spb_simu/pos...,posz_18000000.00/energy_5.25e+12/simu.2017-07-...,track_underflow


##### Example of track overflow subset

In [39]:
# Show the first rows of the track-overflow subset, lowest 'gtu_in_packet' first.
track_overflow_simu_df.sort_values('gtu_in_packet', ascending=True).head()

Unnamed: 0,event_id,source_file_acquisition_full,source_file_trigger_full,source_file_acquisition,source_file_trigger,global_gtu,packet_id,gtu_in_packet,num_gtu,orig_x_y_count_nonzero,...,proc1_gtu_y_hough_peak_thr3_major_line_rho,proc1_gtu_y_hough_peak_thr3_max_peak_clu_major_line_phi,proc1_gtu_y_hough_peak_thr3_max_peak_clu_major_line_rho,proc1_gtu_y_hough_peak_thr3_max_sum_clu_major_line_phi,proc1_gtu_y_hough_peak_thr3_max_sum_clu_major_line_rho,calc_etruth_trueshower_rmax,cond_selection_query,simu2npy_signals_pathname,simu2npy_signals_pathname_short,cond_selection_simple
53058,2156392,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_30000000.00/energy_2.00e+12/thousnd30E6.2...,posz_30000000.00/energy_2.00e+12/thousnd30E6.2...,190,1,62,20,2290,...,37.2791,1.59712,37.2791,1.59712,37.2791,3966318.0,noise,/mnt/data_sgbc1/SPBDATA_processed/spb_simu/pos...,posz_30000000.00/energy_2.00e+12/thousnd30E6.2...,track_overflow
58199,2189735,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_30000000.00/energy_1.58e+12/thousnd30E4.2...,posz_30000000.00/energy_1.58e+12/thousnd30E4.2...,190,1,62,10,2290,...,23.9528,0.210601,23.9528,0.210601,23.9528,10611820.0,noise,/mnt/data_sgbc1/SPBDATA_processed/spb_simu/pos...,posz_30000000.00/energy_1.58e+12/thousnd30E4.2...,track_overflow
58697,2192502,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_30000000.00/energy_1.58e+12/thousnd30E4.2...,posz_30000000.00/energy_1.58e+12/thousnd30E4.2...,190,1,62,14,2290,...,15.2791,0.66693,15.2791,0.66693,15.2791,1363170.0,noise,/mnt/data_sgbc1/SPBDATA_processed/spb_simu/pos...,posz_30000000.00/energy_1.58e+12/thousnd30E4.2...,track_overflow
59486,2196758,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_21000000.00/energy_3.94e+12/ter212.2017-0...,posz_21000000.00/energy_3.94e+12/ter212.2017-0...,190,1,62,12,2289,...,15.7814,0.061318,15.7814,0.061318,15.7814,9784803.0,noise,/mnt/data_sgbc1/SPBDATA_processed/spb_simu/pos...,posz_21000000.00/energy_3.94e+12/ter212.2017-0...,track_overflow
110238,2510779,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_18000000.00/energy_8.50e+12/set18.2017-07...,posz_18000000.00/energy_8.50e+12/set18.2017-07...,190,1,62,14,2290,...,18.1257,6.10785,18.1257,6.10785,18.1257,4385232.0,noise,/mnt/data_sgbc1/SPBDATA_processed/spb_simu/pos...,posz_18000000.00/energy_8.50e+12/set18.2017-07...,track_overflow


##### Visualization of a few events

- Noise entries are sorted by the number of simu signal pixels in the x-y projection in descending order (`orig_x_y_count_nonzero`, i.e. starting from the most potentially track-like entries),
- Track entries are sorted by the number of frames where the maximum signal is greater than or equal to the maximum background, in ascending order (`num_frames_signals_ge_bg`, i.e. starting from the least visible track events). Entries with non-track-like simu signal are not necessarily incorrectly labeled; the signal might contain just a small portion of a track.
- Track underflow and track overflow entries should all contain empty simu signal data. Entries are sorted by GTU in packet in descending or ascending order, respectively.

In [None]:
def vis_simu_signal_default(i, r, visualized_projections, fig, axs_flattened):
    """Show the simu-signal data of event row `r` via `show_simu_event_row`.

    Intended to be passed as `extension_func` to `vis_events_df`. Only `i`
    and `r` are used here; the remaining parameters are accepted but ignored.
    """
    simu_signal_view_options = dict(
        npy_pathname_column='simu2npy_signals_pathname',
        single_proj_width=4,
        single_proj_height=4,
        print_info=False,
        warn_if_not_exact_simu=False,
    )
    show_simu_event_row(i, r, **simu_signal_view_options)

def vis_simu_signal_with_original(i, r, visualized_projections, fig, axs_flattened):
    """Show the simu signal of row `r` twice: default view, then GTU override.

    The first view is delegated to `vis_simu_signal_default`; the second one
    repeats the same call with `simu_gtu_override=(30,50)`.
    """
    vis_simu_signal_default(i, r, visualized_projections, fig, axs_flattened)

    overridden_view_options = dict(
        npy_pathname_column='simu2npy_signals_pathname',
        single_proj_width=4,
        single_proj_height=4,
        print_info=False,
        warn_if_not_exact_simu=False,
        simu_gtu_override=(30,50),
    )
    show_simu_event_row(i, r, **overridden_view_options)

# For each label subset: (label, dataframe, sort column, ascending flag).
subsets_to_visualize = [
    ('noise', noise_simu_df, 'orig_x_y_count_nonzero', False),
    ('track', track_simu_df, 'num_frames_signals_ge_bg', True),
    ('track_underflow', track_underflow_simu_df, 'gtu_in_packet', False),
    ('track_overflow', track_overflow_simu_df, 'gtu_in_packet', True),
]

for label, subset_df, sort_column, sort_ascending in subsets_to_visualize:
    events_to_vis_df = subset_df.sort_values(sort_column, ascending=sort_ascending)

    # Only the 'track' subset additionally gets the GTU-override view.
    if label == 'track':
        simu_signal_hook = vis_simu_signal_with_original
    else:
        simu_signal_hook = vis_simu_signal_default

    print('{} ({} entries)'.format(label, len(events_to_vis_df)))
    print('-' * 50)
    vis_events_df(
        events_to_vis_df,
        events_per_figure=3, max_figures=1,
        vis_gtux=True, vis_gtuy=True,
        close_after_vis=False, show=True,
        additional_printed_columns=[
            'num_frames_signals_ge_bg',
            'simu2npy_signals_pathname_short',
            'cond_selection_query',
            'cond_selection_simple',
        ],
        by_one=True,
        extension_func=simu_signal_hook,
        single_proj_width=4, single_proj_height=4
    )
    print('=' * 50)

### Adding new features

#### Rank column
The principle of a rank column is to combine the values of features based on the expected or calculated correlation of each column with the likelihood of an event being a shower. The lowest value should correspond to the most probable shower track.

In this case, a short hand-picked list of features is used. A preferable approach would be to use columns selected by a machine learning method that calculates feature importance.

The provided features are normalized to the 0-1 range; 
optionally, the values of the features are inverted (1-val) and weighted. 
Finally, the summed value is the resulting rank of a record.

In [40]:
# Hand-picked features combined into the rank; the order matters because the
# per-column settings in add_rank_column_default are positional.
rank_columns = [
    'proc1_x_y_hough_peak_thr2_line_clusters_count',
    'proc1_x_y_hough_peak_thr2_line_clusters_max_peak_clu_width',
    'proc1_gtu_y_hough_peak_thr2_line_clusters_max_peak_clu_width',
    'proc1_gtu_x_hough_peak_thr2_line_clusters_max_peak_clu_width',
    'trg_count_nonzero',
    'num_gtu',
]

def add_rank_column_default(data_df):
    """Call `add_rank_column` on `data_df` with the notebook's default settings.

    Uses `rank_columns` with one ascending flag and one weight per column.
    NOTE(review): `do_copy=False` presumably means `data_df` is modified in
    place -- confirm against `add_rank_column`.
    """
    per_column_ascending = [True, True, True, True, False, False]
    per_column_weights = [2, 2, 1, 1, 1, 1]
    add_rank_column(
        data_df,
        rank_columns,
        ascending=per_column_ascending,
        column_weights=per_column_weights,
        print_max_vals=True,
        add_norm_columns=True,
        do_copy=False,
    )

#### Difference columns
An expected property of an air shower event is that at least one of the shower projections in time should contain a line with a slope different from 0 or 90 degrees. 
In an ideal case, the actual slope of the line is not important; the only important information is the difference of the slope from 0 or 90 degrees.

In [41]:
# not in the report
area_columns_prefix_list = ('proc1', 'proc2', 'proc3', 'trg')
area_columns_proj_list = ('x_y', 'gtu_x', 'gtu_y')
area_columns_thr_i_list = list(range(1,4))

area_columns_line_types = ['peak', 'sum', 'size']
area_columns_col_name_prefixes = ['{{prefix}}_{{proj}}_hough_peak_thr{{thr_i}}_line_clusters_max_{}_clu_'.format(t) for t in area_columns_line_types]

diff_columns_proc_range = (1,4)
diff_columns_alt_range = (1,1)
diff_columns_thr_range = (1,4)

diff_columns_prefixes = \
    ['proc{}'.format(i) for i in range(*diff_columns_proc_range)] + \
    ['alt{}'.format(i) for i in range(*diff_columns_alt_range)]

diff_columns_projs = ['gtu_x', 'gtu_y']
diff_columns_diff_types = ['pi_over_2', '0']

diff_columns_gtu_y_gtu_x_diff_format = '{prefix}_gtu_y_gtu_x_hough_peak_thr{thr_i}_major_line_phi_diff'
diff_columns_proj_diff_format = '{prefix}_{proj}_hough_peak_thr{thr_i}_major_line_phi_diff_{diff_type}'

#

common_extension_columns = ['event_id', 'rank']

for col in rank_columns:
    common_extension_columns.append('norm_' + col)

for prefix in area_columns_prefix_list:
    for proj in area_columns_proj_list:
        for thr_i in area_columns_thr_i_list:
            for col_name_prefix in area_columns_col_name_prefixes:
                common_extension_columns.append(col_name_prefix.format(prefix=prefix, proj=proj, thr_i=thr_i) + 'area')
    
for prefix in diff_columns_prefixes:
    for thr_i in range(*diff_columns_thr_range):
        common_extension_columns.append(diff_columns_gtu_y_gtu_x_diff_format.format(prefix=prefix, thr_i=thr_i))
        for proj in diff_columns_projs:
            for diff_type in diff_columns_diff_types:
                common_extension_columns.append(diff_columns_proj_diff_format.format(prefix=prefix, thr_i=thr_i, proj=proj, diff_type=diff_type))


simu_extension_columns = common_extension_columns
flight_extension_columns = common_extension_columns

# print(common_extension_columns)

#### Simu dataframe extension columns
(not in the report)

In [42]:
# The cache file name is built from two MD5 digests: one of the pickled
# event-id array and one of the comma-joined extension column names.
pickled_event_ids = pickle.dumps(combined_simu_df['event_id'].values, protocol=0)
simu_event_ids_md5 = hashlib.md5(pickled_event_ids).hexdigest()

joined_extension_columns = ','.join(simu_extension_columns)
simu_extension_columns_md5 = hashlib.md5(joined_extension_columns.encode()).hexdigest()

cache_file_name = 'extension_columns_simu_pathname_{}_{}.pkl.gz'.format(
    simu_event_ids_md5, simu_extension_columns_md5)
extension_columns_combined_simu_pathname = os.path.join(data_snippets_dir, cache_file_name)
print(extension_columns_combined_simu_pathname)

ver4_simu_distributions_visible_events_distributions_w_simu_signal/extension_columns_simu_pathname_771a6a452a651f419ec12b94ac5eb28d_51d1629013c22c624c6bcb50d5b214a0.pkl.gz


In [43]:
# Compute the extension columns and cache them in a gzip-compressed pickle,
# or load them from the cache when the file already exists.
if not os.path.exists(extension_columns_combined_simu_pathname):
    print('Building calculating columns ...')
    print('num_frames_signals_ge_bg bin column ...')
    
    # NOTE(review): whatever this call adds to the dataframe is not listed in
    # simu_extension_columns, so it is not written to the cache and is not
    # restored by the loading branch below -- confirm nothing downstream needs it.
    add_bin_column(combined_simu_df, 'num_frames_signals_ge_bg', 5)

    print('  area columns ...')
    
    for attr_prefix_format in area_columns_col_name_prefixes:    
        add_area_columns(combined_simu_df, prefix_list=area_columns_prefix_list, proj_list=area_columns_proj_list, thr_i_list=area_columns_thr_i_list,
                        attr_prefix_format=attr_prefix_format) 

    print('  diff columns ...')
    
    add_diff_columns(combined_simu_df, proc_range=diff_columns_proc_range, alt_range=diff_columns_alt_range, hough_peak_thr_range=diff_columns_thr_range)

    print('  rank column ...')
    
    add_rank_column_default(combined_simu_df)

    print('Saving pickle ...')
        
    # 'compression' is passed by keyword: positional use is deprecated in
    # recent pandas releases.
    combined_simu_df[simu_extension_columns].to_pickle(
        extension_columns_combined_simu_pathname, compression='gzip')
    
else:
    print('Loading...')
    simu_extension_columns_df = pd.read_pickle(extension_columns_combined_simu_pathname, compression='gzip')
    print('Merging ...')
    # Merge only the columns that are not present yet, so that re-running this
    # cell does not create duplicated '_x'/'_y' columns.
    missing_extension_columns = [
        cached_col for cached_col in simu_extension_columns_df.columns
        if cached_col not in combined_simu_df.columns
    ]
    if missing_extension_columns:
        combined_simu_df = pd.merge(
            combined_simu_df,
            simu_extension_columns_df[['event_id'] + missing_extension_columns],
            on=['event_id'])
    del simu_extension_columns_df
    
combined_simu_df.head()

Loading...
Merging ...


Unnamed: 0,event_id,source_file_acquisition_full,source_file_trigger_full,source_file_acquisition,source_file_trigger,global_gtu,packet_id,gtu_in_packet,num_gtu,orig_x_y_count_nonzero,...,proc3_gtu_y_gtu_x_hough_peak_thr2_major_line_phi_diff,proc3_gtu_x_hough_peak_thr2_major_line_phi_diff_pi_over_2,proc3_gtu_x_hough_peak_thr2_major_line_phi_diff_0,proc3_gtu_y_hough_peak_thr2_major_line_phi_diff_pi_over_2,proc3_gtu_y_hough_peak_thr2_major_line_phi_diff_0,proc3_gtu_y_gtu_x_hough_peak_thr3_major_line_phi_diff,proc3_gtu_x_hough_peak_thr3_major_line_phi_diff_pi_over_2,proc3_gtu_x_hough_peak_thr3_major_line_phi_diff_0,proc3_gtu_y_hough_peak_thr3_major_line_phi_diff_pi_over_2,proc3_gtu_y_hough_peak_thr3_major_line_phi_diff_0
0,11464,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,169,1,41,11,2290,...,0.369017,1.271984,0.298812,1.500591,0.070205,0.386129,1.254872,0.315924,1.500591,0.070205
1,11465,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,163,1,35,24,2290,...,1.226577,0.029304,1.541493,1.255881,0.314915,1.219437,0.052804,1.517993,1.272241,0.298555
2,11486,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,169,1,41,11,2290,...,0.184444,1.210844,0.359952,1.395288,0.175508,0.184444,1.210844,0.359952,1.395288,0.175508
3,11487,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,169,1,41,11,2290,...,0.182697,1.315778,0.255018,1.498476,0.072321,0.272988,1.255128,0.315668,1.528116,0.04268
4,11494,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,166,1,38,10,2289,...,0.07504,1.338344,0.232453,1.263304,0.307493,0.09002,1.354184,0.216613,1.264164,0.306633


### Number of NaN entries
Events with NaN values are currently rejected from the showers dataset. 
However, the final decision about rejection is made considering only the columns used in the ML algorithm.
Therefore, these numbers are not exactly indicative of the final number of rejected events - only simu_track and noise_track should be indicative. (TODO requires check)

#### Number of NaN entries by query and simu signal labels

In [None]:
# For every (selection query, simu-signal label) pair print the number of rows
# containing at least one null value and the number of remaining rows.
row_format = '{:<30} {:<10} {}'
print(row_format.format(' ', 'NaN', 'Others'))

simu_signal_labels = [
    base_label + ('_' + sync_suffix if sync_suffix else '')
    for base_label in ('noise', 'track')
    for sync_suffix in ('', 'underflow', 'overflow')
]

for query_label, simu_signal_label in itertools.product(('simu', 'noise'), simu_signal_labels):
    pair_mask = (
        (combined_simu_df['cond_selection_query'] == query_label) &
        (combined_simu_df['cond_selection_simple'] == simu_signal_label)
    )
    subset_df = combined_simu_df[pair_mask]
    nan_row_count = np.count_nonzero(subset_df.isnull().any(axis=1))
    print(row_format.format(
        '{} - {}'.format(query_label, simu_signal_label),
        nan_row_count,
        len(subset_df) - nan_row_count
    ))

Flight data were already selected excluding entries with NaN values (actually NULL in PostgreSQL table).

#### NaN columns
Columns with a NaN value are either data from the Hough transform on projections of triggered pixels - the issue is a single pixel in a projection, which makes it impossible to determine the orientation of a line. This impacts the usable size of the dataset.
The other source of NaN values is additional information calculated for simulated showers - the number of frames where the number of signal pixels satisfies a certain condition. The NaN value is present when there is no signal present in an identified event.

In [None]:
# Count the null entries of every column in a single vectorised pass and keep
# only the columns that contain at least one.
# This replaces a row-by-row Python loop that relied on `Series.iteritems()`
# (removed in pandas 2) and on a `numbers_Number` name that is not defined in
# the visible part of the notebook.
# Null detection follows `isnull()`, the same criterion used for the per-label
# row counts, so None/NaT entries are counted as well as float NaN.
nan_count_ser = combined_simu_df.isnull().sum()
nan_columns = {col: int(count) for col, count in nan_count_ser.items() if count > 0}

for col, val in nan_columns.items():
    print("{:<120} : {:<d}".format(col, val))

# del nan_columns

### Free memory
(not in the report)

In [None]:
# Drop intermediate dataframes from the notebook namespace; names that are
# not defined are skipped. Assumes the cell runs at top level, where locals()
# and globals() are the same namespace.
# NOTE(review): 'noisy_simu_df' is not created anywhere in the visible part of
# the notebook (the label subset is called 'noise_simu_df') -- confirm which
# name was intended.
for obsolete_df_name in (
        'unclassified_simu_df',
        'track_simu_df',
        'noisy_simu_df',
        'simu_signal_track_events_df',
        'simu_signal_noisy_events_df',
):
    if obsolete_df_name in globals():
        del globals()[obsolete_df_name]

In [None]:
# unclassified_simu_df, \
# track_simu_df, track_underflow_simu_df, track_overflow_simu_df, \
# noise_simu_df, noise_underflow_simu_df, noise_overflow_simu_df, \
# simu_signal_track_events_df, simu_signal_noisy_events_df

## Distributions

In [44]:
# Preview the first rows of combined_simu_df.
combined_simu_df.head()

Unnamed: 0,event_id,source_file_acquisition_full,source_file_trigger_full,source_file_acquisition,source_file_trigger,global_gtu,packet_id,gtu_in_packet,num_gtu,orig_x_y_count_nonzero,...,proc3_gtu_y_gtu_x_hough_peak_thr2_major_line_phi_diff,proc3_gtu_x_hough_peak_thr2_major_line_phi_diff_pi_over_2,proc3_gtu_x_hough_peak_thr2_major_line_phi_diff_0,proc3_gtu_y_hough_peak_thr2_major_line_phi_diff_pi_over_2,proc3_gtu_y_hough_peak_thr2_major_line_phi_diff_0,proc3_gtu_y_gtu_x_hough_peak_thr3_major_line_phi_diff,proc3_gtu_x_hough_peak_thr3_major_line_phi_diff_pi_over_2,proc3_gtu_x_hough_peak_thr3_major_line_phi_diff_0,proc3_gtu_y_hough_peak_thr3_major_line_phi_diff_pi_over_2,proc3_gtu_y_hough_peak_thr3_major_line_phi_diff_0
0,11464,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,169,1,41,11,2290,...,0.369017,1.271984,0.298812,1.500591,0.070205,0.386129,1.254872,0.315924,1.500591,0.070205
1,11465,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,163,1,35,24,2290,...,1.226577,0.029304,1.541493,1.255881,0.314915,1.219437,0.052804,1.517993,1.272241,0.298555
2,11486,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,169,1,41,11,2290,...,0.184444,1.210844,0.359952,1.395288,0.175508,0.184444,1.210844,0.359952,1.395288,0.175508
3,11487,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,169,1,41,11,2290,...,0.182697,1.315778,0.255018,1.498476,0.072321,0.272988,1.255128,0.315668,1.528116,0.04268
4,11494,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,166,1,38,10,2289,...,0.07504,1.338344,0.232453,1.263304,0.307493,0.09002,1.354184,0.216613,1.264164,0.306633


In [45]:
# List the distinct simu-signal labels present in the dataframe.
combined_simu_df['cond_selection_simple'].unique().tolist()

['track',
 'unclassified',
 'noise',
 'noise_underflow',
 'noise_overflow',
 'track_underflow',
 'track_overflow']

In [46]:
# Preview the first rows whose combined label is 'simu_track'.
combined_simu_df[combined_simu_df['cond_selection_combined'] == 'simu_track'].head()

Unnamed: 0,event_id,source_file_acquisition_full,source_file_trigger_full,source_file_acquisition,source_file_trigger,global_gtu,packet_id,gtu_in_packet,num_gtu,orig_x_y_count_nonzero,...,proc3_gtu_y_gtu_x_hough_peak_thr2_major_line_phi_diff,proc3_gtu_x_hough_peak_thr2_major_line_phi_diff_pi_over_2,proc3_gtu_x_hough_peak_thr2_major_line_phi_diff_0,proc3_gtu_y_hough_peak_thr2_major_line_phi_diff_pi_over_2,proc3_gtu_y_hough_peak_thr2_major_line_phi_diff_0,proc3_gtu_y_gtu_x_hough_peak_thr3_major_line_phi_diff,proc3_gtu_x_hough_peak_thr3_major_line_phi_diff_pi_over_2,proc3_gtu_x_hough_peak_thr3_major_line_phi_diff_0,proc3_gtu_y_hough_peak_thr3_major_line_phi_diff_pi_over_2,proc3_gtu_y_hough_peak_thr3_major_line_phi_diff_0
0,11464,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,169,1,41,11,2290,...,0.369017,1.271984,0.298812,1.500591,0.070205,0.386129,1.254872,0.315924,1.500591,0.070205
1,11465,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,163,1,35,24,2290,...,1.226577,0.029304,1.541493,1.255881,0.314915,1.219437,0.052804,1.517993,1.272241,0.298555
2,11486,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,169,1,41,11,2290,...,0.184444,1.210844,0.359952,1.395288,0.175508,0.184444,1.210844,0.359952,1.395288,0.175508
3,11487,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,169,1,41,11,2290,...,0.182697,1.315778,0.255018,1.498476,0.072321,0.272988,1.255128,0.315668,1.528116,0.04268
4,11494,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,/home/spbproc/SPBDATA_processed/spb_simu/posz_...,posz_27000000.00/energy_1.00e+13/thousnd27E20....,posz_27000000.00/energy_1.00e+13/thousnd27E20....,166,1,38,10,2289,...,0.07504,1.338344,0.232453,1.263304,0.307493,0.09002,1.354184,0.216613,1.264164,0.306633


In [47]:
# List the distinct combined (query + simu-signal) labels present in the dataframe.
combined_simu_df['cond_selection_combined'].unique()

array(['simu_track', 'undefined', 'simu_noise', 'noise_noise_underflow',
       'noise_noise_overflow', 'noise_track_underflow', 'noise_track',
       'noise_track_overflow', 'noise_noise'], dtype=object)

In [48]:
# Rows selected by the 'simu' query whose simu-signal label is 'track'.
is_simu_track = combined_simu_df['cond_selection_combined'] == 'simu_track'
simu_track_df = combined_simu_df[is_simu_track]

In [49]:
# Rows selected by the 'noise' query whose simu-signal label is one of the
# underflow/overflow variants.
pure_noise_labels = [
    'noise_noise_underflow',
    'noise_noise_overflow',
    'noise_track_underflow',
    'noise_track_overflow',
]
is_pure_noise = combined_simu_df['cond_selection_combined'].isin(pure_noise_labels)
simu_pure_noise_df = combined_simu_df[is_pure_noise]

In [50]:
# Rows selected by the 'simu' query whose simu-signal label is 'noise'.
is_simu_noise = combined_simu_df['cond_selection_combined'] == 'simu_noise'
simu_noise_df = combined_simu_df[is_simu_noise]

In [51]:
# Events of all_simu_df whose GTU within the packet lies in the [30, 50) window.
gtu_in_packet_ser = all_simu_df['gtu_in_packet']
is_in_gtu_window = (gtu_in_packet_ser >= 30) & (gtu_in_packet_ser < 50)
probable_all_simu_df = all_simu_df[is_in_gtu_window]

In [52]:
# Number of events inside the GTU-in-packet window.
len(probable_all_simu_df)

67196

In [53]:
# Total number of rows in all_simu_df, for comparison with the windowed subset.
len(all_simu_df)

350000

### Simulation

TODO 
- all processed events
- all simulated events

In [None]:
# import ROOT
# hist_data_series = simu_track_df['etruth_trueenergy']*10**6
# h = ROOT.TH1I('energy_hist', 'True Primary particle energy', 30, hist_data_series.min(), hist_data_series.max())
# for v in hist_data_series: 
#     h.Fill(v)
# c = ROOT.TCanvas("myCanvasName","The Canvas Title",800,600)

#### Unique simulation files

In [None]:
# Number of rows in simu_df.
# NOTE(review): the heading says "unique simulation files", but this counts
# dataframe rows -- confirm that simu_df holds one row per file.
len(simu_df)

#### Visible simulated packets per single simulated air shower track

In [None]:
# Histogram of the number of events (non-null 'event_id' entries) found in
# simu_df for each simulation file ('simu2npy_pathname').
simu2npy_grouped = simu_df.groupby(['simu2npy_pathname'])
num_event_ids_ser = simu2npy_grouped.count().sort_values('event_id', ascending=False)['event_id']

# Unit-width bins centred on the integers 0..max. With range=(0, max) and
# bins=max the last bin is closed on both sides, so the counts for max-1 and
# max were merged into a single bar.
max_events_per_file = int(num_event_ids_ser.max())
unit_bin_edges = np.arange(0, max_events_per_file + 2) - 0.5

fig, ax = plt.subplots(figsize=(6,3))
num_event_ids_ser.hist(ax=ax, bins=unit_bin_edges)
ax.set_xlabel('Number of events per single simulated event')
ax.set_ylabel('Number of simulation files')
fig.savefig(os.path.join(data_snippets_dir, 'visible_events_num_events_per_simu_events.svg'), dpi=150)
plt.show()

#### Visible simulated packets per single simulated air shower track and energy

In [None]:
# Histogram of the true energy with one entry per
# (simulation file, true energy) group of simu_df.
per_file_counts_df = simu_df.groupby(['simu2npy_pathname', 'etruth_trueenergy'], as_index=False).count()
simu2npy_grouped = per_file_counts_df.sort_values('event_id', ascending=False)[
    ['simu2npy_pathname', 'etruth_trueenergy', 'event_id']]

# Scaled by 10**6 to match the [eV] axis label.
per_file_energy_ser = simu2npy_grouped['etruth_trueenergy']*10**6

fig, ax = plt.subplots(figsize=(6,3))
per_file_energy_ser.hist(ax=ax, bins=20)
ax.set_xlabel('True primary particle energy [eV]')
ax.set_ylabel('Number of simulation files')
figure_pathname = os.path.join(data_snippets_dir, 'visible_events_num_events_per_single_simu_per_true_energy.svg')
fig.savefig(figure_pathname, dpi=150)
plt.show()

#### All simulated packets per single simulated air shower track

In [None]:
# Histogram (log y-axis) of the number of events (non-null 'event_id'
# entries) found in all_simu_df for each simulation file.
simu2npy_grouped = all_simu_df.groupby(['simu2npy_pathname'])
num_event_ids_ser = simu2npy_grouped.count().sort_values('event_id', ascending=False)['event_id']

# Unit-width bins centred on the integers 0..max. With range=(0, max) and
# bins=max the last bin is closed on both sides, so the counts for max-1 and
# max were merged into a single bar.
max_events_per_file = int(num_event_ids_ser.max())
unit_bin_edges = np.arange(0, max_events_per_file + 2) - 0.5

fig, ax = plt.subplots(figsize=(6,3))
num_event_ids_ser.hist(ax=ax, bins=unit_bin_edges)
ax.set_yscale('log')
ax.set_xlabel('Number of events per single simulated event')
ax.set_ylabel('Number of simulation files')
fig.savefig(os.path.join(data_snippets_dir, 'all_events_num_events_per_simu_events.svg'), dpi=150)
plt.show()

In [None]:
# Overlaid histograms of the true energy, one entry per
# (simulation file, true energy) group, for four nested event selections.
fig, ax = plt.subplots(figsize=(6,3))

def _energies_per_simu_file(events_df):
    """Return the energy column (scaled by 10**6) with one entry per (file, energy) group."""
    per_file_df = events_df.groupby(['simu2npy_pathname', 'etruth_trueenergy'], as_index=False).count()
    return per_file_df['etruth_trueenergy']*10**6

# (dataframe, bar color, legend label), drawn in this order.
histogram_layers = [
    (all_simu_df, 'silver', 'All processed events'),
    (probable_all_simu_df, 'C2', 'Likely air shower events'),
    (simu_df, 'C1', 'Visible events'),
    (simu_track_df, 'C0', 'Visible track events'),
]
layer_energy_series = [_energies_per_simu_file(layer_df) for layer_df, _, _ in histogram_layers]

# All layers share one range so that their 20 bins have identical edges.
# Previously each layer was binned over its own min/max, which made the
# overlaid bars incomparable whenever the ranges differed.
pooled_energies = pd.concat(layer_energy_series)
hist_range = None if pooled_energies.empty else (pooled_energies.min(), pooled_energies.max())

for layer_energy_ser, (_, layer_color, layer_label) in zip(layer_energy_series, histogram_layers):
    layer_energy_ser.hist(ax=ax, bins=20, range=hist_range, color=layer_color, label=layer_label)

ax.set_xlabel('True primary particle energy [eV]')
ax.set_ylabel('Number of simulation files')
ax.legend()
# Save through the figure object, consistently with the other cells.
fig.savefig(os.path.join(data_snippets_dir, 'comparison_num_events_per_single_simu_per_true_energy.svg'), dpi=150)
plt.show()

#### By energy

In [None]:
# One figure per physt binning method for the true energy of simu_df events.
hist_data_series = simu_df['etruth_trueenergy']*10**6

for binning_method in ('exponential', 'human', 'scott', 'freedman'):
    figure_pathname = os.path.join(
        data_snippets_dir, 'simu_etruth_trueenergy_' + binning_method + '.svg')

    fig, ax = plt.subplots(figsize=(6,4))
    h = physt.histogram(hist_data_series, binning_method)
    h.plot(ax=ax)

    ax.grid(True)
    ax.set_xlabel('True primary particle energy [eV]')
    ax.set_ylabel('Number of event frame sequences')

    fig.savefig(figure_pathname, dpi=150)
    plt.show()

In [None]:
# One figure per physt binning method for the true energy of simu_track_df events.
hist_data_series = simu_track_df['etruth_trueenergy']*10**6

for binning_method in ('exponential', 'human', 'scott', 'freedman'):
    figure_pathname = os.path.join(
        data_snippets_dir, 'simu_track_etruth_trueenergy_' + binning_method + '.svg')

    fig, ax = plt.subplots(figsize=(6,4))
    h = physt.histogram(hist_data_series, binning_method)
    h.plot(ax=ax)

    ax.grid(True)
    ax.set_xlabel('True primary particle energy [eV]')
    ax.set_ylabel('Number of event frame sequences')

    fig.savefig(figure_pathname, dpi=150)
    plt.show()

In [None]:
# Overlay the true-energy histograms of simu_df and simu_track_df, one figure
# per physt binning method. Both layers share the same range and bins.
simu_hist_data_series = simu_df['etruth_trueenergy']*10**6
hist_data_series = simu_track_df['etruth_trueenergy']*10**6

hist_range = (
    min(simu_hist_data_series.min(), hist_data_series.min()),
    max(simu_hist_data_series.max(), hist_data_series.max()),
)

for binning_method in ('exponential', 'human', 'scott', 'freedman'):
    figure_pathname = os.path.join(
        data_snippets_dir,
        'simu_track_and_visible_simu_etruth_trueenergy_' + binning_method + '.svg')

    fig, ax = plt.subplots(figsize=(6,4))

    # The first layer chooses the bins; the second layer reuses them.
    h = physt.histogram(simu_hist_data_series, binning_method, range=hist_range)
    h.plot(ax=ax, color='C1', label="Visible events")

    h = physt.histogram(hist_data_series, range=hist_range, bins=h.bins)
    h.plot(ax=ax, alpha=1, color='C0', label="Visible track events")

    ax.grid(which='major', axis='y', linestyle='--')
    ax.set_xlabel('True primary particle energy [eV]')
    ax.set_ylabel('Number of event frame sequences')
    ax.legend()

    fig.savefig(figure_pathname, dpi=150)
    plt.show()

In [None]:
# Overlay the true-energy histograms of four event selections, one figure per
# (physt binning method, y-axis scale) combination. All layers share the same
# range and the bins chosen for the first layer.
all_simu_hist_data_series = all_simu_df['etruth_trueenergy']*10**6
probable_all_simu_hist_data_series = probable_all_simu_df['etruth_trueenergy']*10**6
simu_hist_data_series = simu_df['etruth_trueenergy']*10**6
hist_data_series = simu_track_df['etruth_trueenergy']*10**6

layered_series = [
    all_simu_hist_data_series,
    probable_all_simu_hist_data_series,
    simu_hist_data_series,
    hist_data_series,
]
hist_range = (
    min(layer_ser.min() for layer_ser in layered_series),
    max(layer_ser.max() for layer_ser in layered_series),
)

# Layers drawn on top of the first one: (series, extra plot keyword arguments).
overlay_layers = [
    (probable_all_simu_hist_data_series, dict(color='C2', label="Likely air shower events")),
    (simu_hist_data_series, dict(color='C1', label="Visible events")),
    (hist_data_series, dict(alpha=1, color='C0', label="Visible track events")),
]

for binning_method, yscale in itertools.product(
        ('exponential', 'human', 'scott', 'freedman'), ('symlog', 'linear')):
    fig, ax = plt.subplots(figsize=(6,4))

    h = physt.histogram(all_simu_hist_data_series, binning_method, range=hist_range)
    h.plot(ax=ax, color='silver', label="All processed events")

    for overlay_ser, overlay_plot_kwargs in overlay_layers:
        h = physt.histogram(overlay_ser, range=hist_range, bins=h.bins)
        h.plot(ax=ax, **overlay_plot_kwargs)

    ax.grid(which='major', axis='y', linestyle='--')

    ax.set_xlabel('True primary particle energy [eV]')
    ax.set_ylabel('Number of event frame sequences')
    ax.set_yscale(yscale)
    legend_location = 'lower left' if 'log' in yscale else 'best'
    ax.legend(loc=legend_location)

    figure_file_name = 'simu_visible_track_and_simu_track_and_simu_all_etruth_trueenergy_{}_{}.svg'.format(
        binning_method, yscale)
    fig.savefig(os.path.join(data_snippets_dir, figure_file_name), dpi=150)

    plt.show()

In [None]:
# Events above the energy threshold that are not among the 'simu_track'
# events, highest energy first.
is_above_energy_threshold = all_simu_df['etruth_trueenergy']*10**6 > 0.6*1e19
is_in_simu_track = all_simu_df['event_id'].isin(simu_track_df['event_id'])

rejected_events_df = all_simu_df[is_above_energy_threshold & ~is_in_simu_track]
rejected_events_df = rejected_events_df.sort_values('etruth_trueenergy', ascending=False)

In [None]:
# Preview the highest-energy rejected events.
rejected_events_df.head()

In [None]:
def vis_simu_signal_default(i, r, visualized_projections, fig, axs_flattened):
    """Extension hook for vis_events_df that also shows the simulated signal of a row.

    Only ``i`` and ``r`` are forwarded to ``show_simu_event_row``; the remaining
    parameters are accepted to match the hook signature and are not used here.
    """
    show_kwargs = dict(
        npy_pathname_column='simu2npy_signals_pathname',
        single_proj_width=4,
        single_proj_height=4,
        print_info=False,
        warn_if_not_exact_simu=False,
    )
    show_simu_event_row(i, r, **show_kwargs)
    
# Visualize the first rejected high-energy events (one figure of up to five events),
# with the simulated signal appended to every row through the extension hook.
# Previously tried options, kept for reference:
#     line_columns_x_y=[(angle_prop[:-3]+'rho', angle_prop)]
#     additional_printed_columns += ['num_frames_signals_ge_bg', 'simu2npy_signals_pathname_short', angle_prop]
_rejected_vis_kwargs = dict(
    events_per_figure=5,
    max_figures=1,
    vis_gtux=True,
    vis_gtuy=True,
    close_after_vis=False,
    show=True,
    additional_printed_columns=['etruth_trueenergy'],
    by_one=True,
    single_proj_width=4,
    single_proj_height=4,
    extension_func=vis_simu_signal_default,
)
vis_events_df(rejected_events_df, **_rejected_vis_kwargs)

#### By arrival direction

In [None]:
# Histogram the true azimuth and zenith angles (in degrees) of the track events,
# once per physt binning method, and save each figure as SVG.
for angle_prop, angle_prop_label in (('etruth_truephi', 'azimuth'), ('etruth_truetheta', 'zenith')):
    print('Property: {}'.format(angle_prop))
    hist_data_series = np.rad2deg(simu_track_df[angle_prop])
    for xscale in ('human', 'scott', 'freedman'):
        fig, ax = plt.subplots(figsize=(6, 4))
        h = physt.histogram(hist_data_series, xscale)
        h.plot(ax=ax)
        ax.grid(True)
        ax.set_xlabel('True {} angle [deg]'.format(angle_prop_label))
        ax.set_ylabel('Number of event frame sequences')
        _svg_name = 'simu_track_{}_{}.svg'.format(angle_prop, xscale)
        fig.savefig(os.path.join(data_snippets_dir, _svg_name), dpi=150)
        plt.show()

In [None]:
# Overlay the angle distributions of visible events and visible track events.
# The series of all processed events is not drawn here; it only widens the
# common histogram range.
for angle_prop, angle_prop_label in (('etruth_truephi', 'azimuth'), ('etruth_truetheta', 'zenith')):
    print('Property: {}'.format(angle_prop))

    all_simu_hist_data_series = np.rad2deg(all_simu_df[angle_prop])
    simu_hist_data_series = np.rad2deg(simu_df[angle_prop])
    hist_data_series = np.rad2deg(simu_track_df[angle_prop])

    _range_series = (all_simu_hist_data_series, simu_hist_data_series, hist_data_series)
    hist_range = (
        min(series.min() for series in _range_series),
        max(series.max() for series in _range_series),
    )

    for xscale, yscale in itertools.product(('human', 'scott'), ('symlog', 'linear')):
        fig, ax = plt.subplots(figsize=(6, 4))

        # The first histogram chooses the bins; the second one reuses them.
        h = physt.histogram(simu_hist_data_series, xscale, range=hist_range)
        h.plot(ax=ax, color='C1', label="Visible events")

        h = physt.histogram(hist_data_series, range=hist_range, bins=h.bins)
        h.plot(ax=ax, alpha=1, color='C0', label="Visible track events")

        ax.grid(which='major', axis='y', linestyle='--')

        ax.set_xlabel('True {} angle [deg]'.format(angle_prop_label))
        ax.set_ylabel('Number of event frame sequences')
        ax.set_yscale(yscale)
        ax.legend(loc='lower left' if 'log' in yscale else 'best')

        _svg_name = 'simu_visible_and_track_{}_{}_{}.svg'.format(angle_prop, xscale, yscale)
        fig.savefig(os.path.join(data_snippets_dir, _svg_name), dpi=150)
        plt.show()

In [None]:
# Overlay the angle distributions of four nested selections: all processed events,
# likely air shower events, visible events and visible track events.
for angle_prop, angle_prop_label in (('etruth_truephi', 'azimuth'), ('etruth_truetheta', 'zenith')):
    print('Property: {}'.format(angle_prop))

    all_simu_hist_data_series = np.rad2deg(all_simu_df[angle_prop])
    probable_all_simu_hist_data_series = np.rad2deg(probable_all_simu_df[angle_prop])
    simu_hist_data_series = np.rad2deg(simu_df[angle_prop])
    hist_data_series = np.rad2deg(simu_track_df[angle_prop])

    _range_series = (
        all_simu_hist_data_series, probable_all_simu_hist_data_series,
        simu_hist_data_series, hist_data_series,
    )
    hist_range = (
        min(series.min() for series in _range_series),
        max(series.max() for series in _range_series),
    )

    # Series drawn on top of the "all processed events" histogram, in drawing order.
    _overlays = (
        (probable_all_simu_hist_data_series, dict(color='C2', label="Likely air shower events")),
        (simu_hist_data_series, dict(color='C1', label="Visible events")),
        (hist_data_series, dict(alpha=1, color='C0', label="Visible track events")),
    )

    for xscale, yscale in itertools.product(('human', 'scott'), ('symlog', 'linear')):
        fig, ax = plt.subplots(figsize=(6, 4))

        # The first histogram chooses the bins; every overlay reuses them.
        h = physt.histogram(all_simu_hist_data_series, xscale, range=hist_range)
        h.plot(ax=ax, color='silver', label="All processed events")

        for _series, _plot_kwargs in _overlays:
            h = physt.histogram(_series, range=hist_range, bins=h.bins)
            h.plot(ax=ax, **_plot_kwargs)

        ax.grid(which='major', axis='y', linestyle='--')

        ax.set_xlabel('True {} angle [deg]'.format(angle_prop_label))
        ax.set_ylabel('Number of event frame sequences')
        ax.set_yscale(yscale)
        ax.legend(loc='lower left' if 'log' in yscale else 'best')

        _svg_name = 'simu_all_visible_track_{}_{}_{}.svg'.format(angle_prop, xscale, yscale)
        fig.savefig(os.path.join(data_snippets_dir, _svg_name), dpi=150)
        plt.show()

In [None]:
# Histogram of the true azimuth after mirroring (2*pi - phi) and folding values
# above pi back by pi, plotted in degrees for several binning methods.
for angle_prop, angle_prop_label in (('etruth_truephi', 'azimuth'),):
    print('Property: {}'.format(angle_prop))
    hist_data_series = simu_track_df[angle_prop]

    # The folded angles do not depend on the binning, so they are computed once.
    mt = 2*np.pi - np.array(hist_data_series)
    mt = np.where(mt > np.pi, mt - np.pi, mt)

    for xscale in ('human', 'scott', 'freedman'):
        fig, ax = plt.subplots(figsize=(6, 4))

        h = physt.histogram(np.rad2deg(mt), xscale)
        h.plot(ax=ax)
        ax.grid(True)
        ax.set_xlabel('True normalized {} angle [deg]'.format(angle_prop_label))
        ax.set_ylabel('Number of event frame sequences')
        _svg_name = 'simu_track_{}_{}_norm_0_180.svg'.format(angle_prop, xscale)
        fig.savefig(os.path.join(data_snippets_dir, _svg_name), dpi=150)
        plt.show()

In [None]:
# 2D histogram (10 x 10 degree bins) of true azimuth vs. true zenith angle of the
# track events, rendered with three colour-map configurations.
_azimuth_deg = np.rad2deg(simu_track_df['etruth_truephi'])
_zenith_deg = np.rad2deg(simu_track_df['etruth_truetheta'])

_hist2d_styles = (
    ('linear_viridis', dict(cmap="viridis", show_zero=True, lw=0)),
    ('linear', dict(cmap="Blues", show_zero=False, lw=0)),
    ('log', dict(cmap="Blues", cmap_normalize="log", show_zero=False, lw=0)),
)

for cfg_name, cfg in _hist2d_styles:
    fig, ax = plt.subplots(figsize=(10, 2.6))
    h = physt.h2(
        _azimuth_deg, _zenith_deg,
        'fixed_width', (10, 10),
        axis_names=['Azimuth angle', 'Zenith angle'],
    )
    h.plot(ax=ax, **cfg)

    _png_name = 'simu_track_zenith_angle_azimuth_angle_hist2d_' + cfg_name + '.png'
    fig.savefig(os.path.join(data_snippets_dir, _png_name), dpi=150)

    plt.show()

#### By altitude

In [None]:
# Distribution of the 'egeometry_pos_z' column (labelled "Altitude") for visible
# events and visible track events.
prop = 'egeometry_pos_z'
prop_label = 'Altitude'
print('Property: ' + prop)

# The column is a position, not an angle, so it is histogrammed as stored.
# (It used to be passed through np.rad2deg, a leftover from the angle cells,
# which scaled the x axis by 180/pi.)
all_simu_hist_data_series = all_simu_df[prop]
simu_hist_data_series = simu_df[prop]
hist_data_series = simu_track_df[prop]

# The series of all processed events is not drawn; it only widens the common range.
hist_range = (
    min(all_simu_hist_data_series.min(), simu_hist_data_series.min(), hist_data_series.min()), 
    max(all_simu_hist_data_series.max(), simu_hist_data_series.max(), hist_data_series.max()), 
)

for xscale in ['human', 'scott']:
    yscale = 'linear'
#     for yscale in ['symlog', 'linear']:

    fig, ax = plt.subplots(figsize=(6,4))

    # The first histogram chooses the bins; the second one reuses them.
    h = physt.histogram(simu_hist_data_series, xscale, range=hist_range)
    h.plot(ax=ax, color='C1', label="Visible events")

    h = physt.histogram(hist_data_series, range=hist_range, bins=h.bins)
    h.plot(ax=ax, alpha=1, color='C0', label="Visible track events")

    ax.grid(which='major', axis='y', linestyle='--')

    ax.set_xlabel(prop_label)
    ax.set_ylabel('Number of event frame sequences')
    ax.set_yscale(yscale)
    ax.legend(loc='lower left' if 'log' in yscale else 'best')
    fig.savefig(os.path.join(data_snippets_dir, 'simu_visible_and_track_{}_{}_{}.svg'.format(
        prop, xscale, yscale
    )), dpi=150)
    plt.show()

#### By number of frames where signal ≥ noise

In [None]:
# Distribution of the 'num_frames_signals_ge_bg' column for visible events and
# visible track events, drawn with a fixed number of equal-width bins.
prop = 'num_frames_signals_ge_bg'

simu_hist_data_series = simu_df[prop]
hist_data_series = simu_track_df[prop]

print (simu_hist_data_series.min(), hist_data_series.min())
print(simu_hist_data_series.max(), hist_data_series.max())

hist_range = (
    min(simu_hist_data_series.min(), hist_data_series.min()), 
    max(simu_hist_data_series.max(), hist_data_series.max()), 
)

# The bin count equals the largest observed value. It is handed to physt as a
# plain int: a string in that position is interpreted as the name of a binning
# method, and '<n>_bins' is not one. The string form is kept for the file name only.
num_bins = max(1, int(hist_range[1]))
xscale = '{}_bins'.format(hist_range[1])

fig, ax = plt.subplots(figsize=(6,4)) 
h = physt.histogram(simu_hist_data_series, num_bins, range=hist_range)
h.plot(ax=ax, color='C1', label="Visible events")

# Reuse the bins of the first histogram so both distributions are comparable.
h = physt.histogram(hist_data_series, range=hist_range, bins=h.bins)
h.plot(ax=ax, alpha=1, color='C0', label="Visible track events")

ax.grid(which='major', axis='y', linestyle='--')

ax.set_xlabel('Number of frames where signal is greater than or equal to noise')
ax.set_ylabel('Number of event frame sequences')
ax.legend()
fig.savefig(os.path.join(data_snippets_dir, 'simu_track_and_visible_simu_{}_{}.svg'.format(prop, xscale)), dpi=150)

plt.show()

### Extracted features

#### Phi

In [None]:
import scipy.optimize

In [54]:
def exp_func(x, a, b, c, d):
    """Return ``a * exp(-c * (x - b)) + d``: a shifted, scaled exponential decay."""
    shifted = x - b
    decay = np.exp(-c * shifted)
    return a * decay + d
def exp_func4(x, c):
    """Return ``exp(-c * x)``: exponential decay with unit amplitude and no offset."""
    amplitude, offset = 1, 0
    return amplitude * np.exp(-c * x) + offset

def calcvis_angle_stats(
    subset_df, 
    analyzed_props=(
        'proc1_x_y_hough_peak_thr1_major_line_phi',
        'proc1_x_y_hough_peak_thr1_max_peak_clu_major_line_phi',
        'proc1_x_y_hough_peak_thr2_major_line_phi',
        'proc1_x_y_hough_peak_thr2_max_peak_clu_major_line_phi',
        'proc1_x_y_hough_peak_thr3_major_line_phi',
        'proc1_x_y_hough_peak_thr3_max_peak_clu_major_line_phi',
        'proc2_x_y_hough_peak_thr1_major_line_phi',
        'proc2_x_y_hough_peak_thr1_max_peak_clu_major_line_phi',
        'proc2_x_y_hough_peak_thr2_major_line_phi',
        'proc2_x_y_hough_peak_thr2_max_peak_clu_major_line_phi',
        'proc2_x_y_hough_peak_thr3_major_line_phi',
        'proc2_x_y_hough_peak_thr3_max_peak_clu_major_line_phi',
        'proc3_x_y_hough_peak_thr1_major_line_phi', 
        'proc3_x_y_hough_peak_thr1_max_peak_clu_major_line_phi',
        'proc3_x_y_hough_peak_thr2_major_line_phi', 
        'proc3_x_y_hough_peak_thr2_max_peak_clu_major_line_phi',
        'proc3_x_y_hough_peak_thr3_major_line_phi',  
        'proc3_x_y_hough_peak_thr3_max_peak_clu_major_line_phi',
    #     'proc3_gtu_y_hough_peak_thr3_major_line_phi', # just note the observation
    #     'proc3_gtu_x_hough_peak_thr3_major_line_phi', #   when histogrammed in relation to phi sinusoidal patterns appear
        'alt1_x_y_hough_peak_thr1_major_line_phi',
        'alt1_x_y_hough_peak_thr1_max_peak_clu_major_line_phi',
        'alt1_x_y_hough_peak_thr2_major_line_phi',
        'alt1_x_y_hough_peak_thr2_max_peak_clu_major_line_phi',
        'alt1_x_y_hough_peak_thr3_major_line_phi',
        'alt1_x_y_hough_peak_thr3_max_peak_clu_major_line_phi',
        'trg_x_y_hough_peak_thr1_major_line_phi',
        'trg_x_y_hough_peak_thr1_max_peak_clu_major_line_phi',
        'trg_x_y_hough_peak_thr2_major_line_phi',
        'trg_x_y_hough_peak_thr2_max_peak_clu_major_line_phi',
        'trg_x_y_hough_peak_thr3_major_line_phi',
        'trg_x_y_hough_peak_thr3_max_peak_clu_major_line_phi',
    ), 
    savefig_prefix=None, 
    do_fit=True, 
    truephi_prop='etruth_truephi', 
    show_log=False, show_lowres=False, show_hires=True):
    """Compare estimated line orientations with the true azimuth, plot and print statistics.

    For every column name in ``analyzed_props`` the rows of ``subset_df`` where that
    column is non-null and positive are selected. The true angle (``truephi_prop``)
    and the estimate are both folded into a common half-turn range, and their
    absolute difference is wrapped so that it never exceeds pi/2. For each property
    the function draws two 2D histograms (5 and 2.5 degree bins) of true vs.
    estimated angle and 1-degree histograms of the difference (linear and symlog,
    with and without the fitted curve), and prints MAE, MSE and the 68 % / 95 %
    quantiles of the difference.

    Parameters
    ----------
    subset_df : DataFrame holding ``truephi_prop`` and every analyzed column.
    analyzed_props : iterable of column names with estimated line angles (radians).
    savefig_prefix : path prefix for the saved figures; nothing is saved when None.
    do_fit : fit ``exp_func4`` to the peak-normalized difference histogram.
    truephi_prop : column with the true angle (radians); a non-default name is
        appended to the figure file names.
    show_log, show_lowres, show_hires : which optional figures are shown
        (symlog difference histogram, 5-degree and 2.5-degree 2D histograms).

    Returns
    -------
    list of ``(angle_prop, h, c, rX, rP)`` tuples, one per analyzed property with
    at least one selected row: the last difference histogram, the fitted decay
    constant, the fit abscissa and the fitted curve scaled back to counts. ``c``,
    ``rX`` and ``rP`` are None when ``do_fit`` is False.
    """
    angle_stats = []
    
    truephi_str = '' if truephi_prop == 'etruth_truephi' else ('_'+truephi_prop)

    def get_w_fit_str(draw_fit):
        return '_w_fit' if draw_fit else ''
    
    for angle_prop in analyzed_props:
        print('='*100)
        print('Property: ' + angle_prop)

        simu_track_prop_mask = ~subset_df[angle_prop].isnull() & (subset_df[angle_prop] > 0)
        num_entries = np.count_nonzero(simu_track_prop_mask)

        print('  Entries: {:d}'.format(num_entries))

        if num_entries == 0:
            # Nothing to histogram or fit; the min/max and fit calls below would fail.
            print('  No entries, skipping')
            continue

        # Fit results stay None when no fit is requested, so the stats tuple
        # appended below is always well defined.
        c = rX = rP = None

        selected_df = subset_df[simu_track_prop_mask]

        # True angle: map negatives into [0, 2*pi), mirror, then fold by pi.
        mt = np.array(selected_df[truephi_prop])
        ot = np.array(mt)
        mt[mt < 0] = 2*np.pi + mt[mt < 0] 
        mt = 2*np.pi - mt
        mt[mt > np.pi] = mt[mt > np.pi] - np.pi
        # NOTE(original author): "this should be correct": mt = (3/2)*np.pi - ot

        # Estimated angle: rotate by a quarter turn, then fold by pi.
        me = np.array(selected_df[angle_prop])
        oe = np.array(me)
        me_pi2_gz_mask = me - np.pi/2 > 0
        me[me_pi2_gz_mask] = me[me_pi2_gz_mask] - np.pi/2
        me[~me_pi2_gz_mask] = me[~me_pi2_gz_mask] + np.pi/2
        me[me > np.pi] = me[me > np.pi] - np.pi

        deg_mt = np.rad2deg(mt)
        deg_me = np.rad2deg(me)
        
        # Absolute difference; a difference above pi/2 is replaced by the distance
        # to the other angle shifted by pi.
        md = np.abs(mt - me)
        md_pi2_mask = md > np.pi/2
        mt_gz_me_mask = mt > me
        md[md_pi2_mask & mt_gz_me_mask] = \
            np.abs(me[md_pi2_mask & mt_gz_me_mask] - (mt[md_pi2_mask & mt_gz_me_mask] - np.pi) )
        md[md_pi2_mask & ~mt_gz_me_mask] = \
            np.abs(mt[md_pi2_mask & ~mt_gz_me_mask] - (me[md_pi2_mask & ~mt_gz_me_mask] - np.pi))

        deg_md = np.rad2deg(md)

        # Squared error from the wrapped difference, squared after the conversion
        # to degrees (deg^2). The previous version wrapped only positive signed
        # differences and converted rad^2 to degrees with the linear factor.
        deg_sq_md = deg_md**2

        # Print a small sample of rows for eyeballing the normalization.
        nbef = 20
        nprev = nbef+5
        print('{:<4}\t{:<4}\t{:<4}\t{:<4}\t{:<4}\t{:<4}'.format('OTru', 'OEst', 'True', 'Est.', 'Diff', 'Sq.Diff'))
        for t, _t, e, _e, d, mse_d in zip(
                deg_mt[nbef:nprev], 
                np.rad2deg(ot[nbef:nprev]),
                deg_me[nbef:nprev],
                np.rad2deg(oe[nbef:nprev]),
                deg_md[nbef:nprev],
                deg_sq_md[nbef:nprev],
        ):
            print('{:4.2f}\t{:4.2f}\t{:4.2f}\t{:4.2f}\t{:4.2f}\t{:4.2f}'.format(_t, _e, t, e, d, mse_d))

        # -------------------- 2D histogram, 5 x 5 degree bins

        fig, ax = plt.subplots(figsize=(4, 3)) 
        h = physt.h2(
                deg_mt, deg_me,
                'fixed_width', (5, 5),
                range=(0,180),
                axis_names=['True azimuth angle', 'Estimated orientation of a line'],
            )
        
        h.plot(ax=ax, cmap="Blues", cmap_normalize="log", show_zero=False, lw=0) # viridis  plasma magma
        
        # fig.axes[1] is the second axes of the figure, added by the plot call above.
        fig.axes[1].set_ylabel('Number of entries')
                
        ax.plot([np.min(deg_mt),np.max(deg_mt)], [np.min(deg_me),np.max(deg_me)], linestyle=':', color="red", alpha=.8)
        
        if savefig_prefix is not None:
            fig.savefig(savefig_prefix + '{}_{}_azimuth_hist2d_log.png'.format(truephi_str, angle_prop), dpi=150)
        
        if show_lowres:
            plt.show()
        
        plt.close('all')
        
        # -------------------- 2D histogram, 2.5 x 2.5 degree bins

        fig, ax = plt.subplots(figsize=(4, 3)) 
        h = physt.h2(
                deg_mt, deg_me,
                'fixed_width', (2.5, 2.5),
                range=(0,180),
                axis_names=['True azimuth angle', 'Estimated orientation of a line'],
            )
        
        h.plot(ax=ax, cmap="Blues", cmap_normalize="log", show_zero=False, lw=0) # viridis  plasma magma
        
        fig.axes[1].set_ylabel('Number of entries')
                
        ax.plot([0,180], [0,180], linestyle=':', color="red", alpha=.8)
        
        if savefig_prefix is not None:
            fig.savefig(savefig_prefix + '{}_{}_azimuth_hist2d_2_5_2_5_log.png'.format(truephi_str, angle_prop), dpi=150)
        
        if show_hires:
            plt.show()
        
        plt.close('all')

        # -------------------- fit of the difference histogram

        if do_fit:
            # Fit exp(-c*x) to the 1-degree histogram normalized to its peak.
            h = physt.histogram(deg_md, 'fixed_width', 1)
            max_freq = max(h.frequencies)
            rX = h.bin_centers
            popt, pcov = scipy.optimize.curve_fit(exp_func4, h.bin_centers, h.frequencies / max_freq , [0])
            c = popt[0]
            rP = exp_func4(rX, *popt) * max_freq

        # -------------------- difference histograms: linear, then symlog

        # The linear variant always shows its first figure; the symlog variant
        # shows it only when show_log is set. Every figure is saved if requested.
        for symlog, show_first in ((False, True), (True, show_log)):
            plot_shown = False

            for draw_fit in (True, False):

                if draw_fit and not do_fit:
                    continue

                fig, ax = plt.subplots(figsize=(4, 2.5)) 
                h = physt.histogram(deg_md, 'fixed_width', 1) #'scott'
                h.plot(ax=ax)

                if draw_fit:
                    ax.plot(rX, rP , color='red', alpha=.5)

                ax.set_xlabel('Angle difference [deg]')
                ax.set_ylabel('Number of events')

                if symlog:
                    ax.set_yscale('symlog')

                if savefig_prefix is not None:
                    fig.savefig(savefig_prefix + '{}_{}_azimuth_difference_hist{}{}.png'.format(
                                    truephi_str, angle_prop, get_w_fit_str(draw_fit), 
                                    '_symlog' if symlog else ''), 
                                dpi=150)

                if show_first and not plot_shown:
                    plt.show()
                    plot_shown = True
                plt.close('all')
        
        # -----------------------

        angle_stats.append((angle_prop, h, c, rX, rP))

        if do_fit:
            print('c        ', c)
            print('c^-1     ', 1/c)
            print('ln(4/3)/c', np.log(4/3)/c)
        
        print('MAE', np.mean(deg_md))
        print('MSE', np.mean(deg_sq_md))
        print('68  ', np.quantile(deg_md, 0.68))
        print('95  ', np.quantile(deg_md, 0.95))

    return angle_stats


In [None]:
# Angle statistics for a single estimated-angle column of the track events.
# NOTE(review): truephi_prop='' makes the function read a column named '' as the
# true angle and adds an extra '_' to the file names. This looks unintended; confirm.
_single_prop = ('proc2_x_y_hough_peak_thr1_max_peak_clu_major_line_phi',)
_stats_prefix = os.path.join(data_snippets_dir, 'simu_track_x_y_phi_angle_stats')
angle_stats = calcvis_angle_stats(
    simu_track_df,
    analyzed_props=_single_prop,
    truephi_prop='',
    savefig_prefix=_stats_prefix,
)

In [None]:
# Angle statistics of the track events for the function's default list of columns.
_stats_prefix = os.path.join(data_snippets_dir, 'simu_track_x_y_phi_angle_stats')
angle_stats = calcvis_angle_stats(simu_track_df, savefig_prefix=_stats_prefix)

In [None]:
# Same statistics for the pure-noise sample, restricted to the thr3 columns and
# without the exponential fit. The trailing semicolon suppresses the cell output.
_noise_props = (
    'proc1_x_y_hough_peak_thr3_major_line_phi',
    'proc3_x_y_hough_peak_thr3_major_line_phi',
    'alt1_x_y_hough_peak_thr3_major_line_phi',
    'trg_x_y_hough_peak_thr3_major_line_phi',
)
_noise_prefix = os.path.join(data_snippets_dir, 'simu_pure_noise_x_y_phi_angle_stats')
calcvis_angle_stats(simu_pure_noise_df, _noise_props, do_fit=False, savefig_prefix=_noise_prefix);

In [None]:
# Draw the fitted curve of every analyzed property into one figure; the legend
# goes into a separate, axis-less figure because of the long property names.
fig, ax = plt.subplots()
handles, angle_props = [], []
for angle_prop, _hist, _decay, rX, rP in angle_stats:
    handles.append(ax.plot(rX, rP, label=angle_prop)[0])
    angle_props.append(angle_prop)
_comparison_svg = os.path.join(
    data_snippets_dir, 'simu_track_x_y_phi_angle_stats__fits_line_comparison.svg')
fig.savefig(_comparison_svg, dpi=150)
plt.show()

# Legend-only figure reusing the line handles from the plot above.
fig, ax = plt.subplots(figsize=(12, 4))
ax.legend(handles, angle_props, loc='center', fontsize='large', mode='expand', ncol=1)
ax.set_axis_off()
_legend_svg = os.path.join(
    data_snippets_dir, 'simu_track_x_y_phi_angle_stats__fits_line_comparison_legend.svg')
fig.savefig(_legend_svg, dpi=150)
plt.show()

### GTU Phi

In [None]:
# List the columns of simu_track_df that belong to the proc1 gtu-y thr3 Hough peak.
list(filter(lambda name: name.startswith('proc1_gtu_y_hough_peak_thr3_'), simu_track_df.columns))

In [None]:
def make_line2d_dot(markerfacecolor='red', marker='o', color='white', **kwargs):
    """Build a single-point Line2D usable as a legend handle for a scatter series.

    The line colour defaults to white and the marker is filled with
    ``markerfacecolor``; extra keyword arguments are passed on to ``Line2D``.
    """
    return mpl.lines.Line2D(range(1), range(1), color=color, marker=marker, markerfacecolor=markerfacecolor, **kwargs)


def legend_for_scatterplots(ax, make_handle_func=make_line2d_dot, apply=True, legend_kwargs=None):
    """Create opaque legend entries for every collection drawn on ``ax``.

    Each collection contributes one handle built from its first face colour with
    the alpha channel forced to 1, so the legend stays readable when the points
    themselves are drawn semi-transparent.

    Parameters
    ----------
    ax : axes whose ``collections`` are inspected.
    make_handle_func : callable receiving the RGBA face colour and returning a handle.
    apply : when true, call ``ax.legend`` and return its result; otherwise return
        the ``(handles, labels)`` lists.
    legend_kwargs : optional dict of keyword arguments for ``ax.legend``.
    """
    # None instead of a shared mutable {} default.
    if legend_kwargs is None:
        legend_kwargs = {}

    handles = []
    labels = []
    for coll in ax.collections:
        # Copy the colour so that the collection itself is not modified.
        markerfacecolor = np.array(coll.get_facecolors()[0])
        markerfacecolor[3] = 1  # alpha=1
        handles.append(make_handle_func(markerfacecolor))
        labels.append(coll.get_label())

    if apply:
        return ax.legend(handles, labels, **legend_kwargs)
    return handles, labels

def norm_truephi(prop_ser):
    """Return a copy of the angles (radians) rotated by a quarter turn and folded by pi.

    Values above pi/2 are lowered by pi/2, all others are raised by pi/2; any
    result that still exceeds pi is then lowered by pi. The input is not modified.
    """
    raw = np.array(prop_ser)
    half_pi = np.pi/2
    # astype keeps the dtype of the input, as in-place assignment would.
    rotated = np.where(raw - half_pi > 0, raw - half_pi, raw + half_pi).astype(raw.dtype)
    return np.where(rotated > np.pi, rotated - np.pi, rotated).astype(raw.dtype)

def _select_gtu_dims(subset_df, first_dim_prop, second_dim_prop,
                     norm_angle_first_dim, norm_angle_second_dim):
    """Return ``(mask, first_dim, second_dim)`` for the usable rows of ``subset_df``.

    Rows need non-null values in both columns; a column that is going to be
    normalized additionally needs a positive value. Normalized columns are passed
    through ``norm_truephi`` and converted to degrees, other columns are returned
    as stored.
    """
    mask = ~subset_df[first_dim_prop].isnull() & ~subset_df[second_dim_prop].isnull()
    if norm_angle_first_dim:
        mask &= (subset_df[first_dim_prop] > 0)
    if norm_angle_second_dim:
        mask &= (subset_df[second_dim_prop] > 0)

    selected_df = subset_df[mask]

    first_dim = selected_df[first_dim_prop]
    if norm_angle_first_dim:
        first_dim = np.rad2deg(norm_truephi(first_dim))

    second_dim = selected_df[second_dim_prop]
    if norm_angle_second_dim:
        second_dim = np.rad2deg(norm_truephi(second_dim))

    return mask, first_dim, second_dim


def calcvis_gtu_angle_stats(
    simu_subset_df, noise_subset_df, analyzed_props=(    
        ('proc1_gtu_y_hough_peak_thr1_major_line_phi', 'proc1_gtu_y_hough_peak_thr1_line_clusters_max_peak_clu_width'),
        ('proc1_gtu_x_hough_peak_thr1_major_line_phi', 'proc1_gtu_x_hough_peak_thr1_line_clusters_max_peak_clu_width'),
    ),
    norm_angle_first_dim=True, first_dim_bins=180, norm_angle_second_dim=False, second_dim_bins=90,
    draw_fit=True, 
    figsize=(7, 7), left=0.1, width=0.7, bottom=0.1, height=0.70, hspacing=0.005, 
    savefig_prefix=None):
    """Scatter plot with marginal histograms for pairs of columns, noise vs. simulations.

    For every ``(first_dim_prop, second_dim_prop)`` pair one figure is drawn: a
    scatter plot of both subsets, a density histogram of the first column above
    it (log y axis) and one of the second column to its right (log x axis). Both
    subsets share the histogram ranges, which span the values of all subsets.

    Parameters
    ----------
    simu_subset_df, noise_subset_df : data frames to compare; either may be None
        and is then skipped.
    analyzed_props : iterable of ``(first_dim_prop, second_dim_prop)`` column pairs.
    norm_angle_first_dim, norm_angle_second_dim : normalize the respective column
        with ``norm_truephi`` and convert it to degrees (only positive values are
        kept in that case).
    first_dim_bins, second_dim_bins : bin counts of the marginal histograms.
    draw_fit : accepted but not used by this function.
    figsize, left, width, bottom, height, hspacing : figure size and layout of the
        three axes in figure-relative coordinates.
    savefig_prefix : path prefix for the saved figures; nothing is saved when None.

    Returns
    -------
    list
        Currently always empty: no per-property statistics are collected yet. A
        local list is returned so that the function neither fails on an undefined
        name nor hands back an unrelated notebook-level ``angle_stats``.
    """
    angle_stats = []

    subsets = (
        ('Noise', noise_subset_df, 0.6, 'C0'),
        ('Simulations', simu_subset_df, 0.5, 'C1'),
    )
    
    for first_dim_prop, second_dim_prop in analyzed_props:
        
        print('Properties: {}, {}'.format(first_dim_prop, second_dim_prop))
        
        # A fresh figure per property pair, so that axes of a previous pair (or of
        # an earlier run) are never drawn over.
        fig = plt.figure(figsize=figsize)
        
        fig_size_inches = fig.get_size_inches()
        # Same physical gap vertically as horizontally.
        vspacing = hspacing * fig_size_inches[0] / fig_size_inches[1]

        rect_scatter = [left, bottom, width, height]
        rect_histx = [left, bottom + height + vspacing, width, 1 - (bottom + height + vspacing)]
        rect_histy = [left + width + hspacing, bottom, 1 - (left + width + hspacing), height]

        ax_scatt = fig.add_axes(rect_scatter)
        ax_angle_hist = fig.add_axes(rect_histx, sharex=ax_scatt)
        ax_clu_size_hist = fig.add_axes(rect_histy, sharey=ax_scatt)
        
        # First pass: common value ranges over all subsets.
        first_dim_range = [0, 0]
        second_dim_range = [0, 0]
        dim_range_is_init = False
    
        for _, subset_df, _, _ in subsets:
            if subset_df is None:
                continue

            _, first_dim, second_dim = _select_gtu_dims(
                subset_df, first_dim_prop, second_dim_prop,
                norm_angle_first_dim, norm_angle_second_dim)

            if len(first_dim) == 0:
                # min/max of an empty selection is undefined.
                continue
            
            first_dim_min = np.min(first_dim)
            first_dim_max = np.max(first_dim)
            second_dim_min = np.min(second_dim)
            second_dim_max = np.max(second_dim)
            
            if not dim_range_is_init or first_dim_range[0] > first_dim_min:
                first_dim_range[0] = first_dim_min
            if not dim_range_is_init or first_dim_range[1] < first_dim_max:
                first_dim_range[1] = first_dim_max
            if not dim_range_is_init or second_dim_range[0] > second_dim_min:
                second_dim_range[0] = second_dim_min
            if not dim_range_is_init or second_dim_range[1] < second_dim_max:
                second_dim_range[1] = second_dim_max
            
            dim_range_is_init = True
            
        # Second pass: draw every subset into the three axes.
        for subset_label, subset_df, subset_alpha, subset_color in subsets:
            if subset_df is None:
                continue
                
            print('  Subset: ' + subset_label)

            simu_track_prop_mask, first_dim, second_dim = _select_gtu_dims(
                subset_df, first_dim_prop, second_dim_prop,
                norm_angle_first_dim, norm_angle_second_dim)
                
            print('     Entries: {:d}'.format(np.count_nonzero(simu_track_prop_mask)))

            if len(first_dim) == 0:
                # A density histogram of no data is undefined; nothing to draw.
                continue

            ax_angle_hist.hist(
                first_dim, 
                bins=first_dim_bins, alpha=subset_alpha, density=True, 
                label=subset_label, color=subset_color,
                range=first_dim_range
            )
            ax_angle_hist.set_yscale('log')
            ax_angle_hist.set_ylabel('Norm. ev. count') # \textnumero
        
            ax_clu_size_hist.hist(
                second_dim, 
                bins=second_dim_bins, alpha=subset_alpha, density=True, 
                label=subset_label, color=subset_color,
                orientation='horizontal',
                range=second_dim_range
            )
            ax_clu_size_hist.set_xscale('log')
            ax_clu_size_hist.set_xlabel('Norm. ev. count')
        
            # The points are far more transparent than the histograms.
            ax_scatt.scatter(
                first_dim, second_dim,
                color=subset_color,
                label=subset_label,
                alpha=subset_alpha/5
            )
            ax_scatt.set_ylabel(second_dim_prop)
            ax_scatt.set_xlabel(first_dim_prop)
        
        ax_scatt.tick_params(which='both', axis='both', direction='in', top=True, right=True)

        # The marginal histograms share their value axis with the scatter plot, so
        # the duplicated tick labels and axis labels are removed.
        ax_angle_hist.tick_params(which='both', axis='both', direction='in', labelbottom=False)
        ax_angle_hist.set_xlabel('')

        ax_clu_size_hist.tick_params(which='both', axis='both', direction='in', labelleft=False)
        ax_clu_size_hist.set_ylabel('')
        
        legend_for_scatterplots(ax_scatt)
        
        if savefig_prefix is not None:
            fig.savefig(savefig_prefix + '_{}_{}_scatter_w_dist.png'.format(first_dim_prop, second_dim_prop), 
                        dpi=150)
            
        plt.show()
        plt.close(fig)

        print('='*100)
            
    return angle_stats
# # fig.savefig(os.path.join(data_snippets_dir, 
# #                          'simu_track_zenith_angle_azimuth_angle_hist2d.png'), dpi=150)

# plt.show()

In [None]:
# Close every open matplotlib figure before the next cell creates new ones.
plt.close('all')

In [None]:
# Major-line phi vs. width of the max-peak line cluster, for every processing
# variant (proc1-3, alt1, trg), Hough-peak threshold (thr1-3) and GTU projection
# (gtu_y first, then gtu_x).  The pairs are generated in exactly that nesting
# order and passed as a tuple of (first_dim_prop, second_dim_prop) tuples.
calcvis_gtu_angle_stats(
    simu_track_df,
    simu_pure_noise_df,
    tuple(
        (stem + '_major_line_phi', stem + '_line_clusters_max_peak_clu_width')
        for stem in (
            '{}_{}_hough_peak_thr{}'.format(prefix, proj, thr_i)
            for prefix in ('proc1', 'proc2', 'proc3', 'alt1', 'trg')
            for thr_i in (1, 2, 3)
            for proj in ('gtu_y', 'gtu_x')
        )
    ),
    savefig_prefix=os.path.join(data_snippets_dir, 'simu_track_and_simu_pure_noise'),
);

In [None]:
# Major-line phi vs. size of the max-peak line cluster.  One pair per
# (processing variant, threshold, GTU projection) combination; itertools.product
# iterates the rightmost factor fastest, which reproduces the order
# variant -> threshold -> projection (gtu_y before gtu_x).
calcvis_gtu_angle_stats(
    simu_track_df,
    simu_pure_noise_df,
    tuple(
        (
            '{}_{}_hough_peak_thr{}_major_line_phi'.format(prefix, proj, thr_i),
            '{}_{}_hough_peak_thr{}_line_clusters_max_peak_clu_size'.format(prefix, proj, thr_i),
        )
        for prefix, thr_i, proj in itertools.product(
            ('proc1', 'proc2', 'proc3', 'alt1', 'trg'),
            (1, 2, 3),
            ('gtu_y', 'gtu_x'),
        )
    ),
    savefig_prefix=os.path.join(data_snippets_dir, 'simu_track_and_simu_pure_noise'),
);

In [None]:
# Close every open matplotlib figure (presumably to release the memory held by
# the figures produced by the two preceding plotting cells -- confirm).
plt.close('all')

In [None]:
# Major-line phi vs. number of line clusters, for each processing variant,
# Hough-peak threshold and GTU projection (gtu_y before gtu_x within a
# threshold).  The second dimension is histogrammed with 20 bins.
calcvis_gtu_angle_stats(
    simu_track_df,
    simu_pure_noise_df,
    tuple(
        (stem + '_major_line_phi', stem + '_line_clusters_count')
        for stem in (
            '{prefix}_{proj}_hough_peak_thr{thr_i}'.format(prefix=prefix, proj=proj, thr_i=thr_i)
            for prefix in ('proc1', 'proc2', 'proc3', 'alt1', 'trg')
            for thr_i in range(1, 4)
            for proj in ('gtu_y', 'gtu_x')
        )
    ),
    second_dim_bins=20,
    savefig_prefix=os.path.join(data_snippets_dir, 'simu_track_and_simu_pure_noise'),
);

In [None]:
# Width of the max-peak line cluster vs. number of line clusters.  Neither
# property is an angle here, so angle normalisation of the first dimension is
# switched off and explicit bin counts are given for both dimensions.
# Pair order: processing variant -> threshold -> projection (gtu_y, gtu_x).
calcvis_gtu_angle_stats(
    simu_track_df,
    simu_pure_noise_df,
    tuple(
        (
            '{}_{}_hough_peak_thr{}_line_clusters_max_peak_clu_width'.format(prefix, proj, thr_i),
            '{}_{}_hough_peak_thr{}_line_clusters_count'.format(prefix, proj, thr_i),
        )
        for prefix, thr_i, proj in itertools.product(
            ('proc1', 'proc2', 'proc3', 'alt1', 'trg'),
            (1, 2, 3),
            ('gtu_y', 'gtu_x'),
        )
    ),
    norm_angle_first_dim=False,
    first_dim_bins=90,
    second_dim_bins=20,
    savefig_prefix=os.path.join(data_snippets_dir, 'simu_track_and_simu_pure_noise'),
);

In [None]:
# Drop all figures that are still open before producing the next series.
plt.close('all')

# Major-line phi vs. the `num_gtu` column, for each processing variant,
# Hough-peak threshold and GTU projection (gtu_y before gtu_x).  The second
# dimension gets 2 * (largest num_gtu over both data frames) + 1 bins; only the
# first dimension is angle-normalised.
calcvis_gtu_angle_stats(
    simu_track_df,
    simu_pure_noise_df,
    tuple(
        ('{}_{}_hough_peak_thr{}_major_line_phi'.format(prefix, proj, thr_i), 'num_gtu')
        for prefix in ('proc1', 'proc2', 'proc3', 'alt1', 'trg')
        for thr_i in (1, 2, 3)
        for proj in ('gtu_y', 'gtu_x')
    ),
    first_dim_bins=180 + 1,
    second_dim_bins=max(simu_track_df.num_gtu.max(), simu_pure_noise_df.num_gtu.max()) * 2 + 1,
    norm_angle_first_dim=True,
    norm_angle_second_dim=False,
    savefig_prefix=os.path.join(data_snippets_dir, 'simu_track_and_simu_pure_noise'),
);

In [None]:
# calcvis_gtu_angle_stats(simu_track_df, None);

In [None]:
# calcvis_gtu_angle_stats(None, simu_pure_noise_df);

In [None]:
# calcvis_gtu_angle_stats(simu_pure_noise_df);

#### GTU-... Phi difference

In [None]:
# For every processing variant (proc1..proc3), selected Hough-peak threshold and
# phi-difference type, draw a 2D histogram of the gtu_x vs. gtu_y
# "major_line_phi_diff_<type>" columns of `subset_df` (converted from radians to
# degrees) and save it as an SVG into `data_snippets_dir`.
subset_df = simu_track_df


diff_columns_projs = ['gtu_x', 'gtu_y']
diff_columns_diff_types = ['pi_over_2', '0']

# Not used in this cell; kept because other cells may rely on the name.
diff_columns_gtu_y_gtu_x_diff_format = '{prefix}_gtu_y_gtu_x_hough_peak_thr{thr_i}_major_line_phi_diff'
diff_columns_proj_diff_format = '{prefix}_{proj}_hough_peak_thr{thr_i}_major_line_phi_diff_{diff_type}'

# Half-open (start, stop) ranges that are unpacked into range().
diff_columns_proc_range = (1,4)
diff_columns_thr_range = (3,4) #(1,4)

diff_columns_prefixes = \
    ['proc{}'.format(i) for i in range(*diff_columns_proc_range)]

for prefix in diff_columns_prefixes:
    for thr_i in range(*diff_columns_thr_range):
#         common_extension_columns.append(diff_columns_gtu_y_gtu_x_diff_format.format(prefix=prefix, thr_i=thr_i))
        for diff_type in diff_columns_diff_types:
            # One column name per projection; these become the two histogram axes.
            prop_names = [diff_columns_proj_diff_format.format(prefix=prefix, thr_i=thr_i, proj=proj, diff_type=diff_type) \
                          for proj in diff_columns_projs]

            # Report the columns being plotted.  This has to happen after
            # `prop_names` is built: `proj` only exists inside the comprehension
            # above, so formatting with it here would raise NameError (or pick up
            # a stale global left over from another cell).
            for prop_name in prop_names:
                print(prop_name)

            fig, ax = plt.subplots(figsize=(4.8,4.2))  #(6, 5) # figsize=(8, 6)
            h = physt.h2(
                    *[np.rad2deg(subset_df[prop_name]) \
                      for prop_name in prop_names
                     ],
                    'fixed_width', (1.25, 1.25),
                    range=(0,90),
                    axis_names=prop_names,
                )
            h.plot(ax=ax, cmap="Blues", cmap_normalize="log", show_zero=False, lw=0) # viridis  plasma magma

            # The axis labels are full column names, hence the small font.
            ax.xaxis.label.set_size(8)
            ax.yaxis.label.set_size(8)

            # fig.axes[1] is presumably the colorbar axes added by h.plot -- confirm.
            fig.axes[1].set_ylabel('Number of entries')

            fig.savefig(os.path.join(data_snippets_dir, '{}__{}_1_25_bins.svg'.format(*prop_names)), dpi=150)
            plt.show()

### Relation between two features

#### Top features from classifiers

Top features from ver4_machine_learning_w_labeled_flight_201906122_2.ipynb: 
```
1     proc1_x_y_hough_peak_thr1_line_clusters_clu_heights_max               0.0352   0.826245  ||||||
2     proc1_x_y_hough_peak_thr1_line_clusters_max_area_clu_width            0.0318   0.839115  ||||||||
3     proc2_x_y_hough_peak_thr1_line_clusters_max_area_clu_width            0.0272   0.865225  ||||||||||
4     proc1_x_y_hough_peak_thr1_major_line_phi                              0.0255   0.886833  |||||||||||||
5     proc3_x_y_hough_peak_thr1_line_clusters_max_area_clu_width            0.0246   0.885753  |||||||||||||
6     proc1_x_y_hough_peak_thr3_line_clusters_max_peak_clu_size             0.0238   0.886730  |||||||||||||
7     proc2_x_y_hough_peak_thr1_line_clusters_sizes_max                     0.0215   0.883457  ||||||||||||
8     proc3_x_y_hough_peak_thr1_line_clusters_max_size_clu_width            0.0208   0.883036  ||||||||||||
9     alt1_x_y_hough_peak_thr1_major_line_phi                               0.0203   0.887794  |||||||||||||
10    proc1_x_y_hough_peak_thr1_line_clusters_sizes_max                     0.0191   0.888682  |||||||||||||
11    proc2_x_y_hough_peak_thr1_major_line_phi                              0.0177   0.888736  |||||||||||||
12    proc2_x_y_hough_peak_thr1_line_clusters_max_peak_clu_size             0.0169   0.888474  |||||||||||||
13    proc1_x_y_hough_peak_thr1_line_clusters_max_size_clu_height           0.0168   0.889518  |||||||||||||
14    proc1_x_y_hough_peak_thr1_line_clusters_max_size_clu_width            0.0157   0.890628  |||||||||||||
15    proc3_x_y_hough_peak_thr1_major_line_phi                              0.0155   0.892701  |||||||||||||
16    proc3_x_y_hough_peak_thr1_line_clusters_clu_heights_max               0.0154   0.896667  ||||||||||||||
17    proc3_x_y_hough_peak_thr1_line_clusters_sizes_max                     0.0154   0.898320  ||||||||||||||
18    alt1_x_y_hough_peak_thr1_line_clusters_clu_heights_max                0.0150   0.897415  ||||||||||||||
19    proc2_x_y_hough_peak_thr1_line_clusters_clu_heights_max               0.0123   0.899538  ||||||||||||||
20    alt1_x_y_hough_peak_thr1_line_clusters_max_size_clu_width             0.0119   0.904400  |||||||||||||||
21    proc2_x_y_hough_peak_thr1_line_clusters_max_sum_clu_width             0.0119   0.904004  ||||||||||||||
22    proc2_x_y_hough_peak_thr3_line_clusters_max_peak_clu_size             0.0115   0.908018  |||||||||||||||
23    proc1_x_y_hough_peak_thr1_line_clusters_max_peak_clu_width            0.0115   0.907810  |||||||||||||||
24    proc1_x_y_hough_peak_thr1_line_clusters_clu_widths_max                0.0112   0.909092  |||||||||||||||
25    proc1_x_y_clusters_sizes_max                                          0.0108   0.914271  ||||||||||||||||
26    proc2_x_y_hough_peak_thr1_line_clusters_max_size_clu_width            0.0102   0.916296  ||||||||||||||||
27    proc1_x_y_clusters_sizes_min                                          0.0100   0.916864  ||||||||||||||||
28    alt1_x_y_hough_peak_thr1_line_clusters_max_area_clu_width             0.0100   0.919767  ||||||||||||||||
29    proc1_x_y_hough_peak_thr1_line_clusters_clu_areas_max                 0.0100   0.921139  ||||||||||||||||
30    proc1_x_y_hough_peak_thr1_line_clusters_max_area_clu_height           0.0097   0.921740  ||||||||||||||||
31    proc3_x_y_hough_peak_thr3_major_line_coord_1_x                        0.0095   0.924703  |||||||||||||||||
```

(the association between the feature name and accuracy is not guaranteed)

Top features from ver4_machine_learning_w_labeled_flight_20190628_2.ipynb: 
```
1     proc1_x_y_hough_peak_thr1_line_clusters_max_sum_clu_width             0.0305   0.794179  |||||
2     alt1_x_y_hough_peak_thr1_line_clusters_max_size_clu_width             0.0238   0.859615  ||||||||||||
3     alt1_x_y_hough_peak_thr1_line_clusters_clu_widths_max                 0.0223   0.859610  ||||||||||||
4     proc1_x_y_hough_peak_thr1_line_clusters_max_area_clu_height           0.0215   0.865060  ||||||||||||
5     proc1_x_y_hough_peak_thr1_line_clusters_clu_widths_max                0.0202   0.868033  |||||||||||||
6     proc1_x_y_hough_peak_thr1_line_clusters_max_area_clu_width            0.0199   0.871295  |||||||||||||
7     alt1_x_y_hough_peak_thr1_line_clusters_max_sum_clu_width              0.0199   0.875662  ||||||||||||||
8     proc1_x_y_hough_peak_thr1_line_clusters_max_peak_clu_width            0.0187   0.879528  ||||||||||||||
9     trg_x_y_clusters_max_size_clu_height                                  0.0177   0.886067  |||||||||||||||
10    proc2_x_y_hough_peak_thr1_line_clusters_max_size_clu_width            0.0169   0.888426  |||||||||||||||
11    proc2_x_y_hough_peak_thr1_line_clusters_count                         0.0163   0.891369  |||||||||||||||
12    proc2_x_y_hough_peak_thr2_line_clusters_count                         0.0158   0.897147  ||||||||||||||||
13    proc1_x_y_clusters_max_peak_clu_size                                  0.0149   0.899759  ||||||||||||||||
14    proc1_x_y_clusters_sizes_min                                          0.0143   0.900337  ||||||||||||||||
15    proc3_x_y_hough_peak_thr1_line_clusters_count                         0.0138   0.902639  ||||||||||||||||
16    proc3_x_y_hough_peak_thr1_line_clusters_max_sum_clu_width             0.0134   0.905293  |||||||||||||||||
17    proc2_x_y_hough_peak_thr1_line_clusters_max_sum_clu_width             0.0134   0.905644  |||||||||||||||||
18    proc3_x_y_hough_peak_thr1_line_clusters_max_size_clu_width            0.0134   0.905305  |||||||||||||||||
19    proc2_x_y_hough_peak_thr1_line_clusters_clu_widths_max                0.0129   0.905299  |||||||||||||||||
20    proc1_x_y_clusters_sizes_max                                          0.0116   0.905530  |||||||||||||||||
21    proc2_x_y_hough_peak_thr1_line_clusters_max_area_clu_width            0.0114   0.904932  |||||||||||||||||
22    alt1_gtu_y_clusters_max_size_clu_width                                0.0112   0.905565  |||||||||||||||||
23    proc1_x_y_hough_peak_thr1_line_clusters_max_size_clu_width            0.0105   0.906498  |||||||||||||||||
24    proc3_x_y_hough_peak_thr1_line_clusters_max_area_clu_width            0.0104   0.908102  |||||||||||||||||
25    alt1_x_y_hough_peak_thr1_line_clusters_count                          0.0098   0.908935  |||||||||||||||||
26    proc1_x_y_hough_peak_thr1_line_clusters_max_size_clu_height           0.0098   0.908691  |||||||||||||||||
27    orig_size                                                             0.0098   0.908860  |||||||||||||||||
28    proc2_x_y_hough_peak_thr3_line_clusters_count                         0.0096   0.908639  |||||||||||||||||
29    proc3_x_y_hough_peak_thr2_line_clusters_count                         0.0095   0.909718  |||||||||||||||||
30    bg_x_y_sum                                                            0.0094   0.911368  |||||||||||||||||
31    proc2_x_y_clusters_sizes_max                                          0.0094   0.910932  |||||||||||||||||
32    trg_gtu_y_clusters_max_size_clu_height                                0.0093   0.911041  |||||||||||||||||
33    proc1_x_y_hough_peak_thr1_line_clusters_max_sum_clu_height            0.0090   0.910698  |||||||||||||||||
34    proc2_x_y_hough_peak_thr1_line_clusters_max_peak_clu_width            0.0089   0.911776  |||||||||||||||||
35    proc2_gtu_x_clusters_max_peak_clu_width                               0.0089   0.912840  |||||||||||||||||
36    proc2_gtu_y_clusters_max_peak_clu_width                               0.0088   0.913672  ||||||||||||||||||
37    trg_gtu_y_clusters_max_peak_clu_width                                 0.0088   0.915336  ||||||||||||||||||
38    proc1_gtu_y_clusters_clu_areas_max                                    0.0085   0.914241  ||||||||||||||||||
39    proc1_x_y_hough_peak_thr1_line_clusters_count                         0.0084   0.915762  ||||||||||||||||||
40    proc1_x_y_hough_peak_thr1_line_clusters_max_sum_clu_size              0.0082   0.915416  ||||||||||||||||||
41    trg_x_y_clusters_max_peak_clu_width                                   0.0080   0.915524  ||||||||||||||||||
42    proc1_gtu_y_clusters_max_size_clu_height                              0.0079   0.916438  ||||||||||||||||||
43    proc3_x_y_hough_peak_thr1_line_clusters_clu_widths_max                0.0079   0.917631  ||||||||||||||||||
44    proc1_x_y_hough_peak_thr2_line_clusters_clu_widths_max                0.0079   0.918485  ||||||||||||||||||
45    trg_x_y_clusters_max_size_clu_width                                   0.0078   0.920194  ||||||||||||||||||
46    proc1_x_y_hough_peak_thr2_line_clusters_count                         0.0078   0.921580  ||||||||||||||||||
47    alt1_x_y_hough_peak_thr1_line_clusters_max_peak_clu_width             0.0077   0.923485  |||||||||||||||||||
48    proc1_x_y_hough_peak_thr1_line_clusters_max_area_clu_size             0.0076   0.924160  |||||||||||||||||||
49    trg_max                                                               0.0075   0.926883  |||||||||||||||||||
```
(the association between the feature name and accuracy is not guaranteed)

#### Plots

In [None]:
# Pairwise plots of consecutive entries among the top-ranked x-y features.
# zip() pairs the i-th first-dimension property with the i-th second-dimension
# property; neither dimension is treated as an angle and both use 90 bins.
calcvis_gtu_angle_stats(
    simu_track_df,
    simu_pure_noise_df,
    tuple(zip(
        # first-dimension properties
        (
            'proc1_x_y_hough_peak_thr1_line_clusters_clu_heights_max',
            'proc1_x_y_hough_peak_thr1_line_clusters_max_area_clu_width',
            'proc2_x_y_hough_peak_thr1_line_clusters_max_area_clu_width',
        ),
        # second-dimension properties
        (
            'proc1_x_y_hough_peak_thr1_line_clusters_max_area_clu_width',
            'proc2_x_y_hough_peak_thr1_line_clusters_max_area_clu_width',
            'proc3_x_y_hough_peak_thr1_line_clusters_max_area_clu_width',
        ),
    )),
    first_dim_bins=90,
    second_dim_bins=90,
    norm_angle_first_dim=False,
    norm_angle_second_dim=False,
    savefig_prefix=os.path.join(data_snippets_dir, 'simu_track_and_simu_pure_noise'),
);

In [None]:
# TODO
# proc1_x_y_hough_peak_thr1_line_clusters_max_peak_clu_width  !!!

In [None]:
# proc1 x-y major-line phi against the max-area cluster width of proc2 and
# proc3, with angle normalisation disabled for both dimensions.
calcvis_gtu_angle_stats(
    simu_track_df,
    simu_pure_noise_df,
    tuple(
        ('proc1_x_y_hough_peak_thr1_major_line_phi', width_prop)
        for width_prop in (
            'proc2_x_y_hough_peak_thr1_line_clusters_max_area_clu_width',
            'proc3_x_y_hough_peak_thr1_line_clusters_max_area_clu_width',
        )
    ),
    norm_angle_first_dim=False,
    norm_angle_second_dim=False,
    savefig_prefix=os.path.join(data_snippets_dir, 'simu_track_and_simu_pure_noise'),
);

In [None]:
# Close every open matplotlib figure (presumably to release the memory held by
# the figures from the feature-relation plots above -- confirm).
plt.close('all')

In [None]:
# angle_prop = ''

# simu_track_prop_mask = ~simu_track_df[angle_prop].isnull() & (simu_track_df[angle_prop] > 0)

#     fig, ax = plt.subplots(figsize=(10, 8)) 
#     h = physt.h2(
#             simu_track_df[simu_track_prop_mask]['etruth_truephi'], 
#             simu_track_df[simu_track_prop_mask][angle_prop],
#             'fixed_width', (10, 10),
#             axis_names=['True azimuth angle', 'Estimated orientation of a line'],
#         )
#     h.plot(ax=ax, cmap="Blues", cmap_normalize="log", show_zero=False, lw=0) # viridis  plasma magma
    
#     plt.show()

In [None]:
# 18 proc3_gtu_x_hough_peak_thr1_major_line_phi
# 61 proc2_gtu_x_hough_peak_thr2_major_line_phi
# 65 proc3_gtu_x_hough_peak_thr2_major_line_phi
# 69 proc2_gtu_y_hough_peak_thr1_major_line_phi
# 73 proc3_gtu_y_hough_peak_thr2_major_line_phi
# 74 proc3_gtu_y_hough_peak_thr3_major_line_phi
# 77 alt1_gtu_y_hough_peak_thr1_major_line_phi
# 369 proc1_gtu_x_hough_peak_thr1_major_line_phi
# 386 proc1_gtu_x_hough_peak_thr2_major_line_phi
# 406 proc1_gtu_x_hough_peak_thr3_major_line_phi

In [None]:
# combined_simu_df[combined_simu_df['cond_selection_combined'] == 'simu_track'][['etruth_truephi', 'proc3_gtu_x_hough_peak_thr1_major_line_phi']]