#### Comparing Reference Data ####

**Ensure that you are in the root of tardis-refdata!**

The use the comparer to load the reference data. Use teardown to delete the reference data afterwards

In [28]:
from __future__ import print_function
import pandas as pd
import subprocess
import tempfile
import shlex, os, shutil
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

ImportError: cannot import name 'reindex'

In [2]:
cd /Users/nathan/Documents/Code/tardis-refdata


/Users/nathan/Documents/Code/tardis-refdata


In [3]:
pwd # you should be in the root

'/Users/nathan/Documents/Code/tardis-refdata'

In [None]:
def highlight_missing(val):
    if val == True:
        return 'background-color: #BCF5A9'
    else:
        return 'background-color: #F5A9A9'
    
def highlight_relative_difference(val):
    ret = 'background-color: #BCF5A9'
    if val > 1e-2:
        ret = 'background-color: #F2F5A9'
    if val > 1e-1:
        ret = 'background-color: #F5D0A9'
    if val > 1:
        ret = 'background-color: #F5A9A9'
    return ret

In [152]:
class ReferenceComparer(object):

    def __init__(self, ref1_hash=None, ref2_hash=None, compare_path='unit_test_data.h5'):
        assert not ((ref1_hash is None) and (ref2_hash is None)), "One hash can not be None"
        self.ref1_hash = ref1_hash
        self.ref2_hash = ref2_hash
        self.compare_path = compare_path
        #subprocess.Popen('git --work-tree=/Users/wkerzend/tmp/ checkout upstream/pr/8 {cpath}', shell=True)
        self.tmp_dir = None
        self.setup()
    
    def setup(self):
        self.tmp_dir = tempfile.mkdtemp()
        print('Created temporary directory at {0}. Delete after use with .teardown'.format(self.tmp_dir))
        for ref_id, ref_hash in enumerate([self.ref1_hash, self.ref2_hash]):
            ref_id += 1
            if ref_hash is not None:
                self._copy_data_from_hash(ref_hash, 'ref{0}_'.format(ref_id))
            else:
                subprocess.Popen('cp {0} {1}'.format(self.compare_path, 
                                                     os.path.join(self.tmp_dir, 
                                                                  'ref{0}_{1}'.format(ref_id, self.compare_path))), 
                                                     shell=True)
            setattr(self, 'ref{0}_fname'.format(ref_id), 
                    os.path.join(self.tmp_dir, 'ref{0}_{1}'.format(ref_id, self.compare_path)))
    def teardown(self):
        shutil.rmtree(self.tmp_dir)
    def _copy_data_from_hash(self, ref_hash, prefix):
        git_cmd = ['git']
        git_cmd.append('--work-tree={0}'.format(self.tmp_dir))
        git_cmd += ['checkout', ref_hash, self.compare_path]
        p = subprocess.Popen(git_cmd)
        p.wait()
        shutil.move(os.path.join(self.tmp_dir, self.compare_path), 
                    os.path.join(self.tmp_dir, prefix + self.compare_path))

    def generate_test_table(self):
        rd1_hdfs = pd.HDFStore(self.ref1_fname, mode='r')
        rd2_hdfs = pd.HDFStore(self.ref2_fname, mode='r')
        rd1_keys = rd1_hdfs.keys()
        rd2_keys = rd2_hdfs.keys()
        rd1_hdfs.close()
        rd2_hdfs.close()
        rd1_df = pd.DataFrame(index=rd1_keys, columns=['exists'])
        rd2_df = pd.DataFrame(index=rd2_keys, columns=['exists'])
        rd1_df['exists'] = True
        rd2_df['exists'] = True
        joined_df = rd1_df.join(rd2_df, how='outer', lsuffix='_1', rsuffix='_2')
        joined_df = joined_df.fillna(False)
        return joined_df
    
    def compare_refdata(self, test_table):
        test_table['match'] = None
        test_table['abs_diff_mean'] = None
        test_table['abs_diff_max'] = None
        test_table['rel_diff_mean'] = None
        test_table['rel_diff_max'] = None
        for row_id, row in test_table.iterrows():
            if row[['exists_1', 'exists_2']].all():
                ref1_df = pd.read_hdf(self.ref1_fname, row_id)
                ref2_df = pd.read_hdf(self.ref2_fname, row_id)
                print(ref2_df)
                if isinstance(ref1_df, pd.Series):
                    try:
                        pd.util.testing.assert_series_equal(ref1_df, ref2_df)
                    except AssertionError:
                        test_table.loc[row_id, 'match'] = False
                        abs_diff = np.fabs(ref1_df - ref2_df)
                        rel_diff = (abs_diff / np.fabs(ref1_df))[ref1_df != 0]
                        test_table.loc[row_id, 'abs_diff_mean'] = abs_diff.mean()
                        test_table.loc[row_id, 'abs_diff_max'] = abs_diff.max()
                        test_table.loc[row_id, 'rel_diff_mean'] = rel_diff.mean()
                        test_table.loc[row_id, 'rel_diff_max'] = rel_diff.max()
                    else:
                        test_table.loc[row_id, 'match'] = True

                elif isinstance(ref1_df, pd.DataFrame):
                    try:
                        pd.util.testing.assert_frame_equal(ref1_df, ref2_df)
                    except AssertionError:
                        test_table.loc[row_id, 'match'] = False
                        abs_diff = np.fabs(ref1_df - ref2_df)
                        rel_diff = (abs_diff / np.fabs(ref1_df))[ref1_df != 0]
                        test_table.loc[row_id, 'abs_diff_mean'] = abs_diff.mean(skipna=True).mean()
                        test_table.loc[row_id, 'abs_diff_max'] = abs_diff.max(skipna=True).max()
                        test_table.loc[row_id, 'rel_diff_mean'] = rel_diff.mean(skipna=True).mean()
                        test_table.loc[row_id, 'rel_diff_max'] = rel_diff.max(skipna=True).max()
                    else:
                        test_table.loc[row_id, 'match'] = True

                else:
                    raise ValueError('Needs to be a Series or DataFrame but is' + str(type(ref1_df)))
        return test_table
                

In [153]:
#you can add ref1_hash and ref2_hash. If either is set to None it will just use the current data in the directory
# other than that you can use any git label (e.g. hash, tag, branch name)
comparer = ReferenceComparer(ref1_hash='upstream/pr/11')

Created temporary directory at /var/folders/p9/vg8qxdhn1yx839m0vj77tq700000gn/T/tmp9n50uymj. Delete after use with .teardown


In [154]:
#comparer.teardown()

In [155]:
tt = comparer.generate_test_table()

In [156]:
tt = comparer.compare_refdata(tt)

                 0     1     2     3     4     5     6     7     8     9   \
atomic_number                                                               
8              0.19  0.19  0.19  0.19  0.19  0.19  0.19  0.19  0.19  0.19   
12             0.03  0.03  0.03  0.03  0.03  0.03  0.03  0.03  0.03  0.03   
14             0.52  0.52  0.52  0.52  0.52  0.52  0.52  0.52  0.52  0.52   
16             0.19  0.19  0.19  0.19  0.19  0.19  0.19  0.19  0.19  0.19   
18             0.04  0.04  0.04  0.04  0.04  0.04  0.04  0.04  0.04  0.04   
20             0.03  0.03  0.03  0.03  0.03  0.03  0.03  0.03  0.03  0.03   

                 10    11    12    13    14    15    16    17    18    19  
atomic_number                                                              
8              0.19  0.19  0.19  0.19  0.19  0.19  0.19  0.19  0.19  0.19  
12             0.03  0.03  0.03  0.03  0.03  0.03  0.03  0.03  0.03  0.03  
14             0.52  0.52  0.52  0.52  0.52  0.52  0.52  0.52  0.52  0.52  
16 

0     7.542804e-14
1     5.728475e-14
2     4.396074e-14
3     3.406287e-14
4     2.663135e-14
5     2.099596e-14
6     1.668287e-14
7     1.335311e-14
8     1.076154e-14
9     8.729085e-15
10    7.123652e-15
11    5.846921e-15
12    4.825093e-15
13    4.002324e-15
14    3.336039e-15
15    2.793540e-15
16    2.349550e-15
17    1.984397e-15
18    1.682677e-15
19    1.432260e-15
dtype: float64
0     2.929904e+09
1     2.233626e+09
2     1.718583e+09
3     1.334658e+09
4     1.045840e+09
5     8.257762e+08
6     6.566420e+08
7     5.258841e+08
8     4.240821e+08
9     3.437620e+08
10    2.806206e+08
11    2.302670e+08
12    1.899100e+08
13    1.574609e+08
14    1.312046e+08
15    1.097878e+08
16    9.227420e+07
17    7.786166e+07
18    6.597177e+07
19    5.609639e+07
dtype: float64
atomic_number  ion_number  level_number
8              0           0               0.000000e+00
                           1               3.143848e-14
                           2               4.508775e-14
  

                                    0             1             2   \
atomic_number ion_number                                             
8             0           9.975603e+02  4.616053e+02  2.449330e+02   
              1           5.392130e+08  4.093102e+08  3.139188e+08   
              2           2.142490e+05  3.647079e+05  4.688993e+05   
              3           2.398524e-08  1.226711e-07  3.037647e-07   
              4           0.000000e+00  0.000000e+00  0.000000e+00   
              5           0.000000e+00  0.000000e+00  0.000000e+00   
              6           0.000000e+00  0.000000e+00  0.000000e+00   
              7           0.000000e+00  0.000000e+00  0.000000e+00   
              8           0.000000e+00  0.000000e+00  0.000000e+00   
12            0           8.386981e-07  2.676280e-07  1.043017e-07   
              1           4.288168e+02  1.951875e+02  1.028105e+02   
              2           5.606568e+07  4.257990e+07  3.267618e+07   
              3     

                                                 0             1   \
atomic_number ion_number level_number                               
8             0          0             5.000000e+00  5.000000e+00   
                         1             2.938856e+00  2.939705e+00   
                         2             9.708997e-01  9.713022e-01   
                         3             6.343615e-01  6.530091e-01   
                         4             1.231709e-02  1.310099e-02   
                         5             3.393176e-04  3.882406e-04   
                         6             1.373175e-04  1.579867e-04   
                         7             3.821426e-05  4.476261e-05   
                         8             6.367369e-05  7.458501e-05   
                         9             8.910054e-05  1.043698e-04   
                         10            2.944015e-05  3.461144e-05   
                         11            4.906336e-05  5.768160e-05   
                         12       

                                               0           1           2   \
atomic_number ion_number level_number                                       
8             0          0             521.841000  240.944635  127.712137   
                         1             306.723102  141.661243   75.097504   
                         2             101.331058   46.806010   24.814294   
                         3              66.207164   31.467807   16.910313   
                         4               1.285513    0.631323    0.344572   
                         5               0.035414    0.018709    0.010571   
                         6               0.014332    0.007613    0.004313   
                         7               0.003988    0.002157    0.001232   
                         8               0.006646    0.003594    0.002054   
                         9               0.009299    0.005029    0.002874   
                         10              0.003073    0.001668    0.000955   

                0
0       (8, 0, 0)
1       (8, 0, 1)
2       (8, 0, 2)
3       (8, 0, 3)
4       (8, 0, 4)
5       (8, 0, 5)
6       (8, 0, 6)
7       (8, 0, 7)
8       (8, 0, 8)
9       (8, 0, 9)
10     (8, 0, 10)
11     (8, 0, 11)
12     (8, 0, 12)
13     (8, 0, 13)
14     (8, 0, 14)
15     (8, 0, 15)
16     (8, 0, 16)
17     (8, 0, 17)
18     (8, 0, 18)
19     (8, 0, 19)
20     (8, 0, 20)
21     (8, 0, 21)
22     (8, 0, 22)
23     (8, 0, 23)
24     (8, 0, 24)
25     (8, 0, 25)
26     (8, 0, 26)
27     (8, 0, 27)
28     (8, 0, 28)
29     (8, 0, 29)
...           ...
4405  (20, 8, 53)
4406  (20, 8, 54)
4407  (20, 8, 55)
4408  (20, 8, 56)
4409  (20, 8, 57)
4410  (20, 8, 58)
4411  (20, 8, 59)
4412  (20, 8, 60)
4413  (20, 8, 61)
4414  (20, 8, 62)
4415  (20, 8, 63)
4416  (20, 8, 64)
4417  (20, 8, 65)
4418  (20, 8, 66)
4419  (20, 8, 67)
4420  (20, 8, 68)
4421  (20, 8, 69)
4422  (20, 8, 70)
4423   (20, 9, 0)
4424  (20, 10, 0)
4425  (20, 11, 0)
4426  (20, 12, 0)
4427  (20, 13, 0)
4428  (20,

0        2577
1        2577
2        2577
3        2578
4        2577
5        2577
6        2577
7        2578
8        2578
9        2577
10       1576
11       1576
12       1576
13       2577
14       1577
15       1577
16       1578
17       1578
18       1577
19       1577
20       1578
21       2578
22       1576
23       1576
24       4352
25       1579
26       1580
27       1580
28       1577
29       2577
         ... 
29194    3765
29195    3837
29196    3825
29197    3826
29198    3838
29199    3825
29200    3823
29201    3823
29202    3864
29203    3744
29204    3877
29205    3807
29206    3843
29207    3836
29208    3844
29209    3845
29210    3840
29211    3836
29212    3836
29213    3842
29214    3807
29215    3886
29216    3885
29217    3886
29218    3879
29219    3884
29220    3885
29221    3886
29222    3846
29223    3885
Length: 29224, dtype: int64
0        2613
1        2612
2        2611
3        2612
4        2610
5        2609
6        2608
7        2609
8     

                                     0              1              2   \
atomic_number ion_number                                                
8             1            1.583706e+15   1.980580e+15   2.202625e+15   
              2            1.164158e+06   1.990229e+06   2.567041e+06   
              3            3.280037e-04   7.512899e-04   1.113341e-03   
              4            5.060622e-15   1.615536e-14   2.802593e-14   
              5            1.382061e-30   7.553864e-30   1.691295e-29   
              6            3.163698e-42   2.470303e-41   6.550977e-41   
              7            0.000000e+00   0.000000e+00   0.000000e+00   
              8            0.000000e+00   0.000000e+00   0.000000e+00   
12            1            1.498027e+18   1.629037e+18   1.694012e+18   
              2            3.830705e+14   4.872627e+14   5.462158e+14   
              3            1.254729e-14   4.175167e-14   7.386925e-14   
              4            9.713599e-29   4.963721e

TypeError: unsupported operand type(s) for -: 'bytes' and 'str'

In [100]:
tt[["exists_1", "exists_2", 'rel_diff_mean', 'rel_diff_max']].style.applymap(
    highlight_missing, subset=['exists_1', 'exists_2', 'match']).applymap(
    highlight_relative_difference, subset=['rel_diff_mean', 'rel_diff_max'])

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


TypeError: ("'>' not supported between instances of 'NoneType' and 'float'", 'occurred at index rel_diff_mean')

<pandas.io.formats.style.Styler at 0x1179e5c18>

## Detailed Inspection of the Reference Data

If parts of the reference data show differences between revisions, you should invest some time examining these differences in detail. Often, visualizing the relevant data blocks already helps. You can use the following plotting routines as a blueprint and adjust and extend them to your needs.

In [None]:
def compare_output_nu(df1, df2):
    nu_min = np.min([df1.min(), df2.min()])
    nu_max = np.max([df1.max(), df2.max()])
    
    plt.figure(figsize=(14, 6))
    plt.subplot(121)
    plt.plot(df1, df2, ',')
    plt.xlabel("output_nu, ref 1")
    plt.ylabel("output_nu, ref 2")
    plt.subplot(122)
    plt.hist(df1, bins=np.linspace(nu_min, nu_max, 100), histtype="step", label="ref 1")
    plt.hist(df2, bins=np.linspace(nu_min, nu_max, 100), histtype="step", label="ref 2")
    plt.xlabel("output_nu")
    plt.legend(frameon=False)
    
def compare_spectrum(ref1_nu, ref1_L, ref2_nu, ref2_L):
    plt.figure(figsize=(14, 6))
    plt.subplot(121)
    plt.plot(ref1_nu, ref1_L, label="ref 1")
    plt.plot(ref2_nu, ref2_L, label="ref 2")
    plt.xlabel("nu")
    plt.ylabel("L")
    plt.legend(frameon=False)
    plt.subplot(122)
    plt.plot(ref1_nu, ref1_L / ref2_L)
    plt.xlabel("nu")
    plt.ylabel("L ref 1 / L ref 2")

Get the data and find all the entries for which differences exist

In [None]:
tmp1 = pd.HDFStore(comparer.ref1_fname, "r")
tmp2 = pd.HDFStore(comparer.ref2_fname, "r")

diff_entries = tt.loc[(tt["match"] == False) & (tt["exists_1"] == True) & (tt["exists_2"] == True)].index

Start the visual exploration

In [None]:
compare_output_nu(tmp1['/test_simulation/output_nu'], tmp2['/test_simulation/output_nu'])

In [None]:
compare_spectrum(tmp1['/test_runner_simple/spectrum/_frequency'][:-1], 
                 tmp1['/test_runner_simple/spectrum/luminosity'],
                 tmp2['/test_runner_simple/spectrum/_frequency'][:-1], 
                 tmp2['/test_runner_simple/spectrum/luminosity'])