Merge pull request #717 from tomato42/more-stats

More statistics
tlsfuzzer · Nov 5, 2020 · 5b32de5 · 5b32de5
2 parents 23fa103 + 153efb8
commit 5b32de5
Show file tree

Hide file tree

Showing 5 changed files with 431 additions and 181 deletions.
diff --git a/build-requirements-analysis.txt b/build-requirements-analysis.txt
@@ -1,4 +1,4 @@
 numpy>=1.15.0
-scipy
+scipy>=1.5.0
 matplotlib>=3.3.2
 pandas
diff --git a/docs/source/timing-analysis.rst b/docs/source/timing-analysis.rst
@@ -400,8 +400,11 @@ and ``diff_ecdf_plot_zoom_in_10.png`` show just the central 98, 33, and 10
 percentiles respectively of the graph (to make estimating small differences
 between samples easier).
 
-Finally, the ``conf_interval_plot.png`` shows the mean of differences between
-samples together with
+Finally, the ``conf_interval_plot_mean.png``,
+``conf_interval_plot_median.png``, ``conf_interval_plot_trim_mean_05.png``,
+``conf_interval_plot_trim_mean_25.png``, and ``conf_interval_plot_trimean.png``
+show the mean, median, trimmed mean (5%), trimmed mean (25%), and trimean
+respectively, of the differences between samples together with
 `bootstrapped
 <https://en.wikipedia.org/wiki/Bootstrapping_(statistics)>`_ confidence
 interval for them.
@@ -422,6 +425,28 @@ second is the uniformity test of those results, third is the Friedman test.
    samples (at least 5, optimally 10). You should ignore it for such small
    runs. It's also invalid in case of just two samples (used conversations).
 
+The sign test is performed in three different ways: the default, used for
+determining presence of the timing side-channel, is the two-sided variant,
+saved in the ``report.csv`` file as the ``Sign test``. The two other ways,
+the ``Sign test less`` and ``Sign test greater`` test the hypothesis that
+the one sample stochastically dominates the other. High p-values here aren't
+meaningful (i.e. you can get a p-value == 1 even if the alternative is not
+statistically significant even at alpha=0.05).
+Very low values of a ``Sign test less`` mean that the *second* sample
+is unlikely to be smaller than the *first* sample.
+Those tests are more sensitive than the confidence intervals for median, so
+you can use them to test whether the timing signal depends on some
+parameters, like the length of pre-master secret in RSA key exchange or place
+of the first mismatched byte in CBC MAC.
+
+The code also calculates the
+`dependent t-test for paired samples
+<https://en.wikipedia.org/wiki/Student%27s_t-test#Dependent_t-test_for_paired_samples>`_,
+but as the timings generally don't follow the normal distribution, it severely
+underestimates the difference between samples (it is strongly influenced by
+outliers). The results from it are not taken into account to decide failure of
+the overall timing test.
+
 If either the KS-tests of uniformity of p-values, or the Friedman test fails,
 you should inspect the individual test p-values.
 
@@ -436,15 +461,32 @@ slower than another set by 10%), then you can also use the generated
 ``box_plot.png`` graph to see it.
 For small differences with large sample sizes, the differences will be
 statistically detectable, even if not obvious from from the box plot.
-You can use the ``conf_interval_plot.png`` graph to see the average difference
+You can use the ``conf_interval_plot*.png`` graphs to see the difference
 between samples and the first sample together with the 95% confidence
 interval for them.
 
-The script prints the numerical value for confidence interval for mean and
-median for differences of the pair of two most dissimilar probes.
-It also writes it to the ``report.txt`` file.
-
-Using R you can also manually generate ``conf_interval_plot.png`` graph,
+The script prints the numerical value for confidence interval for mean, median,
+trimmed mean (with 5% of observations on either end ignored), trimmed mean
+(with 25% of smallest and biggest observations ignored), and trimean of
+differences of the pair of two most dissimilar probes.
+It also writes them to the ``report.txt`` file.
+
+The ``report.csv`` file includes the exact p-values for the statistical
+tests executed as well as the calculated descriptive statistics of
+distribution of differences: the mean, standard deviation (SD), median,
+interquartile range (IQR), as well as the
+`median absolute deviation
+<https://en.wikipedia.org/wiki/Median_absolute_deviation>`_ (MAD).
+Note that the mean and SD are very sensitive to outliers, the other three
+measures are more robust. The calculated MAD already includes the conversion
+factor so for a normal distribution it can be compared directly to SD.
+
+The ``sample_stats.csv`` file includes the calculated mean, median, and MAD
+for the samples themselves (i.e. not the differences between samples).
+You can use this data to estimate the smallest detectable difference between
+samples for a given sample size.
+
+Using R you can also manually generate ``conf_interval_plot_mean.png`` graph,
 but note that this will take about an hour for 21 tests and
 samples with 1 million observations each on a 4 core/8 thread 2GHz CPU:
 

diff --git a/requirements-timing.txt b/requirements-timing.txt
@@ -1,5 +1,5 @@
 dpkt>=1.9.2
 numpy>=1.15.0
-scipy
+scipy>=1.5.0
 matplotlib>=3.3.2
 pandas
diff --git a/tests/test_tlsfuzzer_analysis.py b/tests/test_tlsfuzzer_analysis.py
@@ -14,7 +14,7 @@
 
 failed_import = False
 try:
-    from tlsfuzzer.analysis import Analysis, main, TestPair, help_msg, _DATA
+    from tlsfuzzer.analysis import Analysis, main, TestPair, help_msg
     import pandas as pd
     import numpy as np
 except ImportError:
@@ -60,13 +60,13 @@ def test_report(self):
                                                 analysis = Analysis("/tmp")
                                                 ret = analysis.generate_report()
 
-                                                self.mock_read_csv.assert_called_once()
+                                                self.mock_read_csv.assert_called()
                                                 #mock_ecdf.assert_called_once()
                                                 #mock_box.assert_called_once()
                                                 #mock_scatter.assert_called_once()
-                                                # we're writing to report.csv, legend.csv, and
-                                                # report.txt
-                                                self.assertEqual(mock_open.call_count, 3)
+                                                # we're writing to report.csv, legend.csv,
+                                                # sample_stats.csv, and report.txt
+                                                self.assertEqual(mock_open.call_count, 4)
                                                 self.assertEqual(ret, 0)
 
     def test_report_multithreaded(self):
@@ -84,13 +84,13 @@ def test_report_multithreaded(self):
                                                     multithreaded_graph=True)
                                                 ret = analysis.generate_report()
 
-                                                self.mock_read_csv.assert_called_once()
+                                                self.mock_read_csv.assert_called()
                                                 #mock_ecdf.assert_called_once()
                                                 #mock_box.assert_called_once()
                                                 #mock_scatter.assert_called_once()
-                                                # we're writing to report.csv, legend.csv, and
-                                                # report.txt
-                                                self.assertEqual(mock_open.call_count, 3)
+                                                # we're writing to report.csv, legend.csv,
+                                                # sample_stats.csv, and report.txt
+                                                self.assertEqual(mock_open.call_count, 4)
                                                 self.assertEqual(ret, 0)
 
     def test_report_neq(self):
@@ -111,13 +111,13 @@ def test_report_neq(self):
                                                 analysis = Analysis("/tmp")
                                                 ret = analysis.generate_report()
 
-                                                mock_read_csv.assert_called_once()
+                                                mock_read_csv.assert_called()
                                                 #mock_ecdf.assert_called_once()
                                                 #mock_box.assert_called_once()
                                                 #mock_scatter.assert_called_once()
                                                 # we're writing to report.csv, legend.csv,
-                                                # and report.txt
-                                                self.assertEqual(mock_open.call_count, 3)
+                                                # sample_stats.csv, and report.txt
+                                                self.assertEqual(mock_open.call_count, 4)
                                                 self.assertEqual(ret, 1)
 
     def test_report_error_in_box_plot(self):
@@ -267,12 +267,70 @@ def test_wilcoxon_test(self):
                 self.assertGreaterEqual(result, 0.25)
 
     def test__wilcox_test(self):
-        with mock.patch("tlsfuzzer.analysis._DATA", self.neq_data):
+        pval = Analysis._wilcox_test(self.neq_data.iloc[:,0],
+                                     self.neq_data.iloc[:,1])
+        self.assertGreaterEqual(0.05, pval)
+
+    def test_sign_test(self):
+        with mock.patch("tlsfuzzer.analysis.Analysis.load_data", self.mock_read_csv):
+            analysis = Analysis("/tmp")
+            self.mock_read_csv.assert_called_once()
 
-            ret = Analysis._wilcox_test((0, 1))
-            pair, pval = ret
-            self.assertEqual(pair, (0, 1))
-            self.assertGreaterEqual(0.05, pval)
+            res = analysis.sign_test()
+            self.assertEqual(len(res), 3)
+            for index, result in res.items():
+                self.assertEqual(result, 1)
+
+    def test__sign_test(self):
+        pval = Analysis._sign_test(self.neq_data.iloc[:, 0],
+                                   self.neq_data.iloc[:, 1],
+                                   0, "two-sided")
+        self.assertLess(pval, 0.002)
+
+    def test_sign_test_with_alternative_less(self):
+        with mock.patch("tlsfuzzer.analysis.Analysis.load_data", self.mock_read_csv):
+            analysis = Analysis("/tmp")
+            self.mock_read_csv.assert_called_once()
+
+            res = analysis.sign_test(alternative="less")
+            self.assertEqual(len(res), 3)
+            for index, result in res.items():
+                self.assertEqual(result, 0.5)
+
+    def test_sign_test_with_alternative_less_and_neq_data(self):
+        with mock.patch("tlsfuzzer.analysis.Analysis.load_data") as load_data:
+            load_data.return_value = self.neq_data
+            analysis = Analysis("/tmp")
+
+            res = analysis.sign_test(alternative="less")
+            self.assertEqual(len(res), 1)
+            for index, result in res.items():
+                self.assertLessEqual(result, 0.001)
+
+    def test_sign_test_with_alternative_greater_and_neq_data(self):
+        with mock.patch("tlsfuzzer.analysis.Analysis.load_data") as load_data:
+            load_data.return_value = self.neq_data
+            analysis = Analysis("/tmp")
+
+            res = analysis.sign_test(alternative="greater")
+            self.assertEqual(len(res), 1)
+            for index, result in res.items():
+                self.assertLessEqual(result, 1)
+
+    def test_rel_t_test(self):
+        with mock.patch("tlsfuzzer.analysis.Analysis.load_data", self.mock_read_csv):
+            analysis = Analysis("/tmp")
+            self.mock_read_csv.assert_called_once()
+
+            res = analysis.rel_t_test()
+            self.assertEqual(len(res), 3)
+            for index, result in res.items():
+                self.assertGreaterEqual(result, 0.25)
+
+    def test__rel_t_test(self):
+        pval = Analysis._rel_t_test(self.neq_data.iloc[:,0],
+                                     self.neq_data.iloc[:,1])
+        self.assertGreaterEqual(0.05, pval)
 
     def test_box_test(self):
         with mock.patch("tlsfuzzer.analysis.Analysis.load_data", self.mock_read_csv):
@@ -354,6 +412,7 @@ def setUp(self):
         mock_read_csv.return_value = timings
         with mock.patch("tlsfuzzer.analysis.Analysis.load_data", mock_read_csv):
             self.analysis = Analysis("/tmp")
+        self.analysis.load_data = mock_read_csv
 
     def test_ecdf_plot(self):
         with mock.patch("tlsfuzzer.analysis.FigureCanvas.print_figure",
@@ -410,7 +469,7 @@ def test_box_plot(self):
     @mock.patch("tlsfuzzer.analysis.os.remove")
     @mock.patch("tlsfuzzer.analysis.shutil.copyfile")
     def test__calc_percentiles(self, mock_copyfile, mock_remove, mock_memmap):
-        mock_memmap.return_value = self.analysis.data.values
+        mock_memmap.return_value = self.analysis.load_data()
 
         ret = self.analysis._calc_percentiles()
 
@@ -426,7 +485,17 @@ def test_conf_interval_plot(self):
             with mock.patch("__main__.__builtins__.open", mock.mock_open())\
                     as mock_open:
                 self.analysis.conf_interval_plot()
-                mock_save.assert_called_once()
+                self.assertEqual(mock_save.call_args_list,
+                    [mock.call('/tmp/conf_interval_plot_mean.png',
+                               bbox_inches='tight'),
+                     mock.call('/tmp/conf_interval_plot_median.png',
+                               bbox_inches='tight'),
+                     mock.call('/tmp/conf_interval_plot_trim_mean_05.png',
+                               bbox_inches='tight'),
+                     mock.call('/tmp/conf_interval_plot_trim_mean_25.png',
+                               bbox_inches='tight'),
+                     mock.call('/tmp/conf_interval_plot_trimean.png',
+                               bbox_inches='tight')])
 
 
 @unittest.skipIf(failed_import,
@@ -531,11 +600,11 @@ def test_load_data(self, convert_mock, open_mock, read_csv_mock,
 
         a = Analysis("/tmp")
 
-        self.assertTrue(a.data.equals(self.df))
+        self.assertTrue(a.load_data().equals(self.df))
 
-        convert_mock.assert_called_once_with()
-        read_csv_mock.assert_called_once_with("/tmp/legend.csv")
-        memmap_mock.assert_called_once_with(
+        convert_mock.assert_called_with()
+        read_csv_mock.assert_called_with("/tmp/legend.csv")
+        memmap_mock.assert_called_with(
             "/tmp/timing.bin", dtype=np.float64, mode="r", shape=(10, 2),
             order="C")
 
@@ -584,14 +653,17 @@ def test__convert_to_binary_with_noop(self, open_mock, read_csv_mock,
 
         a = Analysis("/tmp")
 
-        self.assertTrue(a.data.equals(self.df))
+        self.assertTrue(a.load_data().equals(self.df))
 
-        read_csv_mock.assert_called_once_with("/tmp/legend.csv")
-        memmap_mock.assert_called_once_with(
+        read_csv_mock.assert_called_with("/tmp/legend.csv")
+        memmap_mock.assert_called_with(
             "/tmp/timing.bin", dtype=np.float64, mode="r", shape=(10, 2),
             order="C")
         self.assertEqual(isfile_mock.call_args_list,
             [mock.call("/tmp/timing.bin"),
+             mock.call("/tmp/legend.csv"),
+             mock.call("/tmp/timing.bin.shape"),
+             mock.call("/tmp/timing.bin"),
              mock.call("/tmp/legend.csv"),
              mock.call("/tmp/timing.bin.shape")])