Skip to content

Tighten up rules for namedexpr parens #86

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 28 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
82bf0ed
Tighten up rules for namedexpr parens
dflook Apr 14, 2023
a9bdc14
Fix expression lists
dflook Apr 16, 2023
246bcea
Fix
dflook Apr 16, 2023
c252d00
Tighten up rules for namedexpr parens
dflook Apr 16, 2023
6f4e1a7
Use compressed corpus
dflook Apr 16, 2023
3088a82
expression_list can't have unparenthesized unpacking
dflook Apr 16, 2023
61451f5
Return chooses which expression list or starred list depending on cur…
dflook Apr 16, 2023
a9ef593
Return doesn't need to return a value
dflook Apr 16, 2023
00d3be2
Fix for python <3.9
dflook Apr 16, 2023
7ad7c76
Resume from existing corpus results file
dflook Apr 17, 2023
efb7b6e
Github actions is dogshit
dflook Apr 17, 2023
f20acda
Github actions is dogshit
dflook Apr 17, 2023
78a430c
Github actions is dogshit
dflook Apr 17, 2023
a2499af
Github actions is dogshit
dflook Apr 17, 2023
eb5491a
Tuple assignment target doesn't need parenthesizing
dflook Apr 17, 2023
0af651f
Tweak assignment
dflook Apr 17, 2023
243017b
progress
dflook Apr 18, 2023
140daa1
Add more tests for Python3.11
dflook Apr 19, 2023
82a7d1a
Test with python 2.7 and 3.6+
dflook Apr 19, 2023
ae1dd9d
Fix ifexp in ifexp and namedexpr in root of formattedvalue
dflook Apr 19, 2023
a725584
Fix for namedexpr in tuples
dflook Apr 20, 2023
84490d9
Fix for python<3.8
dflook Apr 20, 2023
6200fee
Fix namedexpr in slices
dflook Apr 21, 2023
8e9ef72
Fix corpus test for python2
dflook Apr 21, 2023
378e8f4
Fix with statements
dflook Apr 21, 2023
ffa15d1
Fix NamedExpr in dict
dflook Apr 21, 2023
920a3ea
Fix complex slices
dflook Apr 22, 2023
9fa57e2
Fix python2 ellipses in slice
dflook Apr 22, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test_corpus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ on:
type: boolean
description: 'Regenerate results'
required: true
default: true
default: false
workflow_call:
inputs:
ref:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/xtest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:

- name: Run tests
run: |

if [[ "${{ matrix.python }}" == "python3.4" ]]; then
(cd /usr/lib64/python3.4/test && python3.4 make_ssl_certs.py)
elif [[ "${{ matrix.python }}" == "python3.5" ]]; then
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ docs/source/transforms/*.min.py
.circleci-config.yml
.coverage
.mypy_cache/
NOTES.md
166 changes: 121 additions & 45 deletions corpus_test/generate_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from result import Result, ResultReader

ENHANCED_REPORT = os.environ.get('ENHANCED_REPORT', False)
ENHANCED_REPORT = os.environ.get('ENHANCED_REPORT', True)


@dataclass
Expand Down Expand Up @@ -64,6 +64,9 @@ def mean_percent_of_original(self) -> float:
def larger_than_original(self) -> Iterable[Result]:
"""Return those entries that have a larger minified size than the original size"""
for result in self.entries.values():
if result.outcome != 'Minified':
continue

if result.original_size < result.minified_size:
yield result

Expand Down Expand Up @@ -91,10 +94,18 @@ def compare_size_increase(self, base: 'ResultSet') -> Iterable[Result]:
"""

for result in self.entries.values():
if result.outcome != 'Minified':
# This result was not minified, so we can't compare
continue

if result.corpus_entry not in base.entries:
continue

base_result = base.entries[result.corpus_entry]
if base_result.outcome != 'Minified':
# The base result was not minified, so we can't compare
continue

if result.minified_size > base_result.minified_size:
yield result

Expand All @@ -104,10 +115,17 @@ def compare_size_decrease(self, base: 'ResultSet') -> Iterable[Result]:
"""

for result in self.entries.values():
if result.outcome != 'Minified':
continue

if result.corpus_entry not in base.entries:
continue

base_result = base.entries[result.corpus_entry]
if base_result.outcome != 'Minified':
# The base result was not minified, so we can't compare
continue

if result.minified_size < base_result.minified_size:
yield result

Expand Down Expand Up @@ -164,6 +182,103 @@ def format_difference(compare: Iterable[Result], base: Iterable[Result]) -> str:
else:
return s

def report_larger_than_original(results_dir: str, python_versions: Iterable[str], minifier_sha: str) -> Iterable[str]:
    """
    Yield markdown lines for the 'Larger than original' report section.

    Lists corpus entries whose minified size exceeds their original size.

    :param results_dir: Directory containing the results files
    :param python_versions: Python versions to include in the report
    :param minifier_sha: The python-minifier sha that was tested
    """
    yield '''
## Larger than original

| Corpus Entry | Original Size | Minified Size |
|--------------|--------------:|--------------:|'''

    for python_version in python_versions:
        try:
            summary = result_summary(results_dir, python_version, minifier_sha)
        except FileNotFoundError:
            # No results file for this version; skip it rather than abort the report
            continue

        larger_than_original = sorted(summary.larger_than_original(), key=lambda result: result.original_size)

        for entry in larger_than_original:
            yield f'| {entry.corpus_entry} | {entry.original_size} | {entry.minified_size} ({entry.minified_size - entry.original_size:+}) |'

def report_unstable(results_dir: str, python_versions: Iterable[str], minifier_sha: str) -> Iterable[str]:
    """
    Yield markdown lines for the 'Unstable' report section.

    Lists corpus entries whose minification was reported as unstable,
    per python version.

    :param results_dir: Directory containing the results files
    :param python_versions: Python versions to include in the report
    :param minifier_sha: The python-minifier sha that was tested
    """
    yield '''
## Unstable

| Corpus Entry | Python Version | Original Size |
|--------------|----------------|--------------:|'''

    for python_version in python_versions:
        try:
            summary = result_summary(results_dir, python_version, minifier_sha)
        except FileNotFoundError:
            # No results file for this version; skip it rather than abort the report
            continue

        unstable = sorted(summary.unstable_minification(), key=lambda result: result.original_size)

        for entry in unstable:
            yield f'| {entry.corpus_entry} | {python_version} | {entry.original_size} |'

def report_exceptions(results_dir: str, python_versions: Iterable[str], minifier_sha: str) -> Iterable[str]:
    """
    Yield markdown lines for the 'Exceptions' report section.

    Lists corpus entries that raised an exception during minification,
    per python version. Emits a placeholder row when no exceptions occurred.

    :param results_dir: Directory containing the results files
    :param python_versions: Python versions to include in the report
    :param minifier_sha: The python-minifier sha that was tested
    """
    yield '''
## Exceptions

| Corpus Entry | Python Version | Exception |
|--------------|----------------|-----------|'''

    exceptions_found = False

    for python_version in python_versions:
        try:
            summary = result_summary(results_dir, python_version, minifier_sha)
        except FileNotFoundError:
            # No results file for this version; skip it rather than abort the report
            continue

        exceptions = sorted(summary.exception(), key=lambda result: result.original_size)

        for entry in exceptions:
            exceptions_found = True
            yield f'| {entry.corpus_entry} | {python_version} | {entry.outcome} |'

    if not exceptions_found:
        yield ' None | | |'

def report_larger_than_base(results_dir: str, python_versions: Iterable[str], minifier_sha: str, base_sha: str) -> Iterable[str]:
    """
    Yield markdown lines for the 'Top 10 Larger than base' report section.

    Lists up to 10 corpus entries (per python version) whose minified size
    grew relative to the base sha. Emits a placeholder row when none grew.

    :param results_dir: Directory containing the results files
    :param python_versions: Python versions to include in the report
    :param minifier_sha: The python-minifier sha that was tested
    :param base_sha: The sha of the base commit to compare against
    """
    yield '''
## Top 10 Larger than base

| Corpus Entry | Original Size | Minified Size |
|--------------|--------------:|--------------:|'''

    there_are_some_larger_than_base = False

    for python_version in python_versions:
        try:
            summary = result_summary(results_dir, python_version, minifier_sha)
        except FileNotFoundError:
            # No results file for this version; skip it rather than abort the report
            continue

        base_summary = result_summary(results_dir, python_version, base_sha)
        larger_than_original = sorted(summary.compare_size_increase(base_summary), key=lambda result: result.original_size)[:10]

        for entry in larger_than_original:
            there_are_some_larger_than_base = True
            yield f'| {entry.corpus_entry} | {entry.original_size} | {entry.minified_size} ({entry.minified_size - base_summary.entries[entry.corpus_entry].minified_size:+}) |'

    if not there_are_some_larger_than_base:
        yield '| N/A | N/A | N/A |'

def report_slowest(results_dir: str, python_versions: Iterable[str], minifier_sha: str) -> Iterable[str]:
    """
    Yield markdown lines for the 'Top 10 Slowest' report section.

    Lists the 10 slowest corpus entries (by minification time) per python
    version.

    :param results_dir: Directory containing the results files
    :param python_versions: Python versions to include in the report
    :param minifier_sha: The python-minifier sha that was tested
    """
    yield '''
## Top 10 Slowest

| Corpus Entry | Original Size | Minified Size | Time |
|--------------|--------------:|--------------:|-----:|'''

    for python_version in python_versions:
        try:
            summary = result_summary(results_dir, python_version, minifier_sha)
        except FileNotFoundError:
            # Tolerate a missing results file, matching the other report_* sections,
            # instead of aborting the whole report
            continue

        for entry in sorted(summary.entries.values(), key=lambda entry: entry.time, reverse=True)[:10]:
            yield f'| {entry.corpus_entry} | {entry.original_size} | {entry.minified_size} | {entry.time:.3f} |'

def report(results_dir: str, minifier_ref: str, minifier_sha: str, base_ref: str, base_sha: str) -> Iterable[str]:
"""
Expand Down Expand Up @@ -236,50 +351,11 @@ def format_size_change_detail() -> str:
)

if ENHANCED_REPORT:
yield '''
## Larger than original

| Corpus Entry | Original Size | Minified Size |
|--------------|--------------:|--------------:|'''

for python_version in ['3.11']:
summary = result_summary(results_dir, python_version, minifier_sha)
larger_than_original = sorted(summary.larger_than_original(), key=lambda result: result.original_size)

for entry in larger_than_original:
yield f'| {entry.corpus_entry} | {entry.original_size} | {entry.minified_size} ({entry.minified_size - entry.original_size:+}) |'

yield '''
## Top 10 Larger than base

| Corpus Entry | Original Size | Minified Size |
|--------------|--------------:|--------------:|'''

there_are_some_larger_than_base = False

for python_version in ['3.11']:
summary = result_summary(results_dir, python_version, minifier_sha)
base_summary = result_summary(results_dir, python_version, base_sha)
larger_than_original = sorted(summary.compare_size_increase(base_summary), key=lambda result: result.original_size)[:10]

for entry in larger_than_original:
there_are_some_larger_than_base = True
yield f'| {entry.corpus_entry} | {entry.original_size} | {entry.minified_size} ({entry.minified_size - base_summary.entries[entry.corpus_entry].minified_size:+}) |'

if not there_are_some_larger_than_base:
yield '| N/A | N/A | N/A |'

yield '''
## Top 10 Slowest

| Corpus Entry | Original Size | Minified Size | Time |
|--------------|--------------:|--------------:|-----:|'''

for python_version in ['3.11']:
summary = result_summary(results_dir, python_version, minifier_sha)

for entry in sorted(summary.entries.values(), key=lambda entry: entry.time, reverse=True)[:10]:
yield f'| {entry.corpus_entry} | {entry.original_size} | {entry.minified_size} | {entry.time:.3f} |'
yield from report_larger_than_original(results_dir, ['3.11'], minifier_sha)
yield from report_larger_than_base(results_dir, ['3.11'], minifier_sha, base_sha)
yield from report_slowest(results_dir, ['3.11'], minifier_sha)
yield from report_unstable(results_dir, ['2.7', '3.3', '3.4', '3.5', '3.6', '3.7', '3.8', '3.9', '3.10', '3.11'], minifier_sha)
yield from report_exceptions(results_dir, ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11'], minifier_sha)


def main():
Expand Down
60 changes: 52 additions & 8 deletions corpus_test/generate_results.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
import argparse
import datetime
import gzip
import os
import sys
import time


import logging


import python_minifier
from result import Result, ResultWriter

Expand All @@ -23,8 +29,13 @@ def minify_corpus_entry(corpus_path, corpus_entry):
:rtype: Result
"""

with open(os.path.join(corpus_path, corpus_entry), 'rb') as f:
source = f.read()
if os.path.isfile(os.path.join(corpus_path, corpus_entry + '.py.gz')):
with gzip.open(os.path.join(corpus_path, corpus_entry + '.py.gz'), 'rb') as f:
source = f.read()
else:
with open(os.path.join(corpus_path, corpus_entry), 'rb') as f:
source = f.read()


result = Result(corpus_entry, len(source), 0, 0, '')

Expand Down Expand Up @@ -72,21 +83,54 @@ def corpus_test(corpus_path, results_path, sha, regenerate_results):
:param str sha: The python-minifier sha we are testing
:param bool regenerate_results: Regenerate results even if they are present
"""
corpus_entries = os.listdir(corpus_path)

python_version = '.'.join([str(s) for s in sys.version_info[:2]])

log_path = 'results_' + python_version + '_' + sha + '.log'
print('Logging in GitHub Actions is absolute garbage. Logs are going to ' + log_path)

logging.basicConfig(filename=os.path.join(results_path, log_path), level=logging.DEBUG)

corpus_entries = [entry[:-len('.py.gz')] for entry in os.listdir(corpus_path)]

results_file_path = os.path.join(results_path, 'results_' + python_version + '_' + sha + '.csv')

if os.path.isfile(results_file_path) and not regenerate_results:
print('Results file already exists: %s', results_file_path)
return
if os.path.isfile(results_file_path):
logging.info('Results file already exists: %s', results_file_path)
if regenerate_results:
os.remove(results_file_path)

total_entries = len(corpus_entries)
logging.info('Testing python-minifier on %d entries' % total_entries)
tested_entries = 0

start_time = time.time()
next_checkpoint = time.time() + 60

with ResultWriter(results_file_path) as result_writer:
logging.info('%d results already present' % len(result_writer))

for entry in corpus_entries:
print(entry)
if entry in result_writer:
continue

logging.debug(entry)

result = minify_corpus_entry(corpus_path, entry)
result_writer.write(result)
tested_entries += 1

sys.stdout.flush()

if time.time() > next_checkpoint:
percent = len(result_writer) / total_entries * 100
time_per_entry = (time.time() - start_time) / tested_entries
entries_remaining = len(corpus_entries) - len(result_writer)
time_remaining = int(entries_remaining * time_per_entry)
logging.info('Tested %d/%d entries (%d%%) %s seconds remaining' % (len(result_writer), total_entries, percent, time_remaining))
sys.stdout.flush()
next_checkpoint = time.time() + 60

logging.info('Finished')

def bool_parse(value):
    """Parse a workflow string input into a bool: exactly 'true' is True."""
    if value == 'true':
        return True
    return False
Expand Down
32 changes: 31 additions & 1 deletion corpus_test/result.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os


class Result(object):

def __init__(self, corpus_entry, original_size, minified_size, time, outcome):
Expand All @@ -21,15 +24,37 @@ def __init__(self, results_path):
:param str results_path: The path to the results file
"""
self._results_path = results_path
self._size = 0
self._existing_result_set = set()

if not os.path.isfile(self._results_path):
return

with open(self._results_path, 'r') as f:
for line in f:
if line != 'corpus_entry,original_size,minified_size,time,result\n':
self._existing_result_set.add(line.split(',')[0])

self._size += len(self._existing_result_set)

def __enter__(self):
self.results = open(self._results_path, 'w')
self.results = open(self._results_path, 'a')
self.results.write('corpus_entry,original_size,minified_size,time,result\n')
return self

def __exit__(self, exc_type, exc_val, exc_tb):
self.results.close()

def __contains__(self, item):
"""
:param str item: The name of the entry in the corpus
:return bool: True if the entry already exists in the results file
"""
return item in self._existing_result_set

def __len__(self):
    # Number of results recorded so far: entries already present in the
    # results file plus any written through this writer (see write()).
    return self._size

def write(self, result):
"""
:param Result result: The result to write to the file
Expand All @@ -41,6 +66,7 @@ def write(self, result):
str(result.time) + ',' + result.outcome + '\n'
)
self.results.flush()
self._size += 1


class ResultReader:
Expand All @@ -66,7 +92,11 @@ def __next__(self):
"""
:return Result: The next result in the file
"""

line = self.results.readline()
while line == 'corpus_entry,original_size,minified_size,time,result\n':
line = self.results.readline()

if line == '':
raise StopIteration
else:
Expand Down
Loading