[WB-3722] diagnose/fix unittest flakiness (#1685)

Added circleci-tool to help trigger workflows (down to individual tests) to find flaky tests. Add --flake-finder to help find flaky tests. Fix sender.py fixture to properly shut down BackendSender (fixes some flake). Add debug info when live_mock_server fails to start. Give live_mock_server longer to start (might have been causing flake - time will tell). Added junit.xml so circleci knows about our tests and can show cleaner test failures and possibly test history insights.
wandb · Jan 11, 2021 · bbdeea6 · bbdeea6
1 parent 7bb1c3b
commit bbdeea6
Show file tree

Hide file tree

Showing 10 changed files with 255 additions and 25 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -3,28 +3,77 @@ version: 2.1
 orbs:
   win: circleci/windows@2.4.0
 
+parameters:
+  manual:
+    type: boolean
+    default: false
+  manual_test:
+    type: boolean
+    default: false
+  manual_win:
+    type: boolean
+    default: false
+  manual_mac:
+    type: boolean
+    default: false
+  manual_test_image:
+    type: string
+    default: "python:3.7"
+  manual_test_toxenv:
+    type: string
+    default: "py37"
+  manual_win_toxenv:
+    type: string
+    default: "py37"
+  manual_mac_toxenv:
+    type: string
+    default: "py37"
+  manual_test_name:
+    type: string
+    default: "Python 3.7 [MANUAL]"
+  manual_win_name:
+    type: string
+    default: "Windows (Python 3.7) [MANUAL]"
+  manual_mac_name:
+    type: string
+    default: "MacOS (Python 3.7) [MANUAL]"
+
 commands:
   save-tox-cache:
     description: "Save tox environment to cache"
     steps:
-      - save_cache:
-            paths:
-                - ./.tox
-            key: v0.6-toxenv-{{ .Environment.CIRCLE_BRANCH }}-{{ .Environment.CIRCLE_JOB }}-{{ checksum "tox.ini" }}-{{ checksum "setup.py" }}-{{ checksum "requirements.txt" }}-{{ checksum "requirements_dev.txt" }}
+      - unless:
+          condition: << pipeline.parameters.manual >>
+          steps:
+            - save_cache:
+                paths:
+                  - ./.tox
+                key: v0.7-toxenv-{{ .Environment.CIRCLE_BRANCH }}-{{ .Environment.CIRCLE_JOB }}-{{ checksum "tox.ini" }}-{{ checksum "setup.py" }}-{{ checksum "requirements.txt" }}-{{ checksum "requirements_dev.txt" }}
   restore-tox-cache:
     description: "Restore tox environment from cache"
     steps:
       - restore_cache:
               keys:
-              - v0.6-toxenv-{{ .Environment.CIRCLE_BRANCH }}-{{ .Environment.CIRCLE_JOB }}-{{ checksum "tox.ini" }}-{{ checksum "setup.py" }}-{{ checksum "requirements.txt" }}-{{ checksum "requirements_dev.txt" }}
-              - v0.6-toxenv-{{ .Environment.CIRCLE_BRANCH }}-{{ .Environment.CIRCLE_JOB }}-{{ checksum "tox.ini" }}-
-              - v0.6-toxenv-{{ .Environment.CIRCLE_BRANCH }}-{{ .Environment.CIRCLE_JOB }}-
-              - v0.6-toxenv-master-{{ .Environment.CIRCLE_JOB }}-{{ checksum "tox.ini" }}-{{ checksum "setup.py" }}-{{ checksum "requirements.txt" }}-{{ checksum "requirements_dev.txt" }}
-              - v0.6-toxenv-master-{{ .Environment.CIRCLE_JOB }}-{{ checksum "tox.ini" }}-
-              - v0.6-toxenv-master-{{ .Environment.CIRCLE_JOB }}-
+              - v0.7-toxenv-{{ .Environment.CIRCLE_BRANCH }}-{{ .Environment.CIRCLE_JOB }}-{{ checksum "tox.ini" }}-{{ checksum "setup.py" }}-{{ checksum "requirements.txt" }}-{{ checksum "requirements_dev.txt" }}
+              - v0.7-toxenv-{{ .Environment.CIRCLE_BRANCH }}-{{ .Environment.CIRCLE_JOB }}-{{ checksum "tox.ini" }}-
+              - v0.7-toxenv-{{ .Environment.CIRCLE_BRANCH }}-{{ .Environment.CIRCLE_JOB }}-
+              - v0.7-toxenv-master-{{ .Environment.CIRCLE_JOB }}-{{ checksum "tox.ini" }}-{{ checksum "setup.py" }}-{{ checksum "requirements.txt" }}-{{ checksum "requirements_dev.txt" }}
+              - v0.7-toxenv-master-{{ .Environment.CIRCLE_JOB }}-{{ checksum "tox.ini" }}-
+              - v0.7-toxenv-master-{{ .Environment.CIRCLE_JOB }}-
+  save-test-results:
+    description: "Save test results"
+    steps:
+      - unless:
+          condition: << pipeline.parameters.manual >>
+          steps:
+            - store_test_results:
+                path: test-results
+            - store_artifacts:
+                path: test-results
 
 workflows:
   main:
+    unless: << pipeline.parameters.manual >>
     jobs:
       - test:
          name: "Linters"
@@ -52,13 +101,34 @@ workflows:
          toxenv: "py39"
       - win:
          name: "Windows (Python 3.7)"
+         toxenv: "py37"
       - mac:
          name: "MacOS (Python 3.7)"
+         toxenv: "py37"
       - final:
          name: "Upload Coverage"
          requires:
             - "Python 2.7"
             - "Python 3.6"
+  manual_test:
+    when: << pipeline.parameters.manual_test >>
+    jobs:
+      - test:
+         name: << pipeline.parameters.manual_test_name >>
+         image: << pipeline.parameters.manual_test_image >>
+         toxenv: << pipeline.parameters.manual_test_toxenv >>
+  manual_win:
+    when: << pipeline.parameters.manual_win >>
+    jobs:
+      - win:
+         name: << pipeline.parameters.manual_win_name >>
+         toxenv: << pipeline.parameters.manual_win_toxenv >>
+  manual_mac:
+    when: << pipeline.parameters.manual_mac >>
+    jobs:
+      - mac:
+         name: << pipeline.parameters.manual_mac_name >>
+         toxenv: << pipeline.parameters.manual_mac_toxenv >>
 
 jobs:
     test:
@@ -86,7 +156,11 @@ jobs:
                       tox -vv -e << parameters.toxenv >>
                   no_output_timeout: 10m
             - save-tox-cache
+            - save-test-results
     win:
+        parameters:
+            toxenv:
+              type: string
         executor: win/default
         steps:
             - checkout
@@ -104,11 +178,14 @@ jobs:
                   name: Run tests
                   shell: bash.exe
                   command: |
-                      tox -vv -e py37
+                      tox -vv -e << parameters.toxenv >>
                   no_output_timeout: 10m
             - save-tox-cache
-
+            - save-test-results
     mac:
+        parameters:
+            toxenv:
+              type: string
         macos:
             xcode: 11.4.1
         steps:
@@ -123,9 +200,10 @@ jobs:
                   # Tests failed with Too many open files, so added ulimit
                   command: |
                       ulimit -n 1024
-                      python3 -m tox -vv -e py37
+                      python3 -m tox -vv -e << parameters.toxenv >>
                   no_output_timeout: 10m
             - save-tox-cache
+            - save-test-results
     final:
         docker:
             - image: python:3.7

diff --git a/.gitignore b/.gitignore
@@ -16,4 +16,5 @@ tests/logs/*
 !tests/logs/cleanup.sh
 pip-wheel-metadata
 .vscode/.ropeproject
-xcuserdata/
+xcuserdata/
+test-results/
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -87,15 +87,17 @@ def reset_ctx():
     server.reset_ctx = reset_ctx
 
     started = False
-    for i in range(5):
+    for i in range(10):
         try:
-            res = requests.get("%s/ctx" % server.base_url, timeout=1)
+            res = requests.get("%s/ctx" % server.base_url, timeout=5)
             if res.status_code == 200:
                 started = True
                 break
             print("Attempting to connect but got: %s" % res)
         except requests.exceptions.RequestException:
-            print("Timed out waiting for server to start...")
+            print(
+                "Timed out waiting for server to start...", server.base_url, time.time()
+            )
             if server.poll() is None:
                 time.sleep(1)
             else:
@@ -108,6 +110,14 @@ def reset_ctx():
     else:
         server.terminate()
         print("Server failed to launch, see tests/logs/live_mock_server.log")
+        try:
+            print("=" * 40)
+            with open("tests/logs/live_mock_server.log") as f:
+                for l in f.readlines():
+                    print(l.strip())
+            print("=" * 40)
+        except Exception as e:
+            print("EXCEPTION:", e)
         raise ValueError("Failed to start server!  Exit code %s" % server.returncode)
     return server
 
@@ -117,10 +127,17 @@ def reset_ctx():
 
 
 @pytest.fixture
-def test_dir(request):
+def test_name(request):
+    # change "test[1]" to "test__1__"
+    name = urllib.parse.quote(request.node.name.replace("[", "__").replace("]", "__"))
+    return name
+
+
+@pytest.fixture
+def test_dir(test_name):
     orig_dir = os.getcwd()
     root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
-    test_dir = os.path.join(root, "tests", "logs", request.node.name)
+    test_dir = os.path.join(root, "tests", "logs", test_name)
     if os.path.exists(test_dir):
         shutil.rmtree(test_dir)
     mkdir_exists_ok(test_dir)

diff --git a/tests/integrations/test_torch.py b/tests/integrations/test_torch.py
@@ -139,6 +139,7 @@ def conv3x3(in_channels, out_channels, **kwargs):
 
 
 def test_all_logging(wandb_init_run):
+    # TODO(jhr): does not work with --flake-finder
     net = ConvNet()
     wandb.watch(net, log="all", log_freq=1)
     for i in range(3):

diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -694,6 +694,7 @@ def test_local_already_running(runner, docker, local_settings):
     reason="The patch in mock_server.py doesn't work in windows",
 )
 def test_restore_no_remote(runner, mock_server, git_repo, docker, monkeypatch):
+    # TODO(jhr): does not work with --flake-finder
     with open("patch.txt", "w") as f:
         f.write("test")
     git_repo.repo.index.add(["patch.txt"])

diff --git a/tests/test_sender.py b/tests/test_sender.py
@@ -178,7 +178,15 @@ def stop_backend(
     mocked_run, hm, sm, sender, start_handle_thread, start_send_thread,
 ):
     def stop_backend_func():
-        sender.communicate_exit(0, timeout=5)
+        sender.publish_exit(0)
+        for _ in range(10):
+            ret = sender.communicate_poll_exit()
+            assert ret, "poll exit timedout"
+            done = ret.response.poll_exit_response.done
+            if done:
+                break
+            time.sleep(1)
+        assert done, "backend didnt shutdown"
 
     yield stop_backend_func
 

diff --git a/tests/wandb_integration_test.py b/tests/wandb_integration_test.py
@@ -60,7 +60,7 @@ def test_resume_allow_success(live_mock_server, test_settings):
     platform.system() == "Windows", reason="File syncing is somewhat busted in windows"
 )
 # TODO: Sometimes wandb-summary.json didn't exists, other times requirements.txt in windows
-def test_parallel_runs(live_mock_server, test_settings):
+def test_parallel_runs(request, live_mock_server, test_settings, test_name):
     with open("train.py", "w") as f:
         f.write(fixture_open("train.py").read())
     p1 = subprocess.Popen(["python", "train.py"], env=os.environ)
@@ -70,12 +70,13 @@ def test_parallel_runs(live_mock_server, test_settings):
     num_runs = 0
     # Assert we've stored 2 runs worth of files
     # TODO: not confirming output.log because it is missing sometimes likely due to a BUG
+    # TODO: code saving sometimes doesnt work?
     files_sorted = sorted(
         [
-            "wandb-metadata.json",
-            "code/tests/logs/test_parallel_runs/train.py",
-            "requirements.txt",
             "config.yaml",
+            "code/tests/logs/{}/train.py".format(test_name),
+            "requirements.txt",
+            "wandb-metadata.json",
             "wandb-summary.json",
         ]
     )

diff --git a/tests/wandb_tensorflow_test.py b/tests/wandb_tensorflow_test.py
@@ -127,6 +127,7 @@ def spy_cb(row, step=None):
     reason="TF has sketchy support for py2.  TODO: Windows is legitimately busted",
 )
 def test_compat_tensorboard(live_mock_server, test_settings):
+    # TODO(jhr): does not work with --flake-finder
     # TODO: we currently don't unpatch tensorflow so this is the only test that can do it...
     wandb.init(sync_tensorboard=True, settings=test_settings)