diff --git a/benchmark/README.md b/benchmark/README.md index bb1722eafec49..6d1e436401ed3 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -86,22 +86,38 @@ Using the Benchmark Driver * `--num-samples` * Control the number of samples to take for each test * `--list` - * Print a list of available tests + * Print a list of available tests matching specified criteria +* `--tags` + * Run tests that are labeled with specified [tags](https://github.com/apple/swift/blob/master/benchmark/utils/TestsUtils.swift#L19) + (comma separated list); multiple tags are interpreted as logical AND, i.e. + run only tests that are labeled with all the supplied tags +* `--skip-tags` + * Don't run tests that are labeled with any of the specified tags (comma + separated list); default value: `skip,unstable`; to get a complete list of + tests, specify an empty `--skip-tags=` + ### Examples * `$ ./Benchmark_O --num-iters=1 --num-samples=1` * `$ ./Benchmark_Onone --list` * `$ ./Benchmark_Osize Ackermann` +* `$ ./Benchmark_O --tags=Dictionary` +* `$ ./Benchmark_O --skip-tags=unstable,skip,validation` ### Note As a shortcut, you can also refer to benchmarks by their ordinal numbers. -The regular `--list` option does not provide these, but you can run: -* `$ ./Benchmark_O --list --run-all | tail -n +2 | nl` -You can use ordinal numbers instead of test names like this: +These are printed out together with benchmark names and tags using the +`--list` parameter. For a complete list of all available performance tests, run +* `$ ./Benchmark_O --list --skip-tags=` + +You can use test numbers instead of test names like this: * `$ ./Benchmark_O 1 42` * `$ ./Benchmark_Driver run 1 42` +Test numbers are not stable in the long run: adding and removing tests from the +benchmark suite will reorder them, but they are stable for a given build. + Using the Harness Generator --------------------------- @@ -186,3 +202,21 @@ public func run_YourTestName(N: Int) { The current set of tags are defined by the `BenchmarkCategory` enum in `TestsUtils.swift` . + +Testing the Benchmark Drivers +----------------------------- +When working on tests, after the initial build +```` +swift-source$ ./swift/utils/build-script -R -B +```` +you can rebuild just the benchmarks: +```` +swift-source$ export SWIFT_BUILD_DIR=`pwd`/build/Ninja-ReleaseAssert/swift-macosx-x86_64 +swift-source$ ninja -C ${SWIFT_BUILD_DIR} swift-benchmark-macosx-x86_64 +```` + +When modifying the testing infrastructure, you should verify that your changes +pass all the tests: +```` +swift-source$ ./llvm/utils/lit/lit.py -sv ${SWIFT_BUILD_DIR}/test-macosx-x86_64/benchmark +```` diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index cc918aec19e68..92f12c627855a 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -118,28 +118,32 @@ def instrument_test(driver_path, test, num_samples): return avg_test_output -BENCHMARK_OUTPUT_RE = re.compile('([^,]+),') - - def get_tests(driver_path, args): """Return a list of available performance tests""" driver = ([driver_path, '--list']) + # Override the default comma delimiter with a tab for easier parsing. + # (The third 'column' is always a comma-separated list of tags in square + # brackets -- currently unused here.)
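The comment above explains why the driver switches `--list` output to a tab delimiter (the matching `driver.append('--delim=\t')` call follows below). Here is a rough, self-contained sketch of how such tab-delimited output can be split into test numbers and names; the sample lines and their tag formatting are invented for illustration, not captured output:

```python
# Minimal sketch: split tab-delimited `--list` output into (number, name) pairs.
# SAMPLE_OUTPUT is invented; real data would come from `Benchmark_O --list --delim=\t`.
SAMPLE_OUTPUT = (
    "#\tTest\t[Tags]\n"
    "1\tAckermann\t[algorithm, unstable]\n"
    "2\tAngryPhonebook\t[String, api, validation]\n"
)

def parse_list_output(output):
    """Return (number, name) pairs, dropping the header and the trailing blank line."""
    lines = output.split("\n")[1:-1]
    return [tuple(line.split("\t")[:2]) for line in lines]

if __name__ == "__main__":
    print(parse_list_output(SAMPLE_OUTPUT))
    # [('1', 'Ackermann'), ('2', 'AngryPhonebook')]
```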
+ driver.append('--delim=\t') if args.benchmarks or args.filters: - driver.append('--run-all') - tests = [] - for l in subprocess.check_output(driver).split("\n")[1:]: - m = BENCHMARK_OUTPUT_RE.match(l) - if m is None: - continue - tests.append(m.group(1)) + driver.append('--skip-tags=') # list all tests, don't skip any tags + index_name_pairs = [ + line.split('\t')[:2] for line in + subprocess.check_output(driver).split('\n')[1:-1] + ] + indices, names = zip(*index_name_pairs) # unzip list of pairs into 2 lists if args.filters: regexes = [re.compile(pattern) for pattern in args.filters] return sorted(list(set([name for pattern in regexes - for name in tests if pattern.match(name)]))) + for name in names if pattern.match(name)]))) if not args.benchmarks: - return tests - tests.extend(map(str, range(1, len(tests) + 1))) # ordinal numbers - return sorted(list(set(tests).intersection(set(args.benchmarks)))) + return names + benchmarks = set(args.benchmarks) + index_to_name = dict(index_name_pairs) + indexed_names = [index_to_name[i] + for i in benchmarks.intersection(set(indices))] + return sorted(list( + benchmarks.intersection(set(names)).union(indexed_names))) def get_current_git_branch(git_repo_path): diff --git a/benchmark/utils/DriverUtils.swift b/benchmark/utils/DriverUtils.swift index bf7b4d56a5653..5675c767ecd9b 100644 --- a/benchmark/utils/DriverUtils.swift +++ b/benchmark/utils/DriverUtils.swift @@ -69,7 +69,7 @@ struct Test { /// The benchmark categories that this test belongs to. Used for filtering. var tags: [BenchmarkCategory] { - return benchInfo.tags + return benchInfo.tags.sorted() } /// An optional initialization function for a benchmark that is run before @@ -181,7 +181,7 @@ struct TestConfig { // We support specifying multiple tags by splitting on comma, i.e.: // - // --tags=array,set + // --tags=Array,Dictionary // // FIXME: If we used Error instead of .fail, then we could have a cleaner // impl here using map on x and tags.formUnion. @@ -200,7 +200,7 @@ struct TestConfig { // We support specifying multiple tags by splitting on comma, i.e.: // - // --skip-tags=array,set + // --skip-tags=Array,Set,unstable,skip // // FIXME: If we used Error instead of .fail, then we could have a cleaner // impl here using map on x and tags.formUnion. @@ -227,39 +227,22 @@ struct TestConfig { } mutating func findTestsToRun() { - let benchmarkNameFilter = Set(filters) - - // t is needed so we don't capture an ivar of a mutable inout self. - let t = tags - let st = skipTags - let filteredTests = Array(registeredBenchmarks.filter { benchInfo in - if !t.isSubset(of: benchInfo.tags) { - return false - } - - if !st.isDisjoint(with: benchInfo.tags) { - return false - } - - // If the user did not specified a benchmark name filter and our tags are - // a subset of the specified tags by the user, return true. We want to run - // this test. - if benchmarkNameFilter.isEmpty { - return true + registeredBenchmarks.sort() + let indices = Dictionary(uniqueKeysWithValues: + zip(registeredBenchmarks.map{$0.name}, 1...)) + let benchmarkNamesOrIndices = Set(filters) + // needed so we don't capture an ivar of a mutable inout self. 
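The rewritten `get_tests` above accepts test names as well as ordinal numbers and resolves both against the parsed `--list` output. A standalone sketch of that resolution logic follows (the pairs and the requested set are invented); the Swift changes to `findTestsToRun` then continue below.

```python
# Standalone sketch of resolving requested benchmarks given by name or by number.
# The data below is invented; in the driver it comes from the `--list` output.
index_name_pairs = [("1", "Ackermann"), ("2", "AngryPhonebook"), ("3", "TwoSum")]
requested = {"2", "TwoSum", "NoSuchBenchmark"}  # mix of numbers and names

indices, names = zip(*index_name_pairs)
index_to_name = dict(index_name_pairs)

by_name = set(names) & requested                                   # requested by name
by_number = {index_to_name[i] for i in requested & set(indices)}   # requested by number

print(sorted(by_name | by_number))  # ['AngryPhonebook', 'TwoSum']
```

Unknown names or numbers simply fall out of the intersections, which matches the driver's behavior of silently ignoring them.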
+ let (_tags, _skipTags) = (tags, skipTags) + + tests = registeredBenchmarks.filter { benchmark in + if benchmarkNamesOrIndices.isEmpty { + return benchmark.tags.isSuperset(of: _tags) && + benchmark.tags.isDisjoint(with: _skipTags) + } else { + return benchmarkNamesOrIndices.contains(benchmark.name) || + benchmarkNamesOrIndices.contains(String(indices[benchmark.name]!)) } - - // Otherwise, we need to check if our benchInfo's name is in the benchmark - // name filter list. If it isn't, then we shouldn't process it. - return benchmarkNameFilter.contains(benchInfo.name) - }).sorted() - - if (filteredTests.isEmpty) { - return - } - - tests = filteredTests.enumerated().map { - Test(benchInfo: $0.element, index: $0.offset + 1) - } + }.map { Test(benchInfo: $0, index: indices[$0.name]!) } } } @@ -382,14 +365,13 @@ func runBench(_ test: Test, _ c: TestConfig) -> BenchResults? { let sampler = SampleRunner() for s in 0.. 0 { scale = UInt(time_per_sample / elapsed_time) @@ -402,6 +384,9 @@ func runBench(_ test: Test, _ c: TestConfig) -> BenchResults? { } else { // Compute the scaling factor if a fixed c.fixedNumIters is not specified. scale = c.fixedNumIters + if scale == 1 { + elapsed_time = sampler.run(test.name, fn: testFn, num_iters: 1) + } } // Make integer overflow less likely on platforms where Int is 32 bits wide. // FIXME: Switch BenchmarkInfo to use Int64 for the iteration scale, or fix @@ -413,9 +398,7 @@ func runBench(_ test: Test, _ c: TestConfig) -> BenchResults? { if c.verbose { print(" Measuring with scale \(scale).") } - test.setUpFunction?() elapsed_time = sampler.run(test.name, fn: testFn, num_iters: scale) - test.tearDownFunction?() } else { scale = 1 } @@ -424,6 +407,7 @@ func runBench(_ test: Test, _ c: TestConfig) -> BenchResults? { if c.verbose { print(" Sample \(s),\(samples[s])") } + test.tearDownFunction?() } let (mean, sd) = internalMeanSD(samples) @@ -497,9 +481,9 @@ public func main() { fatalError("\(msg)") case .listTests: config.findTestsToRun() - print("Enabled Tests\(config.delim)Tags") + print("#\(config.delim)Test\(config.delim)[Tags]") for t in config.tests { - print("\(t.name)\(config.delim)\(t.tags)") + print("\(t.index)\(config.delim)\(t.name)\(config.delim)\(t.tags)") } case .run: config.findTestsToRun() diff --git a/benchmark/utils/TestsUtils.swift b/benchmark/utils/TestsUtils.swift index 594375c67a6b9..1330853443eaa 100644 --- a/benchmark/utils/TestsUtils.swift +++ b/benchmark/utils/TestsUtils.swift @@ -70,6 +70,18 @@ public enum BenchmarkCategory : String { case skip } +extension BenchmarkCategory : CustomStringConvertible { + public var description: String { + return self.rawValue + } +} + +extension BenchmarkCategory : Comparable { + public static func < (lhs: BenchmarkCategory, rhs: BenchmarkCategory) -> Bool { + return lhs.rawValue < rhs.rawValue + } +} + public struct BenchmarkPlatformSet : OptionSet { public let rawValue: Int @@ -111,7 +123,7 @@ public struct BenchmarkInfo { /// A set of category tags that describe this benchmark. This is used by the /// harness to allow for easy slicing of the set of benchmarks along tag /// boundaries, e.x.: run all string benchmarks or ref count benchmarks, etc. - public var tags: [BenchmarkCategory] + public var tags: Set<BenchmarkCategory> /// The platforms that this benchmark supports. This is an OptionSet.
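Storing `tags` as a `Set<BenchmarkCategory>` (the declaration continues just below) is what lets `findTestsToRun` above express the filtering directly with `isSuperset(of:)` and `isDisjoint(with:)`: `--tags` acts as a logical AND, `--skip-tags` as a logical OR. A small Python illustration of those semantics, with invented benchmark and tag data:

```python
# Illustration of the tag-filtering semantics only; the benchmarks and their
# tags below are invented, not the real suite.
benchmarks = {
    "TwoSum":         {"validation", "api", "Dictionary", "Array", "algorithm"},
    "Array2D":        {"validation", "api", "Array"},
    "DictionarySwap": {"validation", "api", "Dictionary"},
    "Ackermann":      {"algorithm", "unstable"},
}

def tests_to_run(tags, skip_tags):
    """--tags: every requested tag must be present; --skip-tags: none may be."""
    return sorted(name for name, t in benchmarks.items()
                  if t >= tags and not (t & skip_tags))

print(tests_to_run({"Dictionary", "Array"}, set()))   # ['TwoSum']
print(tests_to_run(set(), {"unstable", "skip"}))      # ['Array2D', 'DictionarySwap', 'TwoSum']
```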
private var unsupportedPlatforms: BenchmarkPlatformSet @@ -146,7 +158,7 @@ public struct BenchmarkInfo { unsupportedPlatforms: BenchmarkPlatformSet = []) { self.name = name self._runFunction = runFunction - self.tags = tags + self.tags = Set(tags) self._setUpFunction = setUpFunction self._tearDownFunction = tearDownFunction self.unsupportedPlatforms = unsupportedPlatforms diff --git a/test/benchmark/Benchmark_Driver.test-sh b/test/benchmark/Benchmark_Driver.test-sh new file mode 100644 index 0000000000000..dec24a369da2b --- /dev/null +++ b/test/benchmark/Benchmark_Driver.test-sh @@ -0,0 +1,22 @@ +// REQUIRES: OS=macosx +// REQUIRES: asserts +// REQUIRES: benchmark +// REQUIRES: CMAKE_GENERATOR=Ninja + +// Integration tests between Benchmark_Driver and Benchmark_O +// TODO: Keep the "run just once" check and move the rest into unit tests for +// Benchmark_Driver, as they are redundant and unnecessarily slow. + +// RUN: %Benchmark_Driver run Ackermann | %FileCheck %s --check-prefix RUNNAMED +// RUNNAMED: Ackermann + +// RUN: %Benchmark_Driver run 1 | %FileCheck %s --check-prefix RUNBYNUMBER +// RUNBYNUMBER: Ackermann + +// RUN: %Benchmark_Driver run 1 Ackermann 1 \ +// RUN: | %FileCheck %s --check-prefix RUNJUSTONCE +// RUNJUSTONCE-LABEL: Ackermann +// RUNJUSTONCE-NOT: Ackermann + +// RUN: %Benchmark_Driver run -f Acker | %FileCheck %s --check-prefix RUNFILTER +// RUNFILTER: Ackermann diff --git a/test/benchmark/Benchmark_O.test-sh b/test/benchmark/Benchmark_O.test-sh new file mode 100644 index 0000000000000..7e991e9240c2d --- /dev/null +++ b/test/benchmark/Benchmark_O.test-sh @@ -0,0 +1,50 @@ +// REQUIRES: OS=macosx +// REQUIRES: asserts +// REQUIRES: benchmark +// REQUIRES: CMAKE_GENERATOR=Ninja + +// RUN: %Benchmark_O --list | %FileCheck %s --check-prefix LISTTAGS +// LISTTAGS: AngryPhonebook,[ +// LISTTAGS-NOT: TestsUtils.BenchmarkCategory. +// LISTTAGS-SAME: String, +// LISTTAGS-SAME: ] + +// RUN: %Benchmark_O AngryPhonebook --num-iters=1 \ +// RUN: | %FileCheck %s --check-prefix NUMITERS1 +// NUMITERS1: AngryPhonebook,1 +// NUMITERS1-NOT: 0,0,0,0,0 + +// Should run benchmark by name, even if its tags match the default skip-tags +// (unstable,skip). 
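The new lit tests here drive the real `Benchmark_Driver` and `Benchmark_O` binaries end to end; the `Benchmark_O` checks continue below, with Ackermann serving as the unstable example. For a quick manual check outside of lit, a throwaway script along these lines exercises the same `--skip-tags` default; the `BENCHMARK_O` path is a placeholder assumption, adjust it to your build:

```python
# Rough manual check mirroring the lit tests: with the default skip tags the
# unstable Ackermann benchmark is hidden from --list, while an explicit empty
# --skip-tags= lists everything. BENCHMARK_O is a placeholder path.
import subprocess

BENCHMARK_O = "./bin/Benchmark_O"  # adjust to your build directory

def listed_tests(extra_args=()):
    """Names reported by `Benchmark_O --list`, using a tab delimiter."""
    cmd = [BENCHMARK_O, "--list", "--delim=\t"] + list(extra_args)
    out = subprocess.check_output(cmd).decode()
    return [line.split("\t")[1] for line in out.splitlines()[1:] if line]

if __name__ == "__main__":
    assert "Ackermann" not in listed_tests()              # hidden by the skip,unstable default
    assert "Ackermann" in listed_tests(["--skip-tags="])  # listed when nothing is skipped
```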
Ackermann is marked unstable +// RUN: %Benchmark_O Ackermann | %FileCheck %s --check-prefix NAMEDSKIP +// NAMEDSKIP: Ackermann + +// RUN: %Benchmark_O --list --tags=Dictionary,Array \ +// RUN: | %FileCheck %s --check-prefix ANDTAGS +// ANDTAGS: TwoSum +// ANDTAGS-NOT: Array2D +// ANDTAGS-NOT: DictionarySwap + +// RUN: %Benchmark_O --list --tags=algorithm --skip-tags=validation \ +// RUN: | %FileCheck %s --check-prefix TAGSANDSKIPTAGS +// TAGSANDSKIPTAGS: Ackermann +// TAGSANDSKIPTAGS: DictOfArraysToArrayOfDicts +// TAGSANDSKIPTAGS: Fibonacci +// TAGSANDSKIPTAGS: RomanNumbers + +// RUN: %Benchmark_O --list --tags=algorithm \ +// RUN: --skip-tags=validation,Dictionary,String \ +// RUN: | %FileCheck %s --check-prefix ORSKIPTAGS +// ORSKIPTAGS: Ackermann +// ORSKIPTAGS-NOT: DictOfArraysToArrayOfDicts +// ORSKIPTAGS: Fibonacci +// ORSKIPTAGS-NOT: RomanNumbers + +// RUN: %Benchmark_O --list | %FileCheck %s --check-prefix LISTPRECOMMIT +// LISTPRECOMMIT: #,Test,[Tags] +// LISTPRECOMMIT-NOT: Ackermann +// LISTPRECOMMIT: {{[0-9]+}},AngryPhonebook + +// RUN: %Benchmark_O --list --skip-tags= | %FileCheck %s --check-prefix LISTALL +// LISTALL: Ackermann +// LISTALL: AngryPhonebook diff --git a/validation-test/Python/benchmark-scripts.test-sh b/test/benchmark/benchmark-scripts.test-sh similarity index 100% rename from validation-test/Python/benchmark-scripts.test-sh rename to test/benchmark/benchmark-scripts.test-sh diff --git a/test/lit.cfg b/test/lit.cfg index 10a0f0dd2cf7f..46f5e7c56fd37 100644 --- a/test/lit.cfg +++ b/test/lit.cfg @@ -272,6 +272,8 @@ config.complete_test = inferSwiftBinary('complete-test') config.swift_api_digester = inferSwiftBinary('swift-api-digester') config.swift_refactor = inferSwiftBinary('swift-refactor') config.swift_demangle_yamldump = inferSwiftBinary('swift-demangle-yamldump') +config.benchmark_o = inferSwiftBinary('Benchmark_O') +config.benchmark_driver = inferSwiftBinary('Benchmark_Driver') config.swift_utils = make_path(config.swift_src_root, 'utils') config.line_directive = make_path(config.swift_utils, 'line-directive') @@ -366,6 +368,8 @@ config.substitutions.append( ('%swift-llvm-opt', config.swift_llvm_opt) ) config.substitutions.append( ('%llvm-dwarfdump', config.llvm_dwarfdump) ) config.substitutions.append( ('%llvm-dis', config.llvm_dis) ) config.substitutions.append( ('%swift-demangle-yamldump', config.swift_demangle_yamldump) ) +config.substitutions.append( ('%Benchmark_O', config.benchmark_o) ) +config.substitutions.append( ('%Benchmark_Driver', config.benchmark_driver) ) # This must come after all substitutions containing "%swift". config.substitutions.append( diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in index 52f935e1095a3..5d4017d092845 100644 --- a/test/lit.site.cfg.in +++ b/test/lit.site.cfg.in @@ -88,6 +88,9 @@ config.available_features.add("CMAKE_GENERATOR=@CMAKE_GENERATOR@") if "@SWIFT_ENABLE_SOURCEKIT_TESTS@" == "TRUE": config.available_features.add('sourcekit') +if "@SWIFT_BUILD_PERF_TESTSUITE@" == "TRUE": + config.available_features.add('benchmark') + if "@SWIFT_ENABLE_GUARANTEED_NORMAL_ARGUMENTS@" == "TRUE": config.available_features.add('plus_zero_runtime') else:
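The `lit.cfg` and `lit.site.cfg.in` changes above wire the new tests into the test runner: the `benchmark` feature is only available when the performance suite was built (`SWIFT_BUILD_PERF_TESTSUITE`), so tests marked `REQUIRES: benchmark` are reported as unsupported otherwise, and `%Benchmark_O` / `%Benchmark_Driver` expand to the binaries located by `inferSwiftBinary`. As a toy illustration of what such a substitution does to a RUN line (the paths are invented, and lit's real substitution machinery is more involved):

```python
# Toy model of lit substitution: RUN lines are textually expanded before they
# are executed. The paths are invented; in lit.cfg they come from inferSwiftBinary().
substitutions = [
    ("%Benchmark_Driver", "/build/swift/benchmark/scripts/Benchmark_Driver"),
    ("%Benchmark_O", "/build/swift/bin/Benchmark_O"),
]

def expand(run_line):
    for pattern, replacement in substitutions:
        run_line = run_line.replace(pattern, replacement)
    return run_line

print(expand("%Benchmark_Driver run 1 Ackermann 1"))
# /build/swift/benchmark/scripts/Benchmark_Driver run 1 Ackermann 1
```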