From 0b91037857c59db513ef18a8f7e9831534549a27 Mon Sep 17 00:00:00 2001 From: Chris LaPointe Date: Tue, 7 Jun 2022 21:50:09 -0400 Subject: [PATCH] Enable follow tailing (#70) * Enable follow tailing --- README.md | 2 +- cmd/helpers/extractorBuilder.go | 10 ++- docs/cli-help.md | Bin 7493 -> 8460 bytes docs/index.md | 2 +- docs/usage/extractor.md | 6 +- docs/usage/input.md | 100 +++++++++++++++++++++ docs/usage/overview.md | 20 ++++- mkdocs.yml | 1 + pkg/extractor/batchers/tailBatcher.go | 8 +- pkg/extractor/batchers/tailBatcher_test.go | 48 +++++++++- 10 files changed, 185 insertions(+), 12 deletions(-) create mode 100644 docs/usage/input.md diff --git a/README.md b/README.md index d049e9e..4494168 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ See [rare.zdyn.net](https://rare.zdyn.net) or the [docs/ folder](docs/) for the * Multiple summary formats including: filter (like grep), histogram, bar graphs, and numerical analysis * File glob expansions (eg `/var/log/*` or `/var/log/*/*.log`) and `-R` * Optional gzip decompression (with `-z`) - * Following `-f` or re-open following `-F` (use `--poll` to poll) + * Following `-f` or re-open following `-F` (use `--poll` to poll, and `--tail` to tail) * Ignoring lines that match an expression (with `-i`) * Aggregating and realtime summary (Don't have to wait for all data to be scanned) * Multi-threaded reading, parsing, and aggregation (It's fast) diff --git a/cmd/helpers/extractorBuilder.go b/cmd/helpers/extractorBuilder.go index 5d3b8b8..bc82c99 100644 --- a/cmd/helpers/extractorBuilder.go +++ b/cmd/helpers/extractorBuilder.go @@ -18,6 +18,7 @@ const DefaultArgumentDescriptor = "<-|filename|glob...>" func BuildBatcherFromArguments(c *cli.Context) *batchers.Batcher { var ( follow = c.Bool("follow") || c.Bool("reopen") + followTail = c.Bool("tail") followReopen = c.Bool("reopen") followPoll = c.Bool("poll") concurrentReaders = c.Int("readers") @@ -35,6 +36,9 @@ func BuildBatcherFromArguments(c *cli.Context) 
*batchers.Batcher { if followPoll && !follow { logger.Fatalf("Follow (-f) must be enabled for --poll") } + if followTail && !follow { + logger.Fatalf("Follow (-f) must be enabled for --tail") + } fileglobs := c.Args() @@ -50,7 +54,7 @@ func BuildBatcherFromArguments(c *cli.Context) *batchers.Batcher { if gunzip { logger.Println("Cannot combine -f and -z") } - return batchers.TailFilesToChan(dirwalk.GlobExpand(fileglobs, recursive), batchSize, followReopen, followPoll) + return batchers.TailFilesToChan(dirwalk.GlobExpand(fileglobs, recursive), batchSize, followReopen, followPoll, followTail) } else { // Read (no-follow) source file(s) return batchers.OpenFilesToChan(dirwalk.GlobExpand(fileglobs, recursive), gunzip, concurrentReaders, batchSize) } @@ -102,6 +106,10 @@ func getExtractorFlags() []cli.Flag { Name: "poll", Usage: "When following a file, poll for changes rather than using inotify", }, + cli.BoolFlag{ + Name: "tail,t", + Usage: "When following a file, navigate to the end of the file to skip existing content", + }, cli.BoolFlag{ Name: "posix,p", Usage: "Compile regex as against posix standard", diff --git a/docs/cli-help.md b/docs/cli-help.md index 5517980335123a551a000bbc6441f892762f16c2..dcc24206402aba0a0d3b3b373efa7318e9c1eba9 100644 GIT binary patch delta 895 zcmb`Gze)o^5XMmn2BIQ@2>xRfL_NJk5L6DtKZs(X*l1z5=H|}D%WT-4#Kgj0uu!kF z@Ev3uUqBG-ENv`&2|L}(BgAyeEZ;Z38NPW9Zny6)yUhLd#H7*?D_{lZ%vBSVN?K?v zUWH9667uuoMQ&Bbq%`_vVQ>-rea{PEs}@&~7&OM%$kb;D**{1!NJg@(qy5eGcIR2h z#Hl!)>Kj`lQJ1k`%oKc%fE0p?TF*!(t+Kh0RltTcEGdSvB6fWq@;j%V=aVz?saEm4 z_37yVjmP6Q2l?+bPuvk_*^;UiKd&NdM$~1=Y#t6>I^@h$kcf diff --git a/docs/index.md b/docs/index.md index 52a6240..777c5dc 100644 --- a/docs/index.md +++ b/docs/index.md @@ -19,7 +19,7 @@ Supports various CLI-based graphing and metric formats (filter (grep-like), hist * Multiple summary formats including: filter (like grep), histogram, bar graphs, and numerical analysis * File glob expansions (eg `/var/log/*` or `/var/log/*/*.log`) and `-R` * 
Optional gzip decompression (with `-z`) - * Following `-f` or re-open following `-F` (use `--poll` to poll) + * Following `-f` or re-open following `-F` (use `--poll` to poll, and `--tail` to tail) * Ignoring lines that match an expression (with `-i`) * Aggregating and realtime summary (Don't have to wait for all data to be scanned) * Multi-threaded reading, parsing, and aggregation (It's fast) diff --git a/docs/usage/extractor.md b/docs/usage/extractor.md index 0f4981c..d8baf56 100644 --- a/docs/usage/extractor.md +++ b/docs/usage/extractor.md @@ -44,4 +44,8 @@ get something that will count based on keys. ```bash rare histogram -m '"(\w{3,4}) ([A-Za-z0-9/.@_-]+)' -e '{1} {2}' -b access.log -``` \ No newline at end of file +``` + +## See Also + +* [Regular Expressions](regexp.md) \ No newline at end of file diff --git a/docs/usage/input.md b/docs/usage/input.md new file mode 100644 index 0000000..2684a76 --- /dev/null +++ b/docs/usage/input.md @@ -0,0 +1,100 @@ +# Input + +*rare* reads the supplied inputs in massive parallelization, rather +than in-order reads. In most cases, you won't need to do anything +other than specifying what to read. In some cases, you may want to +tweak some parameters. + +## Input Methods + +### Read File(s) + +The simplest version of reading files is by specifying one or more filename: + +`rare file1 file2 file3...` + +You can also use simple expansions, such as: + +`rare path/**/*.log` + +In this case, all `*.log` files in any nested directory under `path/` will be read. + +or you can use recursion, which will read all plain files in the path + +`rare -R path/` + +#### gzip + +If the files *may* be gzip'd you can specify `-z`, and will be gunzip'd if able. If a +file can't be opened as a gzip file, a warning will be logged, and it will be interpreted +as a raw file. + +`rare -z *.log.gz` + +### Following File(s) + +Like `tail -f`, following files allows you to watch files actively being written to. 
This is +useful, for example, to read a log of an actively running application. + +**Note:** When following files, all files are open at once, and max readers are ignored. + +`rare -f app.log` + +If the file may be deleted and recreated, such as in a log-rotation, you can follow with re-open + +`rare -F app.log` + +#### Polling (Instead of blocking) + +By default, following a file uses `fsnotify` which monitors files for changes. This should +work fine for most major operating systems. If not, you can enable polling to watch for changes +instead with `--poll` + +#### Tailing + +If you wish to only start reading at the end of the file (eg. only looking at newer entries), +you can specify `-t` or `--tail` to start following at the end. + +### Stdin/Pipe + +There are two ways to read from a pipe: implicit and explicit. + +Implicitly, if *rare* detects its stdin is a pipe, it will read it simply by not providing any arguments + +`cat file.log | rare ` or `rare < file.log` + +Explicitly, you can pass a single read argument of `-` (dash) to mandate reading from stdin + +`cat file.log | rare -` + +## Tweaking the Batcher + +There are already some heuristics that optimize how files are read which +should work for most cases. If you do find you need to modify how *rare* +is reading, you can tweak two things: + +* concurrency -- How many files are read at once +* batch size -- How many lines read from a given file are "batched" to send to the expression stage + +### Concurrency + +Concurrency specifies how many files are opened at once (in a normal case). It +defaults to `3`, but is ignored if following files. + +Specify with: + +`rare --readers=1 file1 file2 file3...` + +### Batch Sizes + +Rare reads (by default) 1000 lines in a file, for a batch, before providing it +to the extractor stage. This significantly speeds up processing, but comes +at the cost of being less real-time if input generation is slow. 
+ +To counteract this, in the *follow* or *stdin* cases, there's also a flush timeout of +250ms. This means if a new line has been received, and the duration has passed, +that the batch will be processed regardless of its current size. + +You can tweak this value with `--batch` + +`rare --batch=10 ...` diff --git a/docs/usage/overview.md b/docs/usage/overview.md index 8b17e19..4466d98 100644 --- a/docs/usage/overview.md +++ b/docs/usage/overview.md @@ -3,11 +3,23 @@ Rare is a fast, realtime regex-extraction, and aggregation into common formats such as histograms, numerical summaries, tables, and more! -Rare is composed of three parts in the pipeline: +Rare is composed of four parts in the pipeline: -1. Extraction (Matching) -2. Expression Building -3. Aggregation +1. Batching (Loading) +2. Extraction (Matching) +3. Expression Building +4. Aggregation + +## Input (Batching/Loading) + +Input (or batching) is the act of feeding contents read from a file (or stdin/pipe) into +the next stages. Many times this is invisible, and is simply the pipe or specified filename. + +It is possible to tune the batcher to follow the file or batch in different ways. 
+ +Read more at: + +* [input](input.md) ## Extraction (Matching) diff --git a/mkdocs.yml b/mkdocs.yml index e400015..bb32613 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -10,6 +10,7 @@ nav: - Home: index.md - Usage: - Overview: usage/overview.md + - Input: usage/input.md - Extractor: usage/extractor.md - Expressions: usage/expressions.md - Aggregators: usage/aggregators.md diff --git a/pkg/extractor/batchers/tailBatcher.go b/pkg/extractor/batchers/tailBatcher.go index 663ef8d..5d64247 100644 --- a/pkg/extractor/batchers/tailBatcher.go +++ b/pkg/extractor/batchers/tailBatcher.go @@ -8,7 +8,7 @@ import ( // TailFilesToChan tails a set of files to an input batcher that can be consumed by extractor // unlike a normal file batcher, this will attempt to tail all files at once -func TailFilesToChan(filenames <-chan string, batchSize int, reopen, poll bool) *Batcher { +func TailFilesToChan(filenames <-chan string, batchSize int, reopen, poll, tail bool) *Batcher { out := newBatcher(128) go func() { @@ -28,6 +28,12 @@ func TailFilesToChan(filenames <-chan string, batchSize int, reopen, poll bool) out.incErrors() return } + if tail { + if err := r.Drain(); err != nil { + logger.Print("Unable to tail file source: ", err) + out.incErrors() + } + } out.syncReaderToBatcherWithTimeFlush(filename, r, batchSize, AutoFlushTimeout) }(filename) diff --git a/pkg/extractor/batchers/tailBatcher_test.go b/pkg/extractor/batchers/tailBatcher_test.go index 5818c96..6cb04c1 100644 --- a/pkg/extractor/batchers/tailBatcher_test.go +++ b/pkg/extractor/batchers/tailBatcher_test.go @@ -1,19 +1,61 @@ package batchers import ( + "io/ioutil" + "os" "testing" + "time" "github.com/stretchr/testify/assert" ) -func TestBatchTailFile(t *testing.T) { +func TestBatchFollowFile(t *testing.T) { filenames := make(chan string, 1) filenames <- "tailBatcher_test.go" // me - batcher := TailFilesToChan(filenames, 5, false, false) + batcher := TailFilesToChan(filenames, 5, false, false, false) - batch := <-batcher.c 
+ batch := <-batcher.BatchChan() assert.Equal(t, "tailBatcher_test.go", batch.Source) assert.Len(t, batch.Batch, 5) assert.NotZero(t, batcher.ReadBytes()) } + +func TestBatchFollowTailFile(t *testing.T) { + tmp, err := ioutil.TempFile("", "followtest-") + if err != nil { + panic(err) + } + defer tmp.Close() + defer os.Remove(tmp.Name()) + + // Add test data + for i := 0; i < 10; i++ { + tmp.WriteString("abc\n") + } + + // Now tail the file + filenames := make(chan string, 1) + filenames <- tmp.Name() + + batcher := TailFilesToChan(filenames, 1, false, false, true) + + time.Sleep(300 * time.Millisecond) // Uhg hack cause auto-flushing + + // And write some more data + const testLines = 5 + for i := 0; i < testLines; i++ { + tmp.WriteString("abc\n") + } + + // And finally assert we got what we wanted + for i := 0; i < testLines; i++ { + batch, ok := <-batcher.BatchChan() + assert.True(t, ok) + if ok { + assert.Equal(t, tmp.Name(), batch.Source) + assert.Equal(t, uint64(i+1), batch.BatchStart) + assert.Len(t, batch.Batch, 1) + } + } +}