/
digest.go
555 lines (489 loc) · 20.7 KB
/
digest.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package verify
import (
"bytes"
"crypto/sha256"
"encoding/binary"
"encoding/hex"
"fmt"
"hash"
"io"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"github.com/pkg/errors"
)
// HashVersion is an arbitrary number that identifies the hash algorithm used by
// the directory hasher.
//
// 1: SHA256, as implemented in crypto/sha256
const HashVersion = 1
const osPathSeparator = string(filepath.Separator)
// lineEndingReader is a `io.Reader` that converts CRLF sequences to LF.
//
// When cloning or checking out repositories, some Version Control Systems,
// VCSs, on some supported Go Operating System architectures, GOOS, will
// automatically convert line endings that end in a single line feed byte, LF,
// to line endings that end in a two byte sequence of carriage return, CR,
// followed by LF. This LF to CRLF conversion would cause otherwise identical
// versioned files to have different on disk contents simply based on which VCS
// and GOOS are involved. Different file contents for the same file would cause
// the resultant hashes to differ. In order to ensure file contents normalize
// and produce the same hash, this structure wraps an io.Reader that modifies
// the file's contents when it is read, translating all CRLF sequences to LF.
type lineEndingReader struct {
src io.Reader // source io.Reader from which this reads
prevReadEndedCR bool // used to track whether final byte of previous Read was CR
}
// newLineEndingReader returns a new lineEndingReader that reads from the
// specified source io.Reader.
func newLineEndingReader(src io.Reader) *lineEndingReader {
return &lineEndingReader{src: src}
}
var crlf = []byte("\r\n")
// Read consumes bytes from the structure's source io.Reader to fill the
// specified slice of bytes. It converts all CRLF byte sequences to LF, and
// handles cases where CR and LF straddle across two Read operations.
func (f *lineEndingReader) Read(buf []byte) (int, error) {
buflen := len(buf)
if f.prevReadEndedCR {
// Read one fewer bytes so we have room if the first byte of the
// upcoming Read is not a LF, in which case we will need to insert
// trailing CR from previous read.
buflen--
}
nr, er := f.src.Read(buf[:buflen])
if nr > 0 {
if f.prevReadEndedCR && buf[0] != '\n' {
// Having a CRLF split across two Read operations is rare, so the
// performance impact of copying entire buffer to the right by one
// byte, while suboptimal, will at least will not happen very
// often. This negative performance impact is mitigated somewhat on
// many Go compilation architectures, GOARCH, because the `copy`
// builtin uses a machine opcode for performing the memory copy on
// possibly overlapping regions of memory. This machine opcodes is
// not instantaneous and does require multiple CPU cycles to
// complete, but is significantly faster than the application
// looping through bytes.
copy(buf[1:nr+1], buf[:nr]) // shift data to right one byte
buf[0] = '\r' // insert the previous skipped CR byte at start of buf
nr++ // pretend we read one more byte
}
// Remove any CRLF sequences in the buffer using `bytes.Index` because,
// like the `copy` builtin on many GOARCHs, it also takes advantage of a
// machine opcode to search for byte patterns.
var searchOffset int // index within buffer from whence the search will commence for each loop; set to the index of the end of the previous loop.
var shiftCount int // each subsequenct shift operation needs to shift bytes to the left by one more position than the shift that preceded it.
previousIndex := -1 // index of previously found CRLF; -1 means no previous index
for {
index := bytes.Index(buf[searchOffset:nr], crlf)
if index == -1 {
break
}
index += searchOffset // convert relative index to absolute
if previousIndex != -1 {
// shift substring between previous index and this index
copy(buf[previousIndex-shiftCount:], buf[previousIndex+1:index])
shiftCount++ // next shift needs to be 1 byte to the left
}
previousIndex = index
searchOffset = index + 2 // start next search after len(crlf)
}
if previousIndex != -1 {
// handle final shift
copy(buf[previousIndex-shiftCount:], buf[previousIndex+1:nr])
shiftCount++
}
nr -= shiftCount // shorten byte read count by number of shifts executed
// When final byte from a read operation is CR, do not emit it until
// ensure first byte on next read is not LF.
if f.prevReadEndedCR = buf[nr-1] == '\r'; f.prevReadEndedCR {
nr-- // pretend byte was never read from source
}
} else if f.prevReadEndedCR {
// Reading from source returned nothing, but this struct is sitting on a
// trailing CR from previous Read, so let's give it to client now.
buf[0] = '\r'
nr = 1
er = nil
f.prevReadEndedCR = false // prevent infinite loop
}
return nr, er
}
// writeBytesWithNull appends the specified data to the specified hash, followed by
// the NULL byte, in order to make accidental hash collisions less likely.
func writeBytesWithNull(h hash.Hash, data []byte) {
// Ignore return values from writing to the hash, because hash write always
// returns nil error.
_, _ = h.Write(append(data, 0))
}
// dirWalkClosure is used to reduce number of allocation involved in closing
// over these variables.
type dirWalkClosure struct {
someCopyBufer []byte // allocate once and reuse for each file copy
someModeBytes []byte // allocate once and reuse for each node
someDirLen int
someHash hash.Hash
}
// DigestFromDirectory returns a hash of the specified directory contents, which
// will match the hash computed for any directory on any supported Go platform
// whose contents exactly match the specified directory.
//
// This function ignores any file system node named `vendor`, `.bzr`, `.git`,
// `.hg`, and `.svn`, as these are typically used as Version Control System
// (VCS) directories.
//
// Other than the `vendor` and VCS directories mentioned above, the calculated
// hash includes the pathname to every discovered file system node, whether it
// is an empty directory, a non-empty directory, an empty file, or a non-empty file.
//
// Symbolic links are excluded, as they are not considered valid elements in the
// definition of a Go module.
func DigestFromDirectory(osDirname string) (VersionedDigest, error) {
osDirname = filepath.Clean(osDirname)
// Create a single hash instance for the entire operation, rather than a new
// hash for each node we encounter.
closure := dirWalkClosure{
someCopyBufer: make([]byte, 4*1024), // only allocate a single page
someModeBytes: make([]byte, 4), // scratch place to store encoded os.FileMode (uint32)
someDirLen: len(osDirname) + len(osPathSeparator),
someHash: sha256.New(),
}
err := filepath.Walk(osDirname, func(osPathname string, info os.FileInfo, err error) error {
if err != nil {
return err
}
// Completely ignore symlinks.
if info.Mode()&os.ModeSymlink != 0 {
return nil
}
var osRelative string
if len(osPathname) > closure.someDirLen {
osRelative = osPathname[closure.someDirLen:]
}
switch filepath.Base(osRelative) {
case "vendor", ".bzr", ".git", ".hg", ".svn":
return filepath.SkipDir
}
// We could make our own enum-like data type for encoding the file type,
// but Go's runtime already gives us architecture independent file
// modes, as discussed in `os/types.go`:
//
// Go's runtime FileMode type has same definition on all systems, so
// that information about files can be moved from one system to
// another portably.
var mt os.FileMode
// We only care about the bits that identify the type of a file system
// node, and can ignore append, exclusive, temporary, setuid, setgid,
// permission bits, and sticky bits, which are coincident to bits which
// declare type of the file system node.
modeType := info.Mode() & os.ModeType
var shouldSkip bool // skip some types of file system nodes
switch {
case modeType&os.ModeDir > 0:
mt = os.ModeDir
// This func does not need to enumerate children, because
// filepath.Walk will do that for us.
shouldSkip = true
case modeType&os.ModeNamedPipe > 0:
mt = os.ModeNamedPipe
shouldSkip = true
case modeType&os.ModeSocket > 0:
mt = os.ModeSocket
shouldSkip = true
case modeType&os.ModeDevice > 0:
mt = os.ModeDevice
shouldSkip = true
}
// Write the relative pathname to hash because the hash is a function of
// the node names, node types, and node contents. Added benefit is that
// empty directories, named pipes, sockets, and devices. Use
// `filepath.ToSlash` to ensure relative pathname is os-agnostic.
writeBytesWithNull(closure.someHash, []byte(filepath.ToSlash(osRelative)))
binary.LittleEndian.PutUint32(closure.someModeBytes, uint32(mt)) // encode the type of mode
writeBytesWithNull(closure.someHash, closure.someModeBytes) // and write to hash
if shouldSkip {
return nil // nothing more to do for some of the node types
}
// If we get here, node is a regular file.
fh, err := os.Open(osPathname)
if err != nil {
return errors.Wrap(err, "cannot Open")
}
var bytesWritten int64
bytesWritten, err = io.CopyBuffer(closure.someHash, newLineEndingReader(fh), closure.someCopyBufer) // fast copy of file contents to hash
err = errors.Wrap(err, "cannot Copy") // errors.Wrap only wraps non-nil, so skip extra check
writeBytesWithNull(closure.someHash, []byte(strconv.FormatInt(bytesWritten, 10))) // 10: format file size as base 10 integer
// Close the file handle to the open file without masking
// possible previous error value.
if er := fh.Close(); err == nil {
err = errors.Wrap(er, "cannot Close")
}
return err
})
if err != nil {
return VersionedDigest{}, err
}
return VersionedDigest{
HashVersion: HashVersion,
Digest: closure.someHash.Sum(nil),
}, nil
}
// VendorStatus represents one of a handful of possible status conditions for a
// particular file system node in the vendor directory tree.
type VendorStatus uint8
const (
// NotInLock is used when a file system node exists for which there is no
// corresponding dependency in the lock file.
NotInLock VendorStatus = iota
// NotInTree is used when a lock file dependency exists for which there is
// no corresponding file system node.
NotInTree
// NoMismatch is used when the digest for a dependency listed in the
// lockfile matches what is calculated from the file system.
NoMismatch
// EmptyDigestInLock is used when the digest for a dependency listed in the
// lock file is the empty string. While this is a special case of
// DigestMismatchInLock, separating the cases is a desired feature.
EmptyDigestInLock
// DigestMismatchInLock is used when the digest for a dependency listed in
// the lock file does not match what is calculated from the file system.
DigestMismatchInLock
// HashVersionMismatch indicates that the hashing algorithm used to generate
// the digest being compared against is not the same as the one used by the
// current program.
HashVersionMismatch
)
func (ls VendorStatus) String() string {
switch ls {
case NotInTree:
return "not in tree"
case NotInLock:
return "not in lock"
case NoMismatch:
return "match"
case EmptyDigestInLock:
return "empty digest in lock"
case DigestMismatchInLock:
return "mismatch"
case HashVersionMismatch:
return "hasher changed"
}
return "unknown"
}
// fsnode is used to track which file system nodes are required by the lock
// file. When a directory is found whose name matches one of the declared
// projects in the lock file, e.g., "github.com/alice/alice1", an fsnode is
// created for that directory, but not for any of its children. All other file
// system nodes encountered will result in a fsnode created to represent it.
type fsnode struct {
osRelative string // os-specific relative path of a resource under vendor root
isRequiredAncestor bool // true iff this node or one of its descendants is in the lock file
myIndex, parentIndex int // index of this node and its parent in the tree's slice
}
// VersionedDigest comprises both a hash digest, and a simple integer indicating
// the version of the hash algorithm that produced the digest.
type VersionedDigest struct {
HashVersion int
Digest []byte
}
func (vd VersionedDigest) String() string {
return fmt.Sprintf("%s:%s", strconv.Itoa(vd.HashVersion), hex.EncodeToString(vd.Digest))
}
// IsEmpty indicates if the VersionedDigest is the zero value.
func (vd VersionedDigest) IsEmpty() bool {
return vd.HashVersion == 0 && len(vd.Digest) == 0
}
// ParseVersionedDigest decodes the string representation of versioned digest
// information - a colon-separated string with a version number in the first
// part and the hex-encdoed hash digest in the second - as a VersionedDigest.
func ParseVersionedDigest(input string) (VersionedDigest, error) {
var vd VersionedDigest
var err error
parts := strings.Split(input, ":")
if len(parts) != 2 {
return VersionedDigest{}, errors.Errorf("expected two colon-separated components in the versioned hash digest, got %q", input)
}
if vd.Digest, err = hex.DecodeString(parts[1]); err != nil {
return VersionedDigest{}, err
}
if vd.HashVersion, err = strconv.Atoi(parts[0]); err != nil {
return VersionedDigest{}, err
}
return vd, nil
}
// CheckDepTree verifies a dependency tree according to expected digest sums,
// and returns an associative array of file system nodes and their respective
// vendor status conditions.
//
// The keys to the expected digest sums associative array represent the
// project's dependencies, and each is required to be expressed using the
// solidus character, `/`, as its path separator. For example, even on a GOOS
// platform where the file system path separator is a character other than
// solidus, one particular dependency would be represented as
// "github.com/alice/alice1".
func CheckDepTree(osDirname string, wantDigests map[string]VersionedDigest) (map[string]VendorStatus, error) {
osDirname = filepath.Clean(osDirname)
// Create associative array to store the results of calling this function.
slashStatus := make(map[string]VendorStatus)
// Ensure top level pathname is a directory
fi, err := os.Stat(osDirname)
if err != nil {
// If the dir doesn't exist at all, that's OK - just consider all the
// wanted paths absent.
if os.IsNotExist(err) {
for path := range wantDigests {
slashStatus[path] = NotInTree
}
return slashStatus, nil
}
return nil, errors.Wrap(err, "cannot Stat")
}
if !fi.IsDir() {
return nil, errors.Errorf("cannot verify non directory: %q", osDirname)
}
// Initialize work queue with a node representing the specified directory
// name by declaring its relative pathname under the directory name as the
// empty string.
currentNode := &fsnode{osRelative: "", parentIndex: -1, isRequiredAncestor: true}
queue := []*fsnode{currentNode} // queue of directories that must be inspected
// In order to identify all file system nodes that are not in the lock file,
// represented by the specified expected sums parameter, and in order to
// only report the top level of a subdirectory of file system nodes, rather
// than every node internal to them, we will create a tree of nodes stored
// in a slice. We do this because we cannot predict the depth at which
// project roots occur. Some projects are fewer than and some projects more
// than the typical three layer subdirectory under the vendor root
// directory.
//
// For a following few examples, assume the below vendor root directory:
//
// github.com/alice/alice1/a1.go
// github.com/alice/alice2/a2.go
// github.com/bob/bob1/b1.go
// github.com/bob/bob2/b2.go
// launchpad.net/nifty/n1.go
//
// 1) If only the `alice1` and `alice2` projects were in the lock file, we'd
// prefer the output to state that `github.com/bob` is `NotInLock`, and
// `launchpad.net/nifty` is `NotInLock`.
//
// 2) If `alice1`, `alice2`, and `bob1` were in the lock file, we'd want to
// report `github.com/bob/bob2` as `NotInLock`, and `launchpad.net/nifty` is
// `NotInLock`.
//
// 3) If none of `alice1`, `alice2`, `bob1`, or `bob2` were in the lock
// file, the entire `github.com` directory would be reported as `NotInLock`,
// along with `launchpad.net/nifty` is `NotInLock`.
//
// Each node in our tree has the slice index of its parent node, so once we
// can categorically state a particular directory is required because it is
// in the lock file, we can mark all of its ancestors as also being
// required. Then, when we finish walking the directory hierarchy, any nodes
// which are not required but have a required parent will be marked as
// `NotInLock`.
nodes := []*fsnode{currentNode}
// Mark directories of expected projects as required. When each respective
// project is later found while traversing the vendor root hierarchy, its
// status will be updated to reflect whether its digest is empty, or,
// whether or not it matches the expected digest.
for slashPathname := range wantDigests {
slashStatus[slashPathname] = NotInTree
}
for len(queue) > 0 {
// Pop node from the top of queue (depth first traversal, reverse
// lexicographical order inside a directory), clearing the value stored
// in the slice's backing array as we proceed.
lq1 := len(queue) - 1
currentNode, queue[lq1], queue = queue[lq1], nil, queue[:lq1]
slashPathname := filepath.ToSlash(currentNode.osRelative)
osPathname := filepath.Join(osDirname, currentNode.osRelative)
if expectedSum, ok := wantDigests[slashPathname]; ok {
ls := EmptyDigestInLock
if expectedSum.HashVersion != HashVersion {
if !expectedSum.IsEmpty() {
ls = HashVersionMismatch
}
} else if len(expectedSum.Digest) > 0 {
projectSum, err := DigestFromDirectory(osPathname)
if err != nil {
return nil, errors.Wrap(err, "cannot compute dependency hash")
}
if bytes.Equal(projectSum.Digest, expectedSum.Digest) {
ls = NoMismatch
} else {
ls = DigestMismatchInLock
}
}
slashStatus[slashPathname] = ls
// Mark current nodes and all its parents as required.
for i := currentNode.myIndex; i != -1; i = nodes[i].parentIndex {
nodes[i].isRequiredAncestor = true
}
// Do not need to process this directory's contents because we
// already accounted for its contents while calculating its digest.
continue
}
osChildrenNames, err := sortedChildrenFromDirname(osPathname)
if err != nil {
return nil, errors.Wrap(err, "cannot get sorted list of directory children")
}
for _, osChildName := range osChildrenNames {
switch osChildName {
case ".", "..", "vendor", ".bzr", ".git", ".hg", ".svn":
// skip
default:
osChildRelative := filepath.Join(currentNode.osRelative, osChildName)
osChildPathname := filepath.Join(osDirname, osChildRelative)
// Create a new fsnode for this file system node, with a parent
// index set to the index of the current node.
otherNode := &fsnode{osRelative: osChildRelative, myIndex: len(nodes), parentIndex: currentNode.myIndex}
fi, err := os.Stat(osChildPathname)
if err != nil {
return nil, errors.Wrap(err, "cannot Stat")
}
nodes = append(nodes, otherNode) // Track all file system nodes...
if fi.IsDir() {
queue = append(queue, otherNode) // but only need to add directories to the work queue.
}
}
}
}
// Ignoring first node in the list, walk nodes from last to first. Whenever
// the current node is not required, but its parent is required, then the
// current node ought to be marked as `NotInLock`.
for len(nodes) > 1 {
// Pop node from top of queue, clearing the value stored in the slice's
// backing array as we proceed.
ln1 := len(nodes) - 1
currentNode, nodes[ln1], nodes = nodes[ln1], nil, nodes[:ln1]
if !currentNode.isRequiredAncestor && nodes[currentNode.parentIndex].isRequiredAncestor {
slashStatus[filepath.ToSlash(currentNode.osRelative)] = NotInLock
}
}
currentNode, nodes = nil, nil
return slashStatus, nil
}
// sortedChildrenFromDirname returns a lexicographically sorted list of child
// nodes for the specified directory.
func sortedChildrenFromDirname(osDirname string) ([]string, error) {
fh, err := os.Open(osDirname)
if err != nil {
return nil, errors.Wrap(err, "cannot Open")
}
osChildrenNames, err := fh.Readdirnames(0) // 0: read names of all children
if err != nil {
return nil, errors.Wrap(err, "cannot Readdirnames")
}
sort.Strings(osChildrenNames)
// Close the file handle to the open directory without masking possible
// previous error value.
if er := fh.Close(); err == nil {
err = errors.Wrap(er, "cannot Close")
}
return osChildrenNames, err
}