// lineFeedType reports which line terminator the data in reader uses, judged
// by the first line break it contains.
//
// The reader is rewound to its start before scanning. The result is true when
// the first terminator byte found is a carriage return ('\r') and false when
// it is a bare line feed ('\n'). An error is returned when the seek or a read
// fails, or when the data ends without containing either byte. The read
// position after the call is wherever scanning stopped; callers that need the
// start must seek again.
func lineFeedType(reader io.ReadSeeker) (bool, error) {
	if _, err := reader.Seek(0, io.SeekStart); err != nil {
		return false, err
	}
	buf := make([]byte, 1024)
	for {
		// Read has to be issued on every pass; reading once and looping on
		// the stale result never terminates when the first chunk holds no
		// line break.
		n, err := reader.Read(buf)
		// Look at the bytes before the error: a Reader is allowed to return
		// data together with io.EOF.
		for _, c := range buf[:n] {
			switch c {
			case '\r':
				return true, nil
			case '\n':
				return false, nil
			}
		}
		if err == io.EOF {
			return false, fmt.Errorf("no carriage return or line feed")
		}
		if err != nil {
			return false, err
		}
	}
}
+// +// It tries to work out the type of file by: +// - Looking for 'Content-Length' in a message's header +// - Looking for '>From ' or '>>From ' (or any number of > character in front +// of "From "') in the message's body. +// - Using the length of the 'Content-Length', if present, to determine when +// the body of the message is complete. +// +// With this information, it can guess if the mbox matches one of the file +// types supported by this library with some degree of certainty. +func DetectType(reader io.ReadSeeker) (mboxType int, err error) { + rdMatch := regexp.MustCompile(`^>*>From `) + clMatch := regexp.MustCompile(`^Content-Length:`) + feedType, err := lineFeedType(reader) + if err != nil { + return -1, err + } + _, err = reader.Seek(0, io.SeekStart) + if err != nil { + return -1, err + } + inHeader := false + scanner := bufio.NewScanner(reader) + var hasRd bool = false + var hasCL bool = false + var count int64 = 0 + var clLen int64 = 0 + var finishedFirst bool = false + for scanner.Scan() { + // NOTE: + // The man page for mbox indicates that one can tell different mailings + // apart via lines that start with From followed by a space. It also + // states that the message is RFC 822 encoded. An RFC 822 encoded text + // message separates the header from the body via a 'null' line (CRLF) + // Ergo, for our purposes, we'll scan over the file, assuming From_ + // indicates the start of the header, and a null line indicates the + // start of the body. + + // Trimming the space to ensure extranous characters won't figure into + // this. 
+ line := strings.TrimSpace(scanner.Text()) + if inHeader && len(line) == 0 { + inHeader = false + count = 0 + finishedFirst = true + continue + } + if !inHeader && !hasCL && strings.HasPrefix(line, "From ") { + inHeader = true + } + if !inHeader && hasCL { + count += int64(len(scanner.Text())) + 1 + if feedType { + count += 1 + } + } + + matchRd := rdMatch.MatchString(line) + matchCL := clMatch.MatchString(line) + + if inHeader && matchCL { + hasCL = true + // We have a content-length. We need to parse it to determine the + // length of the body. + sp := strings.Split(line, ":") + if len(sp) < 2 { + // well, er, this is awkward... + continue + } + intCL, err := strconv.ParseInt(strings.TrimSpace(sp[1]), 10, 64) + if err != nil { + // I guess that wasn't an int. + continue + } + clLen = intCL + } + + if !inHeader && matchRd { + hasRd = true + } + + if hasRd && hasCL { + // We have enough evidence: + // This has content length & >From_ in the body. + return MBOXCL, nil + } + if hasCL && !inHeader && strings.HasPrefix(line, "From ") { + // We have enough evidence: + // This has content length, we're in the body, and the line starts + // with 'From '. + return MBOXCL2, nil + } + if !inHeader && clLen == count { + count = 0 + finishedFirst = true + // This isn't technically true, but helps for our tests. + inHeader = true + } + + if finishedFirst && !hasCL && hasRd { + // We have enough evidence: + // This doesn't have content length, but does have lines in the + // body starting with >First. + return MBOXRD, nil + } + } + if hasCL && !hasRd { + // We don't really know. It could be MBOXCL2, or MBOXCL. We will err on the side of caution. 
+ return MBOXCL, nil + } + return MBOXO, nil +} diff --git a/detect_test.go b/detect_test.go new file mode 100644 index 0000000..72634fe --- /dev/null +++ b/detect_test.go @@ -0,0 +1,165 @@ +package mbox_test + +import ( + "strings" + "testing" + + "github.com/tvanriper/mbox" +) + +func TestDetectRD(t *testing.T) { + mb := `From someone +From: chuckles@funbunny.org +To: hhefner@playboy.com +Subject: Closed Captioning + +>From the engineers: do we really need closed captioning on this material? +Please let me know, it'd save a lot of money if we could avoid it. + +From hhefner +From: hhefner@playboy.com +To: chuckles@funbunny.org +Subject: RE: Closed Captioning + +Yes, silly as it sounds, we broadcast this material, and it must therefore have +closed captioning. The deaf will enjoy reading the material. +` + mType, err := mbox.DetectType(strings.NewReader(mb)) + if err != nil { + t.Error(err) + } + if mType != mbox.MBOXRD { + t.Errorf("expected %d but got %d", mbox.MBOXRD, mType) + } + + // Ensure we also detect MBOXRD if the second mail has the character. + mb = `From someone +From: chuckles@funbunny.org +To: hhefner@playboy.com +Subject: Closed Captioning + +Do we really need closed captioning on this material? Please let me know, it'd +save a lot of money if we could avoid it. + +From hhefner +From: hhefner@playboy.com +To: chuckles@funbunny.org +Subject: RE: Closed Captioning + +>From my lawyers: yes, silly as it sounds, we broadcast this material, and it +must therefore have closed captioning. The deaf will enjoy reading the +material. +` + mType, err = mbox.DetectType(strings.NewReader(mb)) + if err != nil { + t.Error(err) + } + if mType != mbox.MBOXRD { + t.Errorf("expected %d but got %d", mbox.MBOXRD, mType) + } +} + +func TestDetectCL(t *testing.T) { + mb := `From someone +From: bubbles@bubbletown.com +To: mrmxpdstk@lazytown.com +Subject: To interpretation +Content-Length: 33 + +We should all try to enjoy life! 
+From someone-else +>From mug: weird header +Content-Length: 130 +From: mrspam@corporate.corp.com +To: mrmxpdstk@lazytown.com +Subject: Bestest offer in the universe!!11!! + +You won't believe these prices! +>From 1 cent to 11 cents, we carry the least expensive +line of jets this side of the Gobi Desert! +From nobody +From: nobody@nowhere.man +To: mrmxpdstk@lazytown.com +Subject: Mysterious Jenkins +Content-Length: 0 + +` + mType, err := mbox.DetectType(strings.NewReader(mb)) + if err != nil { + t.Error(err) + } + if mType != mbox.MBOXCL { + t.Errorf("expected %d but got %d", mbox.MBOXCL, mType) + } +} + +func TestDetectCL2(t *testing.T) { + mb := `From someone +From: bubbles@bubbletown.com +To: mrmxpdstk@lazytown.com +Subject: To interpretation +Content-Length: 33 + +We should all try to enjoy life! +From someone-else +>From mug: weird header +Content-Length: 129 +From: mrspam@corporate.corp.com +To: mrmxpdstk@lazytown.com +Subject: Bestest offer in the universe!!11!! + +You won't believe these prices! +From 1 cent to 11 cents, we carry the least expensive +line of jets this side of the Gobi Desert! +From nobody +From: nobody@nowhere.man +To: mrmxpdstk@lazytown.com +Subject: Mysterious Jenkins +Content-Length: 0 + +` + mType, err := mbox.DetectType(strings.NewReader(mb)) + if err != nil { + t.Error(err) + } + if mType != mbox.MBOXCL2 { + t.Errorf("expected %d but got %d", mbox.MBOXCL2, mType) + } +} + +func TestDetectO(t *testing.T) { + mb := `From someone +From: bubbles@bubbletown.com +To: mrmxpdstk@lazytown.com +Subject: To interpretation + +From all of us, to all of you, be happy! +From someone-else +From: mrspam@corporate.corp.com +To: mrmxpdstk@lazytown.com +Subject: Bestest offer in the universe!!11!! + +You won't believe these prices! +From 1 cent to 11 cents, we carry the least expensive +line of jets this side of the Gobi Desert! 
+` + mType, err := mbox.DetectType(strings.NewReader(mb)) + if err != nil { + t.Error(err) + } + if mType != mbox.MBOXO { + t.Errorf("expected %d but got %d", mbox.MBOXO, mType) + } +} + +func TestDetectAltLinefeed(t *testing.T) { + mb := "From someone\r\nFrom: chuckles@funbunny.org\r\nTo: hhefner@playboy.com\r\nSubject: Closed Captioning\r\n\r\n>From the engineers: do we really need closed captioning on this material?\r\nPlease let me know, it'd save a lot of money if we could avoid it.\r\n\r\nFrom hhefner\r\nFrom: hhefner@playboy.com\r\nTo: chuckles@funbunny.org\r\nSubject: RE: Closed Captioning\r\n\r\nYes, silly as it sounds, we broadcast this material, and it must therefore have\r\nclosed captioning. The deaf will enjoy reading the material.\r\n" + mType, err := mbox.DetectType(strings.NewReader(mb)) + if err != nil { + t.Error(err) + } + if mType != mbox.MBOXRD { + t.Errorf("expected %d but got %d", mbox.MBOXRD, mType) + } + +} diff --git a/example_mboxdetecttype_test.go b/example_mboxdetecttype_test.go new file mode 100644 index 0000000..0f1a868 --- /dev/null +++ b/example_mboxdetecttype_test.go @@ -0,0 +1,50 @@ +package mbox_test + +import ( + "bytes" + "fmt" + + "github.com/tvanriper/mbox" +) + +// Imagine this is a file on your filesystem instead of a variable in your code. +const mboxrd2 string = `From bubbles@bubbletown.com Mon Jul 04 14:23:45 2022 +From: bubbles@bubbletown.com +To: mrmxpdstk@lazytown.com +Subject: To interpretation + +>From all of us, to all of you, be happy! +From mrspam@corporate.corp.com Mon Jul 04 15:02:15 2022 +From: mrspam@corporate.corp.com +To: mrmxpdstk@lazytown.com +Subject: Bestest offer in the universe!!11!! + +You won't believe these prices! +>From 1 cent to 11 cents, we carry the least expensive +line of jets this side of the Gobi Desert! +` + +func ExampleDetectType() { + // Imagine you used os.Open instead of bytes.NewBuffer here. 
+ reader := bytes.NewReader([]byte(mboxrd2)) + mbType, err := mbox.DetectType(reader) + if err != nil { + fmt.Printf("failed to detect mbox type: %s\n", err) + return + } + switch mbType { + case mbox.MBOXO: + fmt.Println("MBOXO") + case mbox.MBOXRD: + fmt.Println("MBOXRD") + case mbox.MBOXCL: + fmt.Println("MBOXCL") + case mbox.MBOXCL2: + fmt.Println("MBOXCL2") + default: + fmt.Println("Unknown") + } + + // Output: + // MBOXRD +}