Skip to content

Commit

Permalink
adding badge, adding DetectType()
Browse files Browse the repository at this point in the history
Signed-off-by: Joseph Edwards Van Riper III <vanriper.trey@gmail.com>
  • Loading branch information
tvanriper committed Oct 29, 2023
1 parent 3d66d31 commit 8e5cc68
Show file tree
Hide file tree
Showing 4 changed files with 366 additions and 1 deletion.
7 changes: 6 additions & 1 deletion README.md
Expand Up @@ -7,6 +7,8 @@
Supporting four different mbox file formats in pure golang.

Package mbox implements a reader and writer for working with mbox files.
It also provides a tool to potentially determine the type of mbox format,
although it isn't possible to create a tool that can definitively determine this.

The package supports four types of mbox files:

Expand All @@ -29,7 +31,10 @@ Use `mboxcl2` to address the lines starting with 'From ' by doing what
mboxcl does, except it doesn't add '>' characters at all.

You may need to know which type to use when reading or writing an mbox, for
best results.
best results. However, you can try using `DetectType()` to work out the type
of mbox. Thanks go to [BenjamenMeyer's Thunderbird Mailbox Deduper
code](https://github.com/BenjamenMeyer/go-tb-dedup) for incentivizing me to
create `DetectType()`, even if I took a different approach.

NOTE: These routines do not concern themselves with file locking. You may want
to consider that while working with mbox files on systems that might actively
Expand Down
145 changes: 145 additions & 0 deletions detect.go
@@ -0,0 +1,145 @@
package mbox

import (
"bufio"
"fmt"
"io"
"regexp"
"strconv"
"strings"
)

// lineFeedType rewinds reader to its start and reports which line terminator
// byte appears first in the data.
//
// It returns true if a carriage return ('\r') is found before any line feed,
// and false if a line feed ('\n') is found first. If the data ends without
// containing either byte, it returns an error. Seek and read failures are
// returned as-is.
//
// The reader is left positioned wherever the last Read stopped; callers that
// need to re-read the data must seek again.
func lineFeedType(reader io.ReadSeeker) (bool, error) {
	if _, err := reader.Seek(0, io.SeekStart); err != nil {
		return false, err
	}
	buf := make([]byte, 1024)
	for {
		n, err := reader.Read(buf)
		// Look at the bytes before looking at err: an io.Reader is allowed
		// to hand back data together with an error (including io.EOF).
		for _, c := range buf[:n] {
			switch c {
			case '\r':
				return true, nil
			case '\n':
				return false, nil
			}
		}
		if err == io.EOF {
			return false, fmt.Errorf("no carriage return or line feed")
		}
		if err != nil {
			return false, err
		}
	}
}

// DetectType attempts to figure out the type of mbox the reader holds. This
// is a best-effort attempt to determine the type of mbox file format based on
// what it sees within the text.
//
// It tries to work out the type of file by:
//   - Looking for 'Content-Length' in a message's header
//   - Looking for '>From ' or '>>From ' (or any number of '>' characters in
//     front of 'From ') in the message's body.
//   - Using the length of the 'Content-Length', if present, to determine when
//     the body of the message is complete.
//
// With this information, it can guess if the mbox matches one of the file
// types supported by this library with some degree of certainty.
//
// The reader is rewound to its start before scanning. The result is:
//   - MBOXCL as soon as both a Content-Length header and a '>From ' body line
//     have been seen, or, after the whole input was scanned, when a
//     Content-Length header was seen but no '>From ' body line.
//   - MBOXCL2 as soon as a Content-Length header has been seen and a body
//     line starts with 'From '.
//   - MBOXRD as soon as a '>From ' body line has been seen, no Content-Length
//     header has been seen, and at least one header has been completed.
//   - MBOXO otherwise.
//
// On failure it returns -1 and the error: when the line terminator cannot be
// determined, when seeking fails, or when scanning the input fails (for
// example because a line is too long for bufio.Scanner).
func DetectType(reader io.ReadSeeker) (mboxType int, err error) {
	rdMatch := regexp.MustCompile(`^>*>From `)
	clMatch := regexp.MustCompile(`^Content-Length:`)
	feedType, err := lineFeedType(reader)
	if err != nil {
		return -1, err
	}
	_, err = reader.Seek(0, io.SeekStart)
	if err != nil {
		return -1, err
	}
	inHeader := false
	scanner := bufio.NewScanner(reader)
	var hasRd bool        // a '>From ' line was seen outside a header
	var hasCL bool        // a Content-Length header was seen
	var count int64       // bytes of body counted so far for the current message
	var clLen int64       // most recently parsed Content-Length value
	var finishedFirst bool // at least one header/body boundary has been passed
	for scanner.Scan() {
		// NOTE:
		// The man page for mbox indicates that one can tell different mailings
		// apart via lines that start with From followed by a space. It also
		// states that the message is RFC 822 encoded. An RFC 822 encoded text
		// message separates the header from the body via a 'null' line (CRLF)
		// Ergo, for our purposes, we'll scan over the file, assuming From_
		// indicates the start of the header, and a null line indicates the
		// start of the body.

		// Trimming the space to ensure extraneous characters won't figure into
		// this.
		line := strings.TrimSpace(scanner.Text())
		if inHeader && len(line) == 0 {
			inHeader = false
			count = 0
			finishedFirst = true
			continue
		}
		if !inHeader && !hasCL && strings.HasPrefix(line, "From ") {
			inHeader = true
		}
		if !inHeader && hasCL {
			// Count the untrimmed line plus the terminator the scanner
			// stripped: one byte for LF, two for CRLF.
			count += int64(len(scanner.Text())) + 1
			if feedType {
				count += 1
			}
		}

		matchRd := rdMatch.MatchString(line)
		matchCL := clMatch.MatchString(line)

		if inHeader && matchCL {
			hasCL = true
			// We have a content-length. We need to parse it to determine the
			// length of the body.
			sp := strings.Split(line, ":")
			if len(sp) < 2 {
				// well, er, this is awkward...
				continue
			}
			intCL, perr := strconv.ParseInt(strings.TrimSpace(sp[1]), 10, 64)
			if perr != nil {
				// I guess that wasn't an int.
				continue
			}
			clLen = intCL
		}

		if !inHeader && matchRd {
			hasRd = true
		}

		if hasRd && hasCL {
			// We have enough evidence:
			// This has content length & >From_ in the body.
			return MBOXCL, nil
		}
		if hasCL && !inHeader && strings.HasPrefix(line, "From ") {
			// We have enough evidence:
			// This has content length, we're in the body, and the line starts
			// with 'From '.
			return MBOXCL2, nil
		}
		if !inHeader && clLen == count {
			count = 0
			finishedFirst = true
			// This isn't technically true, but helps for our tests.
			inHeader = true
		}

		if finishedFirst && !hasCL && hasRd {
			// We have enough evidence:
			// This doesn't have content length, but does have lines in the
			// body starting with >From.
			return MBOXRD, nil
		}
	}
	// Scan returns false both at end of input and on failure. Without this
	// check a read error or an over-long line would be reported as a guessed
	// type based on only part of the input.
	if serr := scanner.Err(); serr != nil {
		return -1, serr
	}
	if hasCL && !hasRd {
		// We don't really know. It could be MBOXCL2, or MBOXCL. We will err on the side of caution.
		return MBOXCL, nil
	}
	return MBOXO, nil
}
165 changes: 165 additions & 0 deletions detect_test.go
@@ -0,0 +1,165 @@
package mbox_test

import (
"strings"
"testing"

"github.com/tvanriper/mbox"
)

// TestDetectRD checks that DetectType reports MBOXRD for two fixtures: one
// where the '>From ' line appears in the first message, and one where it
// only appears in the second message.
func TestDetectRD(t *testing.T) {
	samples := []string{
		// '>From ' line in the first mail.
		`From someone
From: chuckles@funbunny.org
To: hhefner@playboy.com
Subject: Closed Captioning
>From the engineers: do we really need closed captioning on this material?
Please let me know, it'd save a lot of money if we could avoid it.
From hhefner
From: hhefner@playboy.com
To: chuckles@funbunny.org
Subject: RE: Closed Captioning
Yes, silly as it sounds, we broadcast this material, and it must therefore have
closed captioning. The deaf will enjoy reading the material.
`,
		// Ensure we also detect MBOXRD if the second mail has the character.
		`From someone
From: chuckles@funbunny.org
To: hhefner@playboy.com
Subject: Closed Captioning
Do we really need closed captioning on this material? Please let me know, it'd
save a lot of money if we could avoid it.
From hhefner
From: hhefner@playboy.com
To: chuckles@funbunny.org
Subject: RE: Closed Captioning
>From my lawyers: yes, silly as it sounds, we broadcast this material, and it
must therefore have closed captioning. The deaf will enjoy reading the
material.
`,
	}
	for _, sample := range samples {
		got, err := mbox.DetectType(strings.NewReader(sample))
		if err != nil {
			t.Error(err)
		}
		if got != mbox.MBOXRD {
			t.Errorf("expected %d but got %d", mbox.MBOXRD, got)
		}
	}
}

// TestDetectCL checks that DetectType reports MBOXCL for a fixture that
// carries Content-Length headers and a '>From ' line in a message.
func TestDetectCL(t *testing.T) {
	fixture := `From someone
From: bubbles@bubbletown.com
To: mrmxpdstk@lazytown.com
Subject: To interpretation
Content-Length: 33
We should all try to enjoy life!
From someone-else
>From mug: weird header
Content-Length: 130
From: mrspam@corporate.corp.com
To: mrmxpdstk@lazytown.com
Subject: Bestest offer in the universe!!11!!
You won't believe these prices!
>From 1 cent to 11 cents, we carry the least expensive
line of jets this side of the Gobi Desert!
From nobody
From: nobody@nowhere.man
To: mrmxpdstk@lazytown.com
Subject: Mysterious Jenkins
Content-Length: 0
`
	got, err := mbox.DetectType(strings.NewReader(fixture))
	if err != nil {
		t.Error(err)
	}
	if want := mbox.MBOXCL; got != want {
		t.Errorf("expected %d but got %d", want, got)
	}
}

// TestDetectCL2 checks that DetectType reports MBOXCL2 for a fixture that
// carries Content-Length headers and an unquoted 'From ' line inside a
// message's text.
func TestDetectCL2(t *testing.T) {
	fixture := `From someone
From: bubbles@bubbletown.com
To: mrmxpdstk@lazytown.com
Subject: To interpretation
Content-Length: 33
We should all try to enjoy life!
From someone-else
>From mug: weird header
Content-Length: 129
From: mrspam@corporate.corp.com
To: mrmxpdstk@lazytown.com
Subject: Bestest offer in the universe!!11!!
You won't believe these prices!
From 1 cent to 11 cents, we carry the least expensive
line of jets this side of the Gobi Desert!
From nobody
From: nobody@nowhere.man
To: mrmxpdstk@lazytown.com
Subject: Mysterious Jenkins
Content-Length: 0
`
	got, err := mbox.DetectType(strings.NewReader(fixture))
	if err != nil {
		t.Error(err)
	}
	if want := mbox.MBOXCL2; got != want {
		t.Errorf("expected %d but got %d", want, got)
	}
}

// TestDetectO checks that DetectType reports MBOXO for a fixture that has
// neither Content-Length headers nor '>From ' lines.
func TestDetectO(t *testing.T) {
	fixture := `From someone
From: bubbles@bubbletown.com
To: mrmxpdstk@lazytown.com
Subject: To interpretation
From all of us, to all of you, be happy!
From someone-else
From: mrspam@corporate.corp.com
To: mrmxpdstk@lazytown.com
Subject: Bestest offer in the universe!!11!!
You won't believe these prices!
From 1 cent to 11 cents, we carry the least expensive
line of jets this side of the Gobi Desert!
`
	got, err := mbox.DetectType(strings.NewReader(fixture))
	if err != nil {
		t.Error(err)
	}
	if want := mbox.MBOXO; got != want {
		t.Errorf("expected %d but got %d", want, got)
	}
}

// TestDetectAltLinefeed checks that DetectType reports MBOXRD for a fixture
// whose lines end in CRLF rather than a bare LF.
func TestDetectAltLinefeed(t *testing.T) {
	// One literal per line of the mailbox; the concatenation is a single
	// CRLF-terminated text.
	fixture := "From someone\r\n" +
		"From: chuckles@funbunny.org\r\n" +
		"To: hhefner@playboy.com\r\n" +
		"Subject: Closed Captioning\r\n" +
		"\r\n" +
		">From the engineers: do we really need closed captioning on this material?\r\n" +
		"Please let me know, it'd save a lot of money if we could avoid it.\r\n" +
		"\r\n" +
		"From hhefner\r\n" +
		"From: hhefner@playboy.com\r\n" +
		"To: chuckles@funbunny.org\r\n" +
		"Subject: RE: Closed Captioning\r\n" +
		"\r\n" +
		"Yes, silly as it sounds, we broadcast this material, and it must therefore have\r\n" +
		"closed captioning. The deaf will enjoy reading the material.\r\n"

	got, err := mbox.DetectType(strings.NewReader(fixture))
	if err != nil {
		t.Error(err)
	}
	if want := mbox.MBOXRD; got != want {
		t.Errorf("expected %d but got %d", want, got)
	}
}
50 changes: 50 additions & 0 deletions example_mboxdetecttype_test.go
@@ -0,0 +1,50 @@
package mbox_test

import (
"bytes"
"fmt"

"github.com/tvanriper/mbox"
)

// mboxrd2 is the sample mailbox text that ExampleDetectType passes to
// mbox.DetectType. Imagine this is a file on your filesystem instead of a
// constant in your code.
const mboxrd2 string = `From bubbles@bubbletown.com Mon Jul 04 14:23:45 2022
From: bubbles@bubbletown.com
To: mrmxpdstk@lazytown.com
Subject: To interpretation
>From all of us, to all of you, be happy!
From mrspam@corporate.corp.com Mon Jul 04 15:02:15 2022
From: mrspam@corporate.corp.com
To: mrmxpdstk@lazytown.com
Subject: Bestest offer in the universe!!11!!
You won't believe these prices!
>From 1 cent to 11 cents, we carry the least expensive
line of jets this side of the Gobi Desert!
`

// ExampleDetectType runs mbox.DetectType over the mboxrd2 sample and prints
// the name of the detected format.
func ExampleDetectType() {
	// Imagine you used os.Open instead of bytes.NewReader here.
	src := bytes.NewReader([]byte(mboxrd2))

	detected, err := mbox.DetectType(src)
	if err != nil {
		fmt.Printf("failed to detect mbox type: %s\n", err)
		return
	}

	// Translate the detected constant into a printable name.
	var name string
	switch detected {
	case mbox.MBOXO:
		name = "MBOXO"
	case mbox.MBOXRD:
		name = "MBOXRD"
	case mbox.MBOXCL:
		name = "MBOXCL"
	case mbox.MBOXCL2:
		name = "MBOXCL2"
	default:
		name = "Unknown"
	}
	fmt.Println(name)

	// Output:
	// MBOXRD
}

0 comments on commit 8e5cc68

Please sign in to comment.