Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Joseph Edwards Van Riper III <vanriper.trey@gmail.com>
- Loading branch information
Showing
4 changed files
with
366 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
package mbox | ||
|
||
import ( | ||
"bufio" | ||
"fmt" | ||
"io" | ||
"regexp" | ||
"strconv" | ||
"strings" | ||
) | ||
|
||
// lineFeedType reports which line terminator appears first in reader.
//
// The reader is rewound to its start and then read in 1 KiB chunks until a
// terminator byte is seen. The result is true when the first terminator byte
// is a carriage return ('\r') and false when it is a bare line feed ('\n').
//
// An error is returned when seeking or reading fails, or when the whole
// input contains neither '\r' nor '\n'.
func lineFeedType(reader io.ReadSeeker) (bool, error) {
	if _, err := reader.Seek(0, io.SeekStart); err != nil {
		return false, err
	}
	buf := make([]byte, 1024)
	for {
		// Read must be called again on every pass; reading only once would
		// re-inspect the same chunk forever when it holds no terminator.
		count, err := reader.Read(buf)
		// Inspect the bytes before the error: a Read may return data
		// together with io.EOF.
		for _, c := range buf[:count] {
			switch c {
			case '\r':
				return true, nil
			case '\n':
				return false, nil
			}
		}
		if err == io.EOF {
			break
		}
		if err != nil {
			return false, err
		}
	}
	return false, fmt.Errorf("no carriage return or line feed")
}
|
||
// DetectType attempts to figure out the type of mbox the reader holds. This | ||
// is a best-effort attempt to determine the type of mbox file format based on | ||
// what it sees within the text. | ||
// | ||
// It tries to work out the type of file by: | ||
// - Looking for 'Content-Length' in a message's header | ||
// - Looking for '>From ' or '>>From ' (or any number of > character in front | ||
// of "From "') in the message's body. | ||
// - Using the length of the 'Content-Length', if present, to determine when | ||
// the body of the message is complete. | ||
// | ||
// With this information, it can guess if the mbox matches one of the file | ||
// types supported by this library with some degree of certainty. | ||
func DetectType(reader io.ReadSeeker) (mboxType int, err error) { | ||
rdMatch := regexp.MustCompile(`^>*>From `) | ||
clMatch := regexp.MustCompile(`^Content-Length:`) | ||
feedType, err := lineFeedType(reader) | ||
if err != nil { | ||
return -1, err | ||
} | ||
_, err = reader.Seek(0, io.SeekStart) | ||
if err != nil { | ||
return -1, err | ||
} | ||
inHeader := false | ||
scanner := bufio.NewScanner(reader) | ||
var hasRd bool = false | ||
var hasCL bool = false | ||
var count int64 = 0 | ||
var clLen int64 = 0 | ||
var finishedFirst bool = false | ||
for scanner.Scan() { | ||
// NOTE: | ||
// The man page for mbox indicates that one can tell different mailings | ||
// apart via lines that start with From followed by a space. It also | ||
// states that the message is RFC 822 encoded. An RFC 822 encoded text | ||
// message separates the header from the body via a 'null' line (CRLF) | ||
// Ergo, for our purposes, we'll scan over the file, assuming From_ | ||
// indicates the start of the header, and a null line indicates the | ||
// start of the body. | ||
|
||
// Trimming the space to ensure extranous characters won't figure into | ||
// this. | ||
line := strings.TrimSpace(scanner.Text()) | ||
if inHeader && len(line) == 0 { | ||
inHeader = false | ||
count = 0 | ||
finishedFirst = true | ||
continue | ||
} | ||
if !inHeader && !hasCL && strings.HasPrefix(line, "From ") { | ||
inHeader = true | ||
} | ||
if !inHeader && hasCL { | ||
count += int64(len(scanner.Text())) + 1 | ||
if feedType { | ||
count += 1 | ||
} | ||
} | ||
|
||
matchRd := rdMatch.MatchString(line) | ||
matchCL := clMatch.MatchString(line) | ||
|
||
if inHeader && matchCL { | ||
hasCL = true | ||
// We have a content-length. We need to parse it to determine the | ||
// length of the body. | ||
sp := strings.Split(line, ":") | ||
if len(sp) < 2 { | ||
// well, er, this is awkward... | ||
continue | ||
} | ||
intCL, err := strconv.ParseInt(strings.TrimSpace(sp[1]), 10, 64) | ||
if err != nil { | ||
// I guess that wasn't an int. | ||
continue | ||
} | ||
clLen = intCL | ||
} | ||
|
||
if !inHeader && matchRd { | ||
hasRd = true | ||
} | ||
|
||
if hasRd && hasCL { | ||
// We have enough evidence: | ||
// This has content length & >From_ in the body. | ||
return MBOXCL, nil | ||
} | ||
if hasCL && !inHeader && strings.HasPrefix(line, "From ") { | ||
// We have enough evidence: | ||
// This has content length, we're in the body, and the line starts | ||
// with 'From '. | ||
return MBOXCL2, nil | ||
} | ||
if !inHeader && clLen == count { | ||
count = 0 | ||
finishedFirst = true | ||
// This isn't technically true, but helps for our tests. | ||
inHeader = true | ||
} | ||
|
||
if finishedFirst && !hasCL && hasRd { | ||
// We have enough evidence: | ||
// This doesn't have content length, but does have lines in the | ||
// body starting with >First. | ||
return MBOXRD, nil | ||
} | ||
} | ||
if hasCL && !hasRd { | ||
// We don't really know. It could be MBOXCL2, or MBOXCL. We will err on the side of caution. | ||
return MBOXCL, nil | ||
} | ||
return MBOXO, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
package mbox_test | ||
|
||
import ( | ||
"strings" | ||
"testing" | ||
|
||
"github.com/tvanriper/mbox" | ||
) | ||
|
||
// TestDetectRD checks that mbox.DetectType reports mbox.MBOXRD for two inline
// mailboxes without Content-Length headers: one whose first message has a
// body line starting with ">From ", and one where only the second message
// does.
// NOTE(review): as shown here the fixtures have no blank line between each
// header and its body; confirm against the original file.
func TestDetectRD(t *testing.T) { | ||
mb := `From someone | ||
From: chuckles@funbunny.org | ||
To: hhefner@playboy.com | ||
Subject: Closed Captioning | ||
>From the engineers: do we really need closed captioning on this material? | ||
Please let me know, it'd save a lot of money if we could avoid it. | ||
From hhefner | ||
From: hhefner@playboy.com | ||
To: chuckles@funbunny.org | ||
Subject: RE: Closed Captioning | ||
Yes, silly as it sounds, we broadcast this material, and it must therefore have | ||
closed captioning. The deaf will enjoy reading the material. | ||
` | ||
mType, err := mbox.DetectType(strings.NewReader(mb)) | ||
if err != nil { | ||
t.Error(err) | ||
} | ||
if mType != mbox.MBOXRD { | ||
t.Errorf("expected %d but got %d", mbox.MBOXRD, mType) | ||
} | ||
|
||
// Ensure we also detect MBOXRD if the second mail has the character. | ||
mb = `From someone | ||
From: chuckles@funbunny.org | ||
To: hhefner@playboy.com | ||
Subject: Closed Captioning | ||
Do we really need closed captioning on this material? Please let me know, it'd | ||
save a lot of money if we could avoid it. | ||
From hhefner | ||
From: hhefner@playboy.com | ||
To: chuckles@funbunny.org | ||
Subject: RE: Closed Captioning | ||
>From my lawyers: yes, silly as it sounds, we broadcast this material, and it | ||
must therefore have closed captioning. The deaf will enjoy reading the | ||
material. | ||
` | ||
mType, err = mbox.DetectType(strings.NewReader(mb)) | ||
if err != nil { | ||
t.Error(err) | ||
} | ||
if mType != mbox.MBOXRD { | ||
t.Errorf("expected %d but got %d", mbox.MBOXRD, mType) | ||
} | ||
} | ||
|
||
// TestDetectCL checks that mbox.DetectType reports mbox.MBOXCL for an inline
// mailbox whose messages carry Content-Length headers and whose text also
// contains lines starting with ">From ".
// NOTE(review): as shown here the fixture has no blank line between each
// header and its body; confirm against the original file.
func TestDetectCL(t *testing.T) { | ||
mb := `From someone | ||
From: bubbles@bubbletown.com | ||
To: mrmxpdstk@lazytown.com | ||
Subject: To interpretation | ||
Content-Length: 33 | ||
We should all try to enjoy life! | ||
From someone-else | ||
>From mug: weird header | ||
Content-Length: 130 | ||
From: mrspam@corporate.corp.com | ||
To: mrmxpdstk@lazytown.com | ||
Subject: Bestest offer in the universe!!11!! | ||
You won't believe these prices! | ||
>From 1 cent to 11 cents, we carry the least expensive | ||
line of jets this side of the Gobi Desert! | ||
From nobody | ||
From: nobody@nowhere.man | ||
To: mrmxpdstk@lazytown.com | ||
Subject: Mysterious Jenkins | ||
Content-Length: 0 | ||
` | ||
mType, err := mbox.DetectType(strings.NewReader(mb)) | ||
if err != nil { | ||
t.Error(err) | ||
} | ||
if mType != mbox.MBOXCL { | ||
t.Errorf("expected %d but got %d", mbox.MBOXCL, mType) | ||
} | ||
} | ||
|
||
// TestDetectCL2 checks that mbox.DetectType reports mbox.MBOXCL2 for an
// inline mailbox whose messages carry Content-Length headers and whose second
// message body contains an unquoted line starting with "From ".
// NOTE(review): as shown here the fixture has no blank line between each
// header and its body; confirm against the original file.
func TestDetectCL2(t *testing.T) { | ||
mb := `From someone | ||
From: bubbles@bubbletown.com | ||
To: mrmxpdstk@lazytown.com | ||
Subject: To interpretation | ||
Content-Length: 33 | ||
We should all try to enjoy life! | ||
From someone-else | ||
>From mug: weird header | ||
Content-Length: 129 | ||
From: mrspam@corporate.corp.com | ||
To: mrmxpdstk@lazytown.com | ||
Subject: Bestest offer in the universe!!11!! | ||
You won't believe these prices! | ||
From 1 cent to 11 cents, we carry the least expensive | ||
line of jets this side of the Gobi Desert! | ||
From nobody | ||
From: nobody@nowhere.man | ||
To: mrmxpdstk@lazytown.com | ||
Subject: Mysterious Jenkins | ||
Content-Length: 0 | ||
` | ||
mType, err := mbox.DetectType(strings.NewReader(mb)) | ||
if err != nil { | ||
t.Error(err) | ||
} | ||
if mType != mbox.MBOXCL2 { | ||
t.Errorf("expected %d but got %d", mbox.MBOXCL2, mType) | ||
} | ||
} | ||
|
||
// TestDetectO checks that mbox.DetectType reports mbox.MBOXO for an inline
// mailbox that has no Content-Length headers and no ">From " quoted lines,
// only unquoted lines starting with "From ".
// NOTE(review): as shown here the fixture has no blank line between each
// header and its body; confirm against the original file.
func TestDetectO(t *testing.T) { | ||
mb := `From someone | ||
From: bubbles@bubbletown.com | ||
To: mrmxpdstk@lazytown.com | ||
Subject: To interpretation | ||
From all of us, to all of you, be happy! | ||
From someone-else | ||
From: mrspam@corporate.corp.com | ||
To: mrmxpdstk@lazytown.com | ||
Subject: Bestest offer in the universe!!11!! | ||
You won't believe these prices! | ||
From 1 cent to 11 cents, we carry the least expensive | ||
line of jets this side of the Gobi Desert! | ||
` | ||
mType, err := mbox.DetectType(strings.NewReader(mb)) | ||
if err != nil { | ||
t.Error(err) | ||
} | ||
if mType != mbox.MBOXO { | ||
t.Errorf("expected %d but got %d", mbox.MBOXO, mType) | ||
} | ||
} | ||
|
||
// TestDetectAltLinefeed checks that mbox.DetectType still reports
// mbox.MBOXRD when every line of the mailbox ends in "\r\n" rather than a
// bare "\n". The first message body contains a line starting with ">From ".
func TestDetectAltLinefeed(t *testing.T) { | ||
mb := "From someone\r\nFrom: chuckles@funbunny.org\r\nTo: hhefner@playboy.com\r\nSubject: Closed Captioning\r\n\r\n>From the engineers: do we really need closed captioning on this material?\r\nPlease let me know, it'd save a lot of money if we could avoid it.\r\n\r\nFrom hhefner\r\nFrom: hhefner@playboy.com\r\nTo: chuckles@funbunny.org\r\nSubject: RE: Closed Captioning\r\n\r\nYes, silly as it sounds, we broadcast this material, and it must therefore have\r\nclosed captioning. The deaf will enjoy reading the material.\r\n" | ||
mType, err := mbox.DetectType(strings.NewReader(mb)) | ||
if err != nil { | ||
t.Error(err) | ||
} | ||
if mType != mbox.MBOXRD { | ||
t.Errorf("expected %d but got %d", mbox.MBOXRD, mType) | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
package mbox_test | ||
|
||
import ( | ||
"bytes" | ||
"fmt" | ||
|
||
"github.com/tvanriper/mbox" | ||
) | ||
|
||
// mboxrd2 holds two sample messages whose bodies contain lines quoted as
// ">From "; ExampleDetectType passes it to mbox.DetectType.
// Imagine this is a file on your filesystem instead of a variable in your code. | ||
const mboxrd2 string = `From bubbles@bubbletown.com Mon Jul 04 14:23:45 2022 | ||
From: bubbles@bubbletown.com | ||
To: mrmxpdstk@lazytown.com | ||
Subject: To interpretation | ||
>From all of us, to all of you, be happy! | ||
From mrspam@corporate.corp.com Mon Jul 04 15:02:15 2022 | ||
From: mrspam@corporate.corp.com | ||
To: mrmxpdstk@lazytown.com | ||
Subject: Bestest offer in the universe!!11!! | ||
You won't believe these prices! | ||
>From 1 cent to 11 cents, we carry the least expensive | ||
line of jets this side of the Gobi Desert! | ||
` | ||
|
||
// ExampleDetectType runs mbox.DetectType over the in-memory mboxrd2 sample
// and prints the name of the detected format, or the error if detection
// fails.
func ExampleDetectType() { | ||
// Imagine you used os.Open instead of bytes.NewReader here. | ||
reader := bytes.NewReader([]byte(mboxrd2)) | ||
mbType, err := mbox.DetectType(reader) | ||
if err != nil { | ||
fmt.Printf("failed to detect mbox type: %s\n", err) | ||
return | ||
} | ||
// Translate the numeric type constant into a printable name.
switch mbType { | ||
case mbox.MBOXO: | ||
fmt.Println("MBOXO") | ||
case mbox.MBOXRD: | ||
fmt.Println("MBOXRD") | ||
case mbox.MBOXCL: | ||
fmt.Println("MBOXCL") | ||
case mbox.MBOXCL2: | ||
fmt.Println("MBOXCL2") | ||
default: | ||
fmt.Println("Unknown") | ||
} | ||
|
||
// Output: | ||
// MBOXRD | ||
} |