// unmarshalArgs parses every command-line argument as a URL.
//
// Arguments rejected by url.Parse are skipped and reported through err. The
// first parse failure is wrapped with %w so callers can reach the underlying
// *url.Error via errors.As; each later failure wraps the accumulated error and
// appends its own detail to the message. URLs that parsed successfully are
// returned in argument order even when err is non-nil.
func unmarshalArgs(args []string) (urls []*url.URL, err error) {
	for _, s := range args {
		uri, er := url.Parse(s)
		if er != nil {
			if err == nil {
				// Wrap the parse error itself, not the (still nil) result.
				err = fmt.Errorf("unexpect url: %s: %w", s, er)
			} else {
				// Only one %w is used so the code also builds on Go < 1.20.
				err = fmt.Errorf("%w; unexpect url: %s: %v", err, s, er)
			}
			continue
		}
		urls = append(urls, uri)
	}
	return urls, err
}
// unmarshalArgs converts the raw command-line arguments into parsed URLs.
//
// An argument that url.Parse rejects is left out of the result and recorded in
// err. The first failure is wrapped with %w, keeping the underlying *url.Error
// reachable through errors.As; any further failure wraps the error built so
// far and adds its own description. Successfully parsed URLs are still
// returned, in argument order, when err is non-nil.
func unmarshalArgs(args []string) (urls []*url.URL, err error) {
	for _, raw := range args {
		parsed, perr := url.Parse(raw)
		if perr == nil {
			urls = append(urls, parsed)
			continue
		}
		if err == nil {
			// Wrap the actual parse failure; the result error is nil here.
			err = fmt.Errorf("unexpect url: %s: %w", raw, perr)
			continue
		}
		// A single %w keeps this compatible with toolchains before Go 1.20.
		err = fmt.Errorf("%w; unexpect url: %s: %v", err, raw, perr)
	}
	return urls, err
}
for _, input := range urls { wg.Add(1) - go func(uri string) { + go func(input *url.URL) { defer wg.Done() - input, err := url.Parse(uri) - if err != nil { - logger.Error("parse url failed: %v", err) - return - } var shot screenshot.Screenshots if remote := remoteHeadless(config.Opts.ChromeRemoteAddr()); remote != nil { @@ -223,7 +218,7 @@ func Capture(ctx context.Context, urls ...string) (shots []screenshot.Screenshot mu.Lock() shots = append(shots, shot) mu.Unlock() - }(uri) + }(input) } wg.Wait() diff --git a/reduxer/reduxer_test.go b/reduxer/reduxer_test.go index f1f22091..88c01306 100644 --- a/reduxer/reduxer_test.go +++ b/reduxer/reduxer_test.go @@ -6,6 +6,7 @@ package reduxer // import "github.com/wabarc/wayback/reduxer" import ( "context" + "net/url" "os" "os/exec" "path/filepath" @@ -35,8 +36,11 @@ func TestDo(t *testing.T) { t.Fatalf("Parse environment variables or flags failed, error: %v", err) } - urls := []string{"https://example.com/"} - res, err := Do(context.Background(), urls...) 
+ inp, err := url.Parse("https://example.com/") + if err != nil { + t.Fatalf("Unexpected parse url: %v", err) + } + res, err := Do(context.Background(), inp) if err != nil { t.Fatalf("Unexpected execute do: %v", err) } diff --git a/service/discord/discord.go b/service/discord/discord.go index dd2d050d..36585ae5 100644 --- a/service/discord/discord.go +++ b/service/discord/discord.go @@ -7,6 +7,7 @@ package discord // import "github.com/wabarc/wayback/service/discord" import ( "context" "encoding/base64" + "net/url" "strconv" "strings" @@ -265,7 +266,7 @@ func (d *Discord) process(m *discord.MessageCreate) (err error) { return nil } -func (d *Discord) wayback(ctx context.Context, m *discord.MessageCreate, urls []string) error { +func (d *Discord) wayback(ctx context.Context, m *discord.MessageCreate, urls []*url.URL) error { stage, err := d.edit(m, "Archiving...") if err != nil { logger.Error("send archiving message failed: %v", err) diff --git a/service/httpd/httpd_test.go b/service/httpd/httpd_test.go index 4449aa1b..e3e3c841 100644 --- a/service/httpd/httpd_test.go +++ b/service/httpd/httpd_test.go @@ -18,6 +18,7 @@ import ( "github.com/wabarc/wayback/config" "github.com/wabarc/wayback/pooling" "github.com/wabarc/wayback/reduxer" + "github.com/wabarc/wayback/service" ) func TestTransform(t *testing.T) { @@ -30,7 +31,7 @@ func TestTransform(t *testing.T) { } text := "some text https://example.com" - urls := helper.MatchURL(text) + urls := service.MatchURL(text) rbes := make(reduxer.Bundles) col, _ := wayback.Wayback(context.TODO(), &rbes, urls...) 
collector := transform(col) diff --git a/service/slack/slack.go b/service/slack/slack.go index 3bcf4984..75cabee7 100644 --- a/service/slack/slack.go +++ b/service/slack/slack.go @@ -6,6 +6,7 @@ package slack // import "github.com/wabarc/wayback/service/slack" import ( "context" + "net/url" "github.com/fatih/color" "github.com/slack-go/slack" @@ -280,7 +281,7 @@ func (s *Slack) process(ev *event) (err error) { return nil } -func (s *Slack) wayback(ctx context.Context, ev *event, urls []string) error { +func (s *Slack) wayback(ctx context.Context, ev *event, urls []*url.URL) error { tstamp, err := s.edit(ev.Channel, ev.ThreadTimeStamp, "Archiving...") if err != nil { logger.Error("send archiving message failed: %v", err) diff --git a/service/telegram/telegram.go b/service/telegram/telegram.go index c39906d4..e98d67de 100644 --- a/service/telegram/telegram.go +++ b/service/telegram/telegram.go @@ -8,6 +8,7 @@ import ( "context" "encoding/base64" "fmt" + "net/url" "regexp" "strconv" "strings" @@ -238,7 +239,7 @@ func (t *Telegram) process(message *telegram.Message) (err error) { return nil } -func (t *Telegram) wayback(ctx context.Context, message *telegram.Message, urls []string) error { +func (t *Telegram) wayback(ctx context.Context, message *telegram.Message, urls []*url.URL) error { stage, err := t.bot.Edit(message, "Archiving...") if err != nil { logger.Error("send archiving message failed: %v", err) diff --git a/service/twitter/twitter.go b/service/twitter/twitter.go index 18b2521f..a29a0df1 100644 --- a/service/twitter/twitter.go +++ b/service/twitter/twitter.go @@ -6,6 +6,7 @@ package twitter // import "github.com/wabarc/wayback/service/twitter" import ( "context" + "net/url" "sync" "time" @@ -160,9 +161,13 @@ func (t *Twitter) process(event twitter.DirectMessageEvent) error { }() urls := service.MatchURL(text) - var realURLs []string - for _, url := range urls { - realURLs = append(realURLs, helper.RealURI(url)) + var realURLs []*url.URL + for _, uri := 
range urls { + u, err := url.Parse(helper.RealURI(uri.String())) + if err != nil { + continue + } + realURLs = append(realURLs, u) } logger.Debug("real urls: %v", realURLs) @@ -172,7 +177,7 @@ func (t *Twitter) process(event twitter.DirectMessageEvent) error { } var bundles reduxer.Bundles - cols, err := wayback.Wayback(context.TODO(), &bundles, urls...) + cols, err := wayback.Wayback(context.TODO(), &bundles, realURLs...) if err != nil { logger.Error("archives failure, ", err) return err diff --git a/service/utils.go b/service/utils.go index 36839d8b..a28f9eb3 100644 --- a/service/utils.go +++ b/service/utils.go @@ -5,14 +5,41 @@ package service // import "github.com/wabarc/wayback/service" import ( + "net/url" + "strings" + "github.com/wabarc/helper" "github.com/wabarc/wayback/config" ) // MatchURL returns a slice string contains URLs extracted from the given string. -func MatchURL(s string) []string { +func MatchURL(s string) (urls []*url.URL) { + var matches []string if config.Opts.WaybackFallback() { - return helper.MatchURLFallback(s) + matches = helper.MatchURLFallback(s) + } + matches = helper.MatchURL(s) + + for i := range matches { + u, _ := url.Parse(matches[i]) + urls = append(urls, u) + } + + return removeDuplicates(urls) +} + +func removeDuplicates(elements []*url.URL) (urls []*url.URL) { + encountered := map[string]bool{} + slash := "/" + for _, u := range elements { + key := u.User.String() + u.Host + u.Path + u.RawQuery + u.Fragment + if u.Path == "" && !strings.HasSuffix(key, slash) { + key += slash + } + if !encountered[key] { + encountered[key] = true + urls = append(urls, u) + } } - return helper.MatchURL(s) + return } diff --git a/service/utils_test.go b/service/utils_test.go new file mode 100644 index 00000000..c095a357 --- /dev/null +++ b/service/utils_test.go @@ -0,0 +1,64 @@ +// Copyright 2022 Wayback Archiver. All rights reserved. +// Use of this source code is governed by the GNU GPL v3 +// license that can be found in the LICENSE file. 
+ +package service // import "github.com/wabarc/wayback/service" + +import ( + "strconv" + "testing" + + "github.com/wabarc/wayback/config" +) + +func TestMatchURL(t *testing.T) { + parser := config.NewParser() + var err error + if config.Opts, err = parser.ParseEnvironmentVariables(); err != nil { + t.Fatalf("Parse environment variables or flags failed, error: %v", err) + } + + t.Parallel() + + var ( + u = "http://example.org" + x = "http://example.com" + y = "https://example.com/" + z = "https://example.com/path" + ) + + var tests = []struct { + text string + leng int + }{ + { + text: "", + leng: 0, + }, + { + text: "foo " + x, + leng: 1, + }, + { + text: x + " foo " + y, + leng: 1, + }, + { + text: y + " foo " + z, + leng: 2, + }, + { + text: u + " foo " + x, + leng: 2, + }, + } + + for i, test := range tests { + t.Run(strconv.Itoa(i), func(t *testing.T) { + got := len(MatchURL(test.text)) + if got != test.leng { + t.Fatalf(`Unexpected extract URLs number from text got %d instead of %d`, got, test.leng) + } + }) + } +} diff --git a/wayback.go b/wayback.go index 576933c9..c29fca82 100644 --- a/wayback.go +++ b/wayback.go @@ -171,7 +171,7 @@ func wayback(w Waybacker) string { } // Wayback returns URLs archived to the time capsules of given URLs. 
-func Wayback(ctx context.Context, bundles *reduxer.Bundles, urls ...string) (cols []Collect, err error) { +func Wayback(ctx context.Context, bundles *reduxer.Bundles, urls ...*url.URL) (cols []Collect, err error) { logger.Debug("start...") ctx, cancel := context.WithTimeout(ctx, config.Opts.WaybackTimeout()) @@ -184,21 +184,17 @@ func Wayback(ctx context.Context, bundles *reduxer.Bundles, urls ...string) (col mu := sync.Mutex{} g, ctx := errgroup.WithContext(ctx) - for _, uri := range urls { + for _, input := range urls { for slot, arc := range config.Opts.Slots() { if !arc { logger.Warn("skipped %s", config.SlotName(slot)) continue } - slot, uri := slot, uri + slot, input := slot, input g.Go(func() error { logger.Debug("archiving slot: %s", slot) - input, err := url.Parse(uri) - if err != nil { - logger.Error("parse uri failed: %v", err) - return err - } + uri := input.String() bundle := (*bundles)[uri] var col Collect switch slot { @@ -235,7 +231,7 @@ func Wayback(ctx context.Context, bundles *reduxer.Bundles, urls ...string) (col } // Playback returns URLs archived from the time capsules. 
-func Playback(ctx context.Context, urls ...string) (cols []Collect, err error) { +func Playback(ctx context.Context, urls ...*url.URL) (cols []Collect, err error) { logger.Debug("start...") ctx, cancel := context.WithTimeout(ctx, config.Opts.WaybackTimeout()) @@ -244,16 +240,11 @@ func Playback(ctx context.Context, urls ...string) (cols []Collect, err error) { mu := sync.Mutex{} g, ctx := errgroup.WithContext(ctx) var slots = []string{config.SLOT_IA, config.SLOT_IS, config.SLOT_IP, config.SLOT_PH, config.SLOT_TT, config.SLOT_GC} - for _, uri := range urls { + for _, input := range urls { for _, slot := range slots { - slot, uri := slot, uri + slot, input := slot, input g.Go(func() error { logger.Debug("searching slot: %s", slot) - input, err := url.Parse(uri) - if err != nil { - logger.Error("parse uri failed: %v", err) - return err - } var col Collect switch slot { case config.SLOT_IA: @@ -269,7 +260,7 @@ func Playback(ctx context.Context, urls ...string) (cols []Collect, err error) { case config.SLOT_GC: col.Dst = playback.Playback(ctx, playback.GC{URL: input}) } - col.Src = uri + col.Src = input.String() col.Arc = slot col.Ext = slot mu.Lock()