Skip to content

Commit

Permalink
Remove duplicates url
Browse files Browse the repository at this point in the history
  • Loading branch information
waybackarchiver committed Feb 2, 2022
1 parent 36418a0 commit f3c31de
Show file tree
Hide file tree
Showing 12 changed files with 172 additions and 43 deletions.
21 changes: 20 additions & 1 deletion cmd/playback/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
import (
"context"
"fmt"
"net/url"
"os"

"github.com/spf13/cobra"
Expand Down Expand Up @@ -30,7 +31,13 @@ func handle(cmd *cobra.Command, args []string) {
os.Exit(0)
}

collects, _ := wayback.Playback(context.TODO(), args...)
urls, err := unmarshalArgs(args)
if err != nil {
cmd.Println(err)
os.Exit(1)
}

collects, _ := wayback.Playback(context.TODO(), urls...)
for _, collect := range collects {
fmt.Printf("[%s]\n", collect.Arc)
for orig, dest := range collect.Dst {
Expand All @@ -39,3 +46,15 @@ func handle(cmd *cobra.Command, args []string) {
fmt.Printf("\n")
}
}

// unmarshalArgs parses each command-line argument as a URL.
//
// Arguments rejected by url.Parse are left out of urls and reported through
// err. The error message names every rejected argument and wraps the most
// recent parse error, so errors.As / errors.Is can inspect the cause.
// Arguments that parsed successfully are still returned alongside a non-nil
// err.
func unmarshalArgs(args []string) (urls []*url.URL, err error) {
	for _, s := range args {
		uri, er := url.Parse(s)
		if er != nil {
			if err == nil {
				err = fmt.Errorf("unexpected url %q: %w", s, er)
			} else {
				// Keep earlier failures in the message while wrapping the
				// newest parse error.
				err = fmt.Errorf("%v; unexpected url %q: %w", err, s, er)
			}
			continue
		}
		urls = append(urls, uri)
	}
	return urls, err
}
24 changes: 22 additions & 2 deletions cmd/wayback/wayback.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package main
import (
"context"
"fmt"
"net/url"
"os"
"time"

"github.com/spf13/cobra"
Expand All @@ -20,7 +22,7 @@ func output(tit string, args map[string]string) {

func archive(cmd *cobra.Command, args []string) {
var bundles reduxer.Bundles
archiving := func(ctx context.Context, urls []string) error {
archiving := func(ctx context.Context, urls []*url.URL) error {
g, ctx := errgroup.WithContext(ctx)
cols, err := wayback.Wayback(ctx, &bundles, urls...)
if err != nil {
Expand All @@ -45,10 +47,28 @@ func archive(cmd *cobra.Command, args []string) {
return nil
}

urls, err := unmarshalArgs(args)
if err != nil {
cmd.Println(err)
os.Exit(1)
}

ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()

if err := archiving(ctx, args); err != nil {
if err := archiving(ctx, urls); err != nil {
cmd.PrintErrln(err)
}
}

// unmarshalArgs parses each command-line argument as a URL.
//
// Arguments rejected by url.Parse are left out of urls and reported through
// err. The error message names every rejected argument and wraps the most
// recent parse error, so errors.As / errors.Is can inspect the cause.
// Arguments that parsed successfully are still returned alongside a non-nil
// err.
func unmarshalArgs(args []string) (urls []*url.URL, err error) {
	for _, s := range args {
		uri, er := url.Parse(s)
		if er != nil {
			if err == nil {
				err = fmt.Errorf("unexpected url %q: %w", s, er)
			} else {
				// Keep earlier failures in the message while wrapping the
				// newest parse error.
				err = fmt.Errorf("%v; unexpected url %q: %w", err, s, er)
			}
			continue
		}
		urls = append(urls, uri)
	}
	return urls, err
}
15 changes: 5 additions & 10 deletions reduxer/reduxer.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ var ytdl, existYoutubeDL = exists("youtube-dl")
// Do executes screenshot, prints PDF and exports HTML of the given URLs.
// Returns a set of bundles containing screenshot data and file paths.
// nolint:gocyclo
func Do(ctx context.Context, urls ...string) (bundles Bundles, err error) {
func Do(ctx context.Context, urls ...*url.URL) (bundles Bundles, err error) {
bundles = make(Bundles)
if !config.Opts.EnabledReduxer() {
return bundles, errors.New("Specify directory to environment `WAYBACK_STORAGE_DIR` to enable reduxer")
Expand Down Expand Up @@ -177,7 +177,7 @@ func Do(ctx context.Context, urls ...string) (bundles Bundles, err error) {
}

// Capture returns screenshot.Screenshots of given URLs
func Capture(ctx context.Context, urls ...string) (shots []screenshot.Screenshots, err error) {
func Capture(ctx context.Context, urls ...*url.URL) (shots []screenshot.Screenshots, err error) {
opts := []screenshot.ScreenshotOption{
screenshot.ScaleFactor(1),
screenshot.PrintPDF(true), // print pdf
Expand All @@ -188,15 +188,10 @@ func Capture(ctx context.Context, urls ...string) (shots []screenshot.Screenshot
var mu sync.Mutex
var wg sync.WaitGroup
shots = make([]screenshot.Screenshots, 0, len(urls))
for _, uri := range urls {
for _, input := range urls {
wg.Add(1)
go func(uri string) {
go func(input *url.URL) {
defer wg.Done()
input, err := url.Parse(uri)
if err != nil {
logger.Error("parse url failed: %v", err)
return
}

var shot screenshot.Screenshots
if remote := remoteHeadless(config.Opts.ChromeRemoteAddr()); remote != nil {
Expand All @@ -223,7 +218,7 @@ func Capture(ctx context.Context, urls ...string) (shots []screenshot.Screenshot
mu.Lock()
shots = append(shots, shot)
mu.Unlock()
}(uri)
}(input)
}
wg.Wait()

Expand Down
8 changes: 6 additions & 2 deletions reduxer/reduxer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package reduxer // import "github.com/wabarc/wayback/reduxer"

import (
"context"
"net/url"
"os"
"os/exec"
"path/filepath"
Expand Down Expand Up @@ -35,8 +36,11 @@ func TestDo(t *testing.T) {
t.Fatalf("Parse environment variables or flags failed, error: %v", err)
}

urls := []string{"https://example.com/"}
res, err := Do(context.Background(), urls...)
inp, err := url.Parse("https://example.com/")
if err != nil {
t.Fatalf("Unexpected parse url: %v", err)
}
res, err := Do(context.Background(), inp)
if err != nil {
t.Fatalf("Unexpected execute do: %v", err)
}
Expand Down
3 changes: 2 additions & 1 deletion service/discord/discord.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ package discord // import "github.com/wabarc/wayback/service/discord"
import (
"context"
"encoding/base64"
"net/url"
"strconv"
"strings"

Expand Down Expand Up @@ -265,7 +266,7 @@ func (d *Discord) process(m *discord.MessageCreate) (err error) {
return nil
}

func (d *Discord) wayback(ctx context.Context, m *discord.MessageCreate, urls []string) error {
func (d *Discord) wayback(ctx context.Context, m *discord.MessageCreate, urls []*url.URL) error {
stage, err := d.edit(m, "Archiving...")
if err != nil {
logger.Error("send archiving message failed: %v", err)
Expand Down
3 changes: 2 additions & 1 deletion service/httpd/httpd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"github.com/wabarc/wayback/config"
"github.com/wabarc/wayback/pooling"
"github.com/wabarc/wayback/reduxer"
"github.com/wabarc/wayback/service"
)

func TestTransform(t *testing.T) {
Expand All @@ -30,7 +31,7 @@ func TestTransform(t *testing.T) {
}

text := "some text https://example.com"
urls := helper.MatchURL(text)
urls := service.MatchURL(text)
rbes := make(reduxer.Bundles)
col, _ := wayback.Wayback(context.TODO(), &rbes, urls...)
collector := transform(col)
Expand Down
3 changes: 2 additions & 1 deletion service/slack/slack.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package slack // import "github.com/wabarc/wayback/service/slack"

import (
"context"
"net/url"

"github.com/fatih/color"
"github.com/slack-go/slack"
Expand Down Expand Up @@ -280,7 +281,7 @@ func (s *Slack) process(ev *event) (err error) {
return nil
}

func (s *Slack) wayback(ctx context.Context, ev *event, urls []string) error {
func (s *Slack) wayback(ctx context.Context, ev *event, urls []*url.URL) error {
tstamp, err := s.edit(ev.Channel, ev.ThreadTimeStamp, "Archiving...")
if err != nil {
logger.Error("send archiving message failed: %v", err)
Expand Down
3 changes: 2 additions & 1 deletion service/telegram/telegram.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"context"
"encoding/base64"
"fmt"
"net/url"
"regexp"
"strconv"
"strings"
Expand Down Expand Up @@ -238,7 +239,7 @@ func (t *Telegram) process(message *telegram.Message) (err error) {
return nil
}

func (t *Telegram) wayback(ctx context.Context, message *telegram.Message, urls []string) error {
func (t *Telegram) wayback(ctx context.Context, message *telegram.Message, urls []*url.URL) error {
stage, err := t.bot.Edit(message, "Archiving...")
if err != nil {
logger.Error("send archiving message failed: %v", err)
Expand Down
13 changes: 9 additions & 4 deletions service/twitter/twitter.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package twitter // import "github.com/wabarc/wayback/service/twitter"

import (
"context"
"net/url"
"sync"
"time"

Expand Down Expand Up @@ -160,9 +161,13 @@ func (t *Twitter) process(event twitter.DirectMessageEvent) error {
}()

urls := service.MatchURL(text)
var realURLs []string
for _, url := range urls {
realURLs = append(realURLs, helper.RealURI(url))
var realURLs []*url.URL
for _, uri := range urls {
u, err := url.Parse(helper.RealURI(uri.String()))
if err != nil {
continue
}
realURLs = append(realURLs, u)
}
logger.Debug("real urls: %v", realURLs)

Expand All @@ -172,7 +177,7 @@ func (t *Twitter) process(event twitter.DirectMessageEvent) error {
}

var bundles reduxer.Bundles
cols, err := wayback.Wayback(context.TODO(), &bundles, urls...)
cols, err := wayback.Wayback(context.TODO(), &bundles, realURLs...)
if err != nil {
logger.Error("archives failure, ", err)
return err
Expand Down
33 changes: 30 additions & 3 deletions service/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,41 @@
package service // import "github.com/wabarc/wayback/service"

import (
"net/url"
"strings"

"github.com/wabarc/helper"
"github.com/wabarc/wayback/config"
)

// MatchURL returns a slice string contains URLs extracted from the given string.
func MatchURL(s string) []string {
func MatchURL(s string) (urls []*url.URL) {
var matches []string
if config.Opts.WaybackFallback() {
return helper.MatchURLFallback(s)
matches = helper.MatchURLFallback(s)
}
matches = helper.MatchURL(s)

for i := range matches {
u, _ := url.Parse(matches[i])
urls = append(urls, u)
}

return removeDuplicates(urls)
}

// removeDuplicates returns elements with repeated URLs dropped, keeping the
// first occurrence of each and preserving input order.
//
// Two URLs are treated as the same when their userinfo, host, path, query and
// fragment all match. The scheme is deliberately not part of the comparison,
// so http://example.com and https://example.com/ collapse into one entry.
// Hosts are compared case-insensitively and an empty path is equivalent to
// "/". Nil entries are skipped.
func removeDuplicates(elements []*url.URL) (urls []*url.URL) {
	// A struct key keeps component boundaries intact; plain concatenation
	// would make "/b" + "c" (path + query) collide with "/bc".
	type identity struct {
		user, host, path, query, fragment string
	}

	seen := make(map[identity]struct{}, len(elements))
	for _, u := range elements {
		if u == nil {
			continue
		}

		path := u.Path
		if path == "" {
			// Normalise the path itself so that the rule also holds when a
			// query or fragment follows.
			path = "/"
		}
		key := identity{
			user:     u.User.String(), // safe on a nil *Userinfo
			host:     strings.ToLower(u.Host),
			path:     path,
			query:    u.RawQuery,
			fragment: u.Fragment,
		}

		if _, dup := seen[key]; dup {
			continue
		}
		seen[key] = struct{}{}
		urls = append(urls, u)
	}
	return urls
}
64 changes: 64 additions & 0 deletions service/utils_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// Copyright 2022 Wayback Archiver. All rights reserved.
// Use of this source code is governed by the GNU GPL v3
// license that can be found in the LICENSE file.

package service // import "github.com/wabarc/wayback/service"

import (
"strconv"
"testing"

"github.com/wabarc/wayback/config"
)

// TestMatchURL checks how many URLs MatchURL yields for a handful of texts,
// including inputs whose URLs are expected to collapse into a single entry.
func TestMatchURL(t *testing.T) {
	// Options must be loaded before MatchURL consults config.Opts.
	opts, err := config.NewParser().ParseEnvironmentVariables()
	config.Opts = opts
	if err != nil {
		t.Fatalf("Parse environment variables or flags failed, error: %v", err)
	}

	t.Parallel()

	const (
		orgBare  = "http://example.org"
		comBare  = "http://example.com"
		comSlash = "https://example.com/"
		comPath  = "https://example.com/path"
	)

	cases := []struct {
		text string
		leng int
	}{
		{text: "", leng: 0},
		{text: "foo " + comBare, leng: 1},
		{text: comBare + " foo " + comSlash, leng: 1},
		{text: comSlash + " foo " + comPath, leng: 2},
		{text: orgBare + " foo " + comBare, leng: 2},
	}

	for idx := range cases {
		tc := cases[idx]
		t.Run(strconv.Itoa(idx), func(t *testing.T) {
			if n := len(MatchURL(tc.text)); n != tc.leng {
				t.Fatalf(`Unexpected extract URLs number from text got %d instead of %d`, n, tc.leng)
			}
		})
	}
}
Loading

0 comments on commit f3c31de

Please sign in to comment.