feat: add jq selector to allow you to reduce selected JSON objects
wintermi committed Dec 6, 2023
1 parent 2ac714a commit b8e5b5c
Showing 5 changed files with 80 additions and 14 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -21,6 +21,8 @@ ARGS:
Element Selector (Required)
-i string
CSV File containing URLs to Scrape (Required)
-j string
jq Selector
-o string
Output CSV File (Required)
-v Output Verbose Detail
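For reference, a hypothetical invocation using the new flag might look like this (the binary name and selector values are illustrative, not taken from the repository):

    scraper -i urls.csv -e "script[type='application/ld+json']" -j ".name" -o results.csv
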
73 changes: 63 additions & 10 deletions crawler.go
@@ -17,6 +17,8 @@ package main
import (
"bufio"
"encoding/csv"
"encoding/json"
"errors"
"fmt"
"net/url"
"os"
@@ -25,28 +27,31 @@ import (
"time"

"github.com/gocolly/colly"
"github.com/itchyny/gojq"
"github.com/weppos/publicsuffix-go/publicsuffix"
)

type Crawler struct {
Collector *colly.Collector
Selector string
URL []string
ScrapedData []string
Collector *colly.Collector
elementSelector string
jqSelector string
URL []string
ScrapedData []string
}

//---------------------------------------------------------------------------------------

// Return New Instance of a Crawler with an Embedded Colly Collector
func NewCrawler(selector string) *Crawler {
func NewCrawler(elementSelector string, jqSelector string) *Crawler {

// Initialise New Crawler
crawler := new(Crawler)
crawler.Collector = colly.NewCollector(
colly.UserAgent("Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/120.0"),
colly.MaxDepth(5),
)
crawler.Selector = selector
crawler.elementSelector = elementSelector
crawler.jqSelector = jqSelector

return crawler
}
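
A minimal sketch of the updated constructor in use, assuming a hypothetical pair of selectors (both values below are illustrative, not taken from the repository):

    // Scrape JSON-LD <script> blocks and reduce each matched object to its "name" field
    crawler := NewCrawler(`script[type="application/ld+json"]`, ".name")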
@@ -183,13 +188,22 @@ func (crawler *Crawler) ExecuteScrape(scrapeXML bool, waitTime int64) error {

// Scrape XML or HTML
if scrapeXML {
crawler.Collector.OnXML(crawler.Selector, func(element *colly.XMLElement) {
// Define the OnXML Selector Callback Function
crawler.Collector.OnXML(crawler.elementSelector, func(element *colly.XMLElement) {
crawler.ScrapedData = append(crawler.ScrapedData, element.Text)
})
} else {
// Define the Selector Callback Function
crawler.Collector.OnHTML(crawler.Selector, func(element *colly.HTMLElement) {
crawler.ScrapedData = append(crawler.ScrapedData, element.Text)
// Define the OnHTML Selector Callback Function
crawler.Collector.OnHTML(crawler.elementSelector, func(element *colly.HTMLElement) {

// Execute the jq Selector
textSelected, err := jqSelect(element.Text, crawler.jqSelector)
if err != nil {
logger.Error().Err(fmt.Errorf("jq Selector Failed: %w", err)).Msg(doubleIndent)
return
}

crawler.ScrapedData = append(crawler.ScrapedData, textSelected)
})
}

@@ -243,3 +257,42 @@ func (crawler *Crawler) WriteFile(name string, delimiter string) error {

return nil
}

//---------------------------------------------------------------------------------------

// Execute the jq Selector against the Element Text, Returning the First Value
func jqSelect(elementText string, query string) (string, error) {

// If the jq Selector query was not provided then return the element text unchanged
if query == "" {
return elementText, nil
}

// Convert the element text to a JSON Object before querying
var jsonData map[string]any
if err := json.Unmarshal([]byte(elementText), &jsonData); err != nil {
return "", fmt.Errorf("Selected Element Text is not a valid JSON Object: %w", err)
}

// Parse the provided jq selector text
jq, err := gojq.Parse(query)
if err != nil {
return "", fmt.Errorf("jq Selector Parse Failed: %w", err)
}

// Execute the jq Selector against the element text, returning only the first value
jqSelector := jq.Run(jsonData)
val, ok := jqSelector.Next()
if !ok {
return "", errors.New("jq Selector Failed to Find First Value")
}

// Check if the first value returned is actually an error
if err, ok := val.(error); ok {
return "", fmt.Errorf("jq Selector Run Failed: %w", err)
}

// Convert the first value returned to a raw JSON string and return
rawJSON, _ := json.Marshal(val)
return string(rawJSON), nil
}
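
A minimal sketch of jqSelect in use, assuming a hypothetical JSON payload (the element text, query, and printed output below are illustrative only):

    // Hypothetical element text, e.g. scraped from a JSON-LD <script> block
    elementText := `{"@type": "Product", "name": "Example Widget", "offers": {"price": "9.99"}}`

    // Reduce the JSON object to a single field with a jq query
    textSelected, err := jqSelect(elementText, ".offers.price")
    if err != nil {
        logger.Error().Err(err).Msg(doubleIndent)
    }

    // Prints "9.99" including the quotes, because the first value is re-marshalled to raw JSON
    fmt.Println(textSelected)

An empty query returns the element text unchanged, so behaviour is unaffected when the -j flag is not supplied.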
3 changes: 3 additions & 0 deletions go.mod
@@ -4,6 +4,7 @@ go 1.20

require (
github.com/gocolly/colly v1.2.0
github.com/itchyny/gojq v0.12.14
github.com/rs/zerolog v1.31.0
github.com/weppos/publicsuffix-go v0.30.1
)
@@ -17,10 +18,12 @@
github.com/gobwas/glob v0.2.3 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/itchyny/timefmt-go v0.1.5 // indirect
github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
github.com/stretchr/testify v1.8.4 // indirect
github.com/temoto/robotstxt v1.1.2 // indirect
golang.org/x/net v0.19.0 // indirect
golang.org/x/sys v0.15.0 // indirect
10 changes: 8 additions & 2 deletions go.sum
@@ -16,8 +16,8 @@ github.com/antchfx/xpath v1.2.5/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwq
github.com/bwesterb/go-ristretto v1.2.0/go.mod h1:fUIoIZaG73pV5biE2Blr2xEzDoMj7NFEuV9ekS419A0=
github.com/cloudflare/circl v1.1.0/go.mod h1:prBCrKB9DV4poKZY1l9zBXg2QJY7mvgRvtMxxK7fi4I=
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
@@ -37,6 +37,10 @@ github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-github/v50 v50.2.0/go.mod h1:VBY8FB6yPIjrtKhozXv4FQupxKLS6H4m6xFZlT43q8Q=
github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17icRSOU623lUBU=
github.com/itchyny/gojq v0.12.14 h1:6k8vVtsrhQSYgSGg827AD+PVVaB1NLXEdX+dda2oZCc=
github.com/itchyny/gojq v0.12.14/go.mod h1:y1G7oO7XkcR1LPZO59KyoCRy08T3j9vDYRV0GgYSS+s=
github.com/itchyny/timefmt-go v0.1.5 h1:G0INE2la8S6ru/ZI5JecgyzbbJNs5lG1RcBqa7Jm6GE=
github.com/itchyny/timefmt-go v0.1.5/go.mod h1:nEP7L+2YmAbT2kZ2HfSs1d8Xtw9LY8D2stDBckWakZ8=
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
@@ -54,8 +58,9 @@ github.com/rs/zerolog v1.31.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWR
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
github.com/weppos/publicsuffix-go v0.30.1 h1:8q+QwBS1MY56Zjfk/50ycu33NN8aa1iCCEQwo/71Oos=
@@ -138,3 +143,4 @@ google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqw
google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8=
google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
6 changes: 4 additions & 2 deletions main.go
@@ -25,7 +25,7 @@ import (
)

var logger zerolog.Logger
var applicationText = "%s 0.1.0%s"
var applicationText = "%s 0.2.0%s"
var copyrightText = "Copyright 2023-2024, Matthew Winter\n"
var indent = "..."
var doubleIndent = "......."
@@ -55,6 +55,7 @@ func main() {
// Define the Long CLI flag names
var inputCsvFile = flag.String("i", "", "CSV File containing URLs to Scrape (Required)")
var elementSelector = flag.String("e", "", "Element Selector (Required)")
var jqSelector = flag.String("j", "", "jq Selector")
var outputCsvFile = flag.String("o", "", "Output CSV File (Required)")
var fieldDelimiter = flag.String("d", ",", "Field Delimiter (Required)")
var waitTime = flag.Int64("w", 100, "Wait Time in Milliseconds between Colly Visits")
@@ -93,14 +94,15 @@
logger.Info().Msg("Arguments")
logger.Info().Str("CSV File containing URLs to Scrape", *inputCsvFile).Msg(indent)
logger.Info().Str("Element Selector", *elementSelector).Msg(indent)
logger.Info().Str("jq Selector", *jqSelector).Msg(indent)
logger.Info().Str("Output CSV File", *outputCsvFile).Msg(indent)
logger.Info().Str("Field Delimiter", *fieldDelimiter).Msg(indent)
logger.Info().Int64("Wait Time in Milliseconds between Colly Visits", *waitTime).Msg(indent)
logger.Info().Bool("Scrape XML not HTML", *scrapeXML).Msg(indent)
logger.Info().Msg("Begin")

// Load the URLs into memory ready for Colly to crawl & scrape the Linked Data
var crawler = NewCrawler(*elementSelector)
var crawler = NewCrawler(*elementSelector, *jqSelector)
if err := crawler.LoadUrlFile(*inputCsvFile, *fieldDelimiter); err != nil {
logger.Error().Err(err).Msg("Failed Loading Queries")
os.Exit(1)
