feat: add the shuffling of the URL list
wintermi committed Dec 9, 2023
1 parent 9dc2414 commit 5971827
Showing 2 changed files with 31 additions and 5 deletions.
20 changes: 20 additions & 0 deletions crawler.go
@@ -20,6 +20,7 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"math/rand"
 	"net/http"
 	"net/url"
 	"os"
@@ -75,6 +76,8 @@ func NewCrawler(elementSelector string, jqSelector string, waitTime int, paralle
 // Load all URLs from the first column of the provided CSV File
 func (c *Crawler) LoadUrlFile(name string, delimiter string) error {
 
+	logger.Info().Msgf("%s Loading URL List", indent)
+
 	// Check file exists
 	if _, err := os.Stat(name); err != nil {
 		return fmt.Errorf("[LoadUrlFile] File Does Not Exist: %w", err)
@@ -120,6 +123,8 @@ func (c *Crawler) LoadUrlFile(name string, delimiter string) error {
 // Deduplicate the list of URLs
 func (c *Crawler) DeduplicateURLs() error {
 
+	logger.Info().Msgf("%s Deduplicating URL List", indent)
+
 	// Define a hash map and deduped array list
 	bucket := make(map[string]bool)
 	var deduped []string
@@ -140,6 +145,21 @@ func (c *Crawler) DeduplicateURLs() error {
 
 //---------------------------------------------------------------------------------------
 
+// Shuffle the list of URLs
+func (c *Crawler) ShuffleURLs() error {
+
+	logger.Info().Msgf("%s Shuffling URL List", indent)
+
+	r := rand.New(rand.NewSource(time.Now().UnixNano()))
+	r.Shuffle(len(c.URLs), func(i, j int) {
+		c.URLs[i], c.URLs[j] = c.URLs[j], c.URLs[i]
+	})
+
+	return nil
+}
+
+//---------------------------------------------------------------------------------------
+
 // Populate the Collector Allowed Domains
 func (c *Crawler) SetAllowedDomains() error {
 
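The new ShuffleURLs method is an in-place Fisher-Yates shuffle over the loaded URL slice, driven by a math/rand source seeded from the wall clock. Below is a minimal standalone sketch of the same technique, using a hypothetical urls slice in place of the crawler's state:

package main

import (
	"fmt"
	"math/rand"
	"time"
)

func main() {
	// Hypothetical URL list, standing in for Crawler.URLs.
	urls := []string{
		"https://example.com/a",
		"https://example.com/b",
		"https://example.com/c",
	}

	// Seed a dedicated source so each run yields a different order,
	// then let rand.Shuffle perform the in-place Fisher-Yates swaps.
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	r.Shuffle(len(urls), func(i, j int) {
		urls[i], urls[j] = urls[j], urls[i]
	})

	fmt.Println(urls)
}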
16 changes: 11 additions & 5 deletions main.go
@@ -25,7 +25,7 @@ import (
 )
 
 var logger zerolog.Logger
-var applicationText = "%s 0.3.0%s"
+var applicationText = "%s 0.3.1%s"
 var copyrightText = "Copyright 2023-2024, Matthew Winter\n"
 var indent = "..."
 var doubleIndent = "......."
@@ -108,7 +108,7 @@ func main() {
 	// Load the URLs into memory ready for Colly to crawl & scrape the Linked Data
 	var crawler = NewCrawler(*elementSelector, *jqSelector, *waitTime, *parallelism)
 	if err := crawler.LoadUrlFile(*inputCsvFile, *fieldDelimiter); err != nil {
-		logger.Error().Err(err).Msg("Failed Loading Queries")
+		logger.Error().Err(err).Msg("Failed Loading URL List")
 		os.Exit(1)
 	}
 
@@ -118,21 +118,27 @@ func main() {
 		os.Exit(1)
 	}
 
+	// Shuffle the URL List, changing the order Colly scrapes them
+	if err := crawler.ShuffleURLs(); err != nil {
+		logger.Error().Err(err).Msg("Failed to Shuffle URL List")
+		os.Exit(1)
+	}
+
 	// Execute the Colly Collector
 	if err := crawler.ExecuteScrape(*scrapeXML); err != nil {
-		logger.Error().Err(err).Msg("Linked Data Scrape Failed")
+		logger.Error().Err(err).Msg("Scraping Linked Data Failed")
 		os.Exit(1)
 	}
 
 	// Write the Scraped Data out to a File
 	if err := crawler.WriteDataFile(*outputCsvFile, *fieldDelimiter); err != nil {
-		logger.Error().Err(err).Msg("Write Data File Failed")
+		logger.Error().Err(err).Msg("Writing Data File Failed")
 		os.Exit(1)
 	}
 
 	// Write the Failed Request URLs out to a File
 	if err := crawler.WriteErrorFile(*errorCsvFile, *fieldDelimiter); err != nil {
-		logger.Error().Err(err).Msg("Write Error File Failed")
+		logger.Error().Err(err).Msg("Writing Error File Failed")
 		os.Exit(1)
 	}
 
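One design note on the seeding, offered as a sketch rather than part of this commit: since Go 1.20 the package-level math/rand generator is seeded automatically and rand.Seed is deprecated, so on newer toolchains the method could use the global source directly. A variant reusing the commit's own identifiers:

// Sketch only, assuming Go 1.20+: the global math/rand generator is
// auto-seeded, so no explicit rand.NewSource(time.Now().UnixNano()) is needed.
func (c *Crawler) ShuffleURLs() error {

	logger.Info().Msgf("%s Shuffling URL List", indent)

	rand.Shuffle(len(c.URLs), func(i, j int) {
		c.URLs[i], c.URLs[j] = c.URLs[j], c.URLs[i]
	})

	return nil
}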
