Update example URLs to use https instead of http.
These sites all redirect http requests to the corresponding https equivalent, so to make the tests and examples work better, update the URLs to request https in the first place:

http://www.ics.uci.edu
http://www.cnn.com
http://www.wikipedia.org
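
To illustrate the rationale above, here is a small sketch (not part of this commit; RedirectCheck is a hypothetical class using only the standard JDK HttpURLConnection API) showing the redirect in question: requesting the old http URL is answered with a redirect status whose Location header points at the https equivalent, so seeding the crawler with https avoids the extra round trip.

import java.net.HttpURLConnection;
import java.net.URL;

// Hypothetical illustration, not part of the repository.
public class RedirectCheck {
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://www.ics.uci.edu/");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setInstanceFollowRedirects(false);              // keep the redirect visible instead of following it
        int status = conn.getResponseCode();                 // typically 301 or 302 for these sites
        String location = conn.getHeaderField("Location");   // the https equivalent
        System.out.println(status + " -> " + location);
        conn.disconnect();
    }
}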
pgalbraith committed Sep 29, 2018
1 parent 8ee9172 commit 2ed59ba
Showing 10 changed files with 28 additions and 28 deletions.
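
To see the updated URLs in context, the following is a minimal sketch assembled from the hunks below; it is not part of the commit, only a self-contained view of the controller setup these examples share, with the seed already in its https form. The crawl storage folder and crawler count are illustrative, and MyCrawler refers to the WebCrawler subclass shown in the README hunk.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class Controller {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawler4j");       // illustrative: any writable folder

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        // Seed now requests https directly, as changed in this commit.
        controller.addSeed("https://www.ics.uci.edu/");

        int numberOfCrawlers = 7;                             // illustrative thread count
        controller.start(MyCrawler.class, numberOfCrawlers);  // MyCrawler: the crawler from the README hunk
    }
}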
README.md (5 additions, 5 deletions)
@@ -92,14 +92,14 @@ public class MyCrawler extends WebCrawler {
* the given url should be crawled or not (based on your crawling logic).
* In this example, we are instructing the crawler to ignore urls that
* have css, js, git, ... extensions and to only accept urls that start
* with "http://www.ics.uci.edu/". In this case, we didn't need the
* with "https://www.ics.uci.edu/". In this case, we didn't need the
* referringPage parameter to make the decision.
*/
@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
String href = url.getURL().toLowerCase();
return !FILTERS.matcher(href).matches()
&& href.startsWith("http://www.ics.uci.edu/");
&& href.startsWith("https://www.ics.uci.edu/");
}

/**
@@ -158,9 +158,9 @@ public class Controller {
* URLs that are fetched and then the crawler starts following links
* which are found in these pages
*/
controller.addSeed("http://www.ics.uci.edu/~lopes/");
controller.addSeed("http://www.ics.uci.edu/~welling/");
controller.addSeed("http://www.ics.uci.edu/");
controller.addSeed("https://www.ics.uci.edu/~lopes/");
controller.addSeed("https://www.ics.uci.edu/~welling/");
controller.addSeed("https://www.ics.uci.edu/");

/*
* Start the crawl. This is a blocking operation, meaning that your code
[next changed file]
@@ -111,9 +111,9 @@ public static void main(String[] args) throws Exception {
* URLs that are fetched and then the crawler starts following links
* which are found in these pages
*/
controller.addSeed("http://www.ics.uci.edu/");
controller.addSeed("http://www.ics.uci.edu/~lopes/");
controller.addSeed("http://www.ics.uci.edu/~welling/");
controller.addSeed("https://www.ics.uci.edu/");
controller.addSeed("https://www.ics.uci.edu/~lopes/");
controller.addSeed("https://www.ics.uci.edu/~welling/");

/*
* Start the crawl. This is a blocking operation, meaning that your code
[next changed file]
@@ -47,7 +47,7 @@ public boolean shouldVisit(Page referringPage, WebURL url) {
}

// Only accept the url if it is in the "www.ics.uci.edu" domain and protocol is "http".
return href.startsWith("http://www.ics.uci.edu/");
return href.startsWith("https://www.ics.uci.edu/");
}

/**
[next changed file]
@@ -53,7 +53,7 @@ public static void main(String[] args) throws Exception {
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

controller.addSeed("http://www.ics.uci.edu/");
controller.addSeed("https://www.ics.uci.edu/");
controller.start(LocalDataCollectorCrawler.class, numberOfCrawlers);

List<Object> crawlersLocalData = controller.getCrawlersLocalData();
[next changed file]
@@ -45,7 +45,7 @@ public LocalDataCollectorCrawler() {
@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
String href = url.getURL().toLowerCase();
- return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
+ return !FILTERS.matcher(href).matches() && href.startsWith("https://www.ics.uci.edu/");
}

@Override
[next changed file]
@@ -77,20 +77,20 @@ public static void main(String[] args) throws Exception {
CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer);
CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer);

String[] crawler1Domains = {"http://www.ics.uci.edu/", "http://www.cnn.com/"};
String[] crawler2Domains = {"http://en.wikipedia.org/"};
String[] crawler1Domains = {"https://www.ics.uci.edu/", "https://www.cnn.com/"};
String[] crawler2Domains = {"https://en.wikipedia.org/"};

controller1.setCustomData(crawler1Domains);
controller2.setCustomData(crawler2Domains);

controller1.addSeed("http://www.ics.uci.edu/");
controller1.addSeed("http://www.cnn.com/");
controller1.addSeed("http://www.ics.uci.edu/~lopes/");
controller1.addSeed("http://www.cnn.com/POLITICS/");
controller1.addSeed("https://www.ics.uci.edu/");
controller1.addSeed("https://www.cnn.com/");
controller1.addSeed("https://www.ics.uci.edu/~lopes/");
controller1.addSeed("https://www.cnn.com/POLITICS/");

controller2.addSeed("http://en.wikipedia.org/wiki/Main_Page");
controller2.addSeed("http://en.wikipedia.org/wiki/Obama");
controller2.addSeed("http://en.wikipedia.org/wiki/Bing");
controller2.addSeed("https://en.wikipedia.org/wiki/Main_Page");
controller2.addSeed("https://en.wikipedia.org/wiki/Obama");
controller2.addSeed("https://en.wikipedia.org/wiki/Bing");

/*
* The first crawler will have 5 concurrent threads and the second
[next changed file]
@@ -39,7 +39,7 @@ public class BasicCrawler extends WebCrawler {
".*(\\.(css|js|bmp|gif|jpe?g|png|tiff?|mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|m4v|pdf" +
"|rm|smil|wmv|swf|wma|zip|rar|gz))$");

private static final String DOMAIN = "http://www.ics.uci.edu/";
private static final String DOMAIN = "https://www.ics.uci.edu/";

@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
[next changed file]
@@ -75,9 +75,9 @@ public static void main(String[] args) throws Exception {
* URLs that are fetched and then the crawler starts following links
* which are found in these pages
*/
controller.addSeed("http://www.ics.uci.edu/~welling/");
controller.addSeed("http://www.ics.uci.edu/~lopes/");
controller.addSeed("http://www.ics.uci.edu/");
controller.addSeed("https://www.ics.uci.edu/~welling/");
controller.addSeed("https://www.ics.uci.edu/~lopes/");
controller.addSeed("https://www.ics.uci.edu/");

/*
* Start the crawl. This is a blocking operation, meaning that your code
[next changed file]
@@ -106,9 +106,9 @@ public static void main(String[] args) throws Exception {
* URLs that are fetched and then the crawler starts following links
* which are found in these pages
*/
controller.addSeed("http://www.ics.uci.edu/~welling/");
controller.addSeed("http://www.ics.uci.edu/~lopes/");
controller.addSeed("http://www.ics.uci.edu/");
controller.addSeed("https://www.ics.uci.edu/~welling/");
controller.addSeed("https://www.ics.uci.edu/~lopes/");
controller.addSeed("https://www.ics.uci.edu/");

/*
* Start the crawl. This is a blocking operation, meaning that your code
[next changed file]
@@ -45,7 +45,7 @@ public class StatusHandlerCrawler extends WebCrawler {
@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
String href = url.getURL().toLowerCase();
- return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
+ return !FILTERS.matcher(href).matches() && href.startsWith("https://www.ics.uci.edu/");
}

/**
