-
Notifications
You must be signed in to change notification settings - Fork 368
/
Copy pathWeb Crawler Multithreaded.java
108 lines (95 loc) · 2.89 KB
/
Web Crawler Multithreaded.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
/**
* // This is the HtmlParser's API interface.
* // You should not implement it, or speculate about its implementation
* interface HtmlParser {
* public List<String> getUrls(String url) {}
* }
*/
import java.net.URI;
import java.net.URISyntaxException;
/**
 * Entry point of the multithreaded web crawler.
 */
class Solution {
    /**
     * Crawls the pages reachable from {@code startUrl} whose hostname matches the
     * hostname of {@code startUrl}.
     *
     * <p>The work is delegated to a root {@link CrawlTask} thread; this method blocks
     * until that thread terminates and then returns whatever was recorded.
     *
     * @param startUrl   URL the crawl starts from; its hostname bounds the crawl
     * @param htmlParser parser used to list the links contained in a page
     * @return the recorded URLs, in no particular order (they are kept in a hash set);
     *         possibly incomplete if the calling thread was interrupted while waiting
     */
    public List<String> crawl(String startUrl, HtmlParser htmlParser) {
        ResultRecord resultRecord = new ResultRecord();
        CrawlTask rootTask = new CrawlTask(
                startUrl, htmlParser, resultRecord, UrlUtil.parseHostname(startUrl));
        rootTask.start();
        try {
            rootTask.join();
        } catch (InterruptedException e) {
            // Do not swallow the interruption: restore the flag so the caller can
            // observe it. The crawl may still be running, so the list returned
            // below can be partial in this case.
            Thread.currentThread().interrupt();
        }
        return resultRecord.getResultList();
    }
}
/**
 * A thread that crawls a single URL and spawns one child {@code CrawlTask} per link
 * found on that page, waiting for all children before it terminates.
 *
 * <p>Note: one thread is created per discovered link, including links that are later
 * rejected (foreign host or already visited).
 */
class CrawlTask extends Thread {
    private final String url;
    private final HtmlParser htmlParser;
    private final ResultRecord resultRecord;
    private final String parentHost;

    /**
     * @param url          the URL this task is responsible for
     * @param htmlParser   parser used to list the links contained in {@code url}
     * @param resultRecord shared record of the URLs visited so far
     * @param parentHost   hostname a URL must have to be crawled
     */
    public CrawlTask(String url,
                     HtmlParser htmlParser,
                     ResultRecord resultRecord,
                     String parentHost) {
        this.url = url;
        this.htmlParser = htmlParser;
        this.resultRecord = resultRecord;
        this.parentHost = parentHost;
    }

    /**
     * Records {@code url} if it belongs to {@code parentHost} and has not been seen
     * yet, then crawls its links in parallel and waits for those child tasks.
     */
    @Override
    public void run() {
        String hostname = UrlUtil.parseHostname(url);
        // parseHostname yields null for links that are malformed or carry no host
        // (e.g. relative links); treat those as foreign instead of dying with a
        // NullPointerException on hostname.equals(...).
        if (hostname == null || !hostname.equals(parentHost)) {
            return;
        }
        // Only the task that actually inserts the URL goes on to expand it, so each
        // page is fetched at most once.
        if (!resultRecord.addIfNotExists(url)) {
            return;
        }

        List<String> childUrls = htmlParser.getUrls(url);
        List<CrawlTask> children = new ArrayList<>();
        for (String childUrl : childUrls) {
            children.add(new CrawlTask(childUrl, htmlParser, resultRecord, parentHost));
        }

        // Start every child before joining any of them so they run concurrently.
        for (CrawlTask child : children) {
            child.start();
        }
        try {
            for (CrawlTask child : children) {
                child.join();
            }
        } catch (InterruptedException e) {
            // Stop waiting, but keep the interrupt status visible to whoever
            // inspects this thread instead of silently swallowing it.
            Thread.currentThread().interrupt();
        }
    }
}
/**
 * URL helper functions used by the crawler.
 */
class UrlUtil {
    /**
     * Extracts the host component of {@code url}.
     *
     * @param url the URL to inspect; may be {@code null}
     * @return the host as reported by {@link URI#getHost()}, or {@code null} when
     *         {@code url} is {@code null}, is not a syntactically valid URI, or has
     *         no host component (for example a relative link)
     */
    public static String parseHostname(String url) {
        if (url == null) {
            // new URI(null) would throw a NullPointerException; report "no host"
            // the same way as for any other unusable input.
            return null;
        }
        try {
            return new URI(url).getHost();
        } catch (URISyntaxException ignored) {
            // A malformed link is not an error for the crawler: the null result
            // already tells the caller that no host could be determined.
            return null;
        }
    }
}
/**
 * Set of visited URLs shared between crawl threads. All access to the underlying
 * set is serialized through a single-permit semaphore used as a mutex.
 */
class ResultRecord {
    private final Set<String> urls;
    private final Semaphore mutex;

    /** Creates an empty record. */
    public ResultRecord() {
        this.urls = new HashSet<>();
        this.mutex = new Semaphore(1);
    }

    /**
     * Adds {@code url} to the record unless it is already present.
     *
     * @param url the URL to record
     * @return {@code true} if the URL was newly added; {@code false} if it was
     *         already present, or if the calling thread was interrupted before the
     *         mutex could be acquired (the interrupt status is restored in that case)
     */
    public boolean addIfNotExists(String url) {
        try {
            mutex.acquire();
        } catch (InterruptedException e) {
            // acquire() clears the interrupt status when it throws; put it back so
            // the caller can still detect the interruption.
            Thread.currentThread().interrupt();
            return false;
        }
        try {
            return urls.add(url);
        } finally {
            // Always hand the permit back, even if add() throws, so other threads
            // are never blocked forever.
            mutex.release();
        }
    }

    /**
     * Returns a snapshot of the recorded URLs.
     *
     * @return a new list holding the URLs recorded so far, in no particular order
     */
    public List<String> getResultList() {
        // Copy under the mutex: writers may still be running when this is called,
        // and HashSet must not be read while another thread modifies it.
        mutex.acquireUninterruptibly();
        try {
            return new ArrayList<>(urls);
        } finally {
            mutex.release();
        }
    }
}