
added a crawl filter based on <div> tag class names

When a crawl is started, a new field for excluding content from scraping is
available. Content to exclude is identified by the class names of <div> tags:
all text contained in a <div> whose class attribute matches one of the
configured class name(s) is not indexed, while the rest of the page is
indexed as usual.
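
A minimal sketch of the matching idea (illustrative only, not the actual YaCy implementation; the class and method names below are made up): a div's class attribute may hold several space-separated names, and the div's text is skipped when any of them is contained in the configured ignore set.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

// Hypothetical sketch, not YaCy code: decide whether the text inside a <div>
// should be excluded from indexing, given the configured set of class names.
public class DivClassFilterSketch {

    // a class attribute may contain several space-separated names;
    // the div is filtered out if any of them is in the ignore set
    static boolean ignore(String classAttribute, Set<String> ignoreClassNames) {
        if (classAttribute == null) return false;
        for (String name : classAttribute.trim().split("\\s+")) {
            if (ignoreClassNames.contains(name)) return true;
        }
        return false;
    }

    public static void main(String[] args) {
        Set<String> ignoreSet = new HashSet<>(Arrays.asList("sidebar", "footer"));
        System.out.println(ignore("content sidebar", ignoreSet)); // true  -> text is not indexed
        System.out.println(ignore("article", ignoreSet));         // false -> text is indexed
    }
}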
Orbiter committed Dec 9, 2017
1 parent 607b39b commit 25573bd5abf52c3bd4f483e83f47739f1ddb39ae
@@ -366,6 +366,18 @@ <h2>Expert Crawl Start</h2>
</dd>
</dl>
</fieldset>
<fieldset>
<legend>Content Filter</legend>
<p>These filters limit which parts of a document are indexed. They are applied after a web page has been loaded.</p>
<dl>
<dt>Filter div class names</dt>
<dd>
<table border="0">
<tr><td width="110">set of class names</td><td><input name="ignoreclassname" id="ignoreclassname" type="text" size="55" maxlength="100000" value="#[ignoreclassname]#" /></td><td>comma-separated list of div class names; the content of matching div tags is excluded from indexing</td></tr>
</table>
</dd>
</dl>
</fieldset>
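<!-- illustrative example: entering "sidebar,footer" excludes the text of <div class="sidebar"> and <div class="footer"> elements from indexing -->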
<fieldset>
<legend>Clean-Up before Crawl Start</legend>
<dl>
@@ -513,6 +513,14 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
}
prop.put("agentSelect_defaultAgentName", ClientIdentification.yacyInternetCrawlerAgentName);
// ---------- Ignore Class Name
if (post != null && post.containsKey("ignoreclassname")) {
prop.put("ignoreclassname",
post.get("ignoreclassname", ""));
} else {
prop.put("ignoreclassname", "");
}
// ---------- Enrich Vocabulary
Collection<Tagging> vocs = LibraryProvider.autotagging.getVocabularies();
if (vocs.size() == 0) {
@@ -468,6 +468,15 @@ public static serverObjects respond(final RequestHeader header, final serverObje
boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
String snapshotsMustnotmatch = post.get("snapshotsMustnotmatch", "");
String ignoreclassname_s = post.get("ignoreclassname");
Set<String> ignoreclassname = new HashSet<>();
if (ignoreclassname_s != null) {
    for (String s : ignoreclassname_s.split(",")) {
        String classname = s.trim();
        if (!classname.isEmpty()) ignoreclassname.add(classname); // skip empty entries from trailing or duplicate commas
    }
}
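// e.g. the input "sidebar, footer" yields the set {"sidebar", "footer"}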
// get vocabulary scraper info
JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context
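// illustrative example of the resulting structure (names made up): {"myVocabulary": {"class": "keywordInContext"}}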
for (String key: post.keySet()) {
@@ -552,6 +561,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
cachePolicy,
collection,
agentName,
ignoreclassname,
new VocabularyScraper(vocabulary_scraper),
timezoneOffset);
handle = ASCII.getBytes(profile.handle());
@@ -646,7 +656,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
/* No restriction on domains or subpath : we scrape now links and asynchronously push them to the crawlStacker */
final String crawlingFileContent = post.get("crawlingFile$file", "");
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000,
- new VocabularyScraper(), profile.timezoneOffset());
+ new HashSet<String>(), new VocabularyScraper(), profile.timezoneOffset());
FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile,
sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
sb.crawler.putActive(handle, profile);
@@ -784,7 +794,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException {
List<AnchorURL> hyperlinks_from_file;
// check if the crawl filter works correctly
- final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
+ final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet<String>(), new VocabularyScraper(), timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
/* Let's report here detailed error to help user when he selected a wrong file */
@@ -159,7 +159,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
CacheStrategy.IFFRESH,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
- null,
+ null, null,
timezoneOffset);
sb.crawler.putActive(pe.handle().getBytes(), pe);
} catch (final Exception e) {
@@ -297,7 +297,7 @@ private void initActiveCrawlProfiles() {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_DEEP,
ClientIdentification.yacyInternetCrawlerAgentName,
- null,
+ null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultAutocrawlDeepProfile.handle()),
@@ -330,7 +330,7 @@ private void initActiveCrawlProfiles() {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
ClientIdentification.yacyInternetCrawlerAgentName,
- null,
+ null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultAutocrawlShallowProfile.handle()),
@@ -362,7 +362,7 @@ private void initActiveCrawlProfiles() {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName,
- null,
+ null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultProxyProfile.handle()),
@@ -394,7 +394,7 @@ private void initActiveCrawlProfiles() {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName,
- null,
+ null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultRemoteProfile.handle()),
@@ -426,7 +426,7 @@ private void initActiveCrawlProfiles() {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
- null,
+ null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()),
@@ -458,7 +458,7 @@ private void initActiveCrawlProfiles() {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
- null,
+ null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
@@ -491,7 +491,7 @@ private void initActiveCrawlProfiles() {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName,
- null,
+ null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
@@ -523,7 +523,7 @@ private void initActiveCrawlProfiles() {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
- null,
+ null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()),
@@ -555,7 +555,7 @@ private void initActiveCrawlProfiles() {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
- null,
+ null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()),
@@ -587,7 +587,7 @@ private void initActiveCrawlProfiles() {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName,
- null,
+ null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultSurrogateProfile.handle()),
@@ -622,7 +622,7 @@ public CrawlProfile getPushCrawlProfile(String collection) {
CacheStrategy.NOCACHE,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
- null,
+ null, null,
0);
this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile);
this.defaultPushProfiles.put(collection, genericPushProfile);
@@ -28,10 +28,12 @@
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
@@ -44,6 +46,8 @@
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.JSONArray;
import net.yacy.cora.util.JSONTokener;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.data.word.Word;
@@ -96,6 +100,7 @@
INDEX_TEXT ("indexText", false, CrawlAttribute.BOOLEAN, "Index Text"),
INDEX_MEDIA ("indexMedia", false, CrawlAttribute.BOOLEAN, "Index Media"),
COLLECTIONS ("collections", false, CrawlAttribute.STRING, "Collections (comma-separated list)"),
IGNORE_DIV_CLASS_NAME ("ignore_class_name", false, CrawlAttribute.STRING, "Ignore DIV Class names"),
SCRAPER ("scraper", false, CrawlAttribute.STRING, "Declaration for Vocabulary Scraper"),
TIMEZONEOFFSET ("timezoneOffset", true, CrawlAttribute.INTEGER, "Time Zone of Crawl Start Agent");
@@ -128,6 +133,7 @@ public String toString() {
private Pattern snapshotsMustnotmatch = null;
private final Map<String, AtomicInteger> doms;
private final Set<String> ignore_class_name;
private final VocabularyScraper scraper;
/**
@@ -190,6 +196,7 @@ public CrawlProfile(
final CacheStrategy cacheStrategy,
final String collections,
final String userAgentName,
final Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset) {
super(40);
@@ -230,9 +237,12 @@ public CrawlProfile(
put(CrawlAttribute.SNAPSHOTS_MUSTNOTMATCH.key, snapshotsMustnotmatch);
put(CrawlAttribute.CACHE_STRAGEGY.key, cacheStrategy.toString());
put(CrawlAttribute.COLLECTIONS.key, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
- // we transform the scraper information into a JSON Array
+ // we transform the ignore_class_name and scraper information into a JSON Array
+ this.ignore_class_name = ignore_class_name == null ? new HashSet<String>() : ignore_class_name;
+ String jsonString = new JSONArray(this.ignore_class_name).toString();
+ put(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key, jsonString);
this.scraper = scraper == null ? new VocabularyScraper() : scraper;
- String jsonString = this.scraper.toString();
+ jsonString = this.scraper.toString();
assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
put(CrawlAttribute.SCRAPER.key, jsonString);
put(CrawlAttribute.TIMEZONEOFFSET.key, timezoneOffset);
@@ -246,10 +256,18 @@ public CrawlProfile(final Map<String, String> ext) {
super(ext == null ? 1 : ext.size());
if (ext != null) putAll(ext);
this.doms = new ConcurrentHashMap<String, AtomicInteger>();
- String jsonString = ext.get(CrawlAttribute.SCRAPER.key);
+ String jsonString = ext.get(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key);
+ JSONArray a = jsonString == null ? new JSONArray() : new JSONArray(new JSONTokener(jsonString));
+ this.ignore_class_name = new HashSet<String>();
+ for (int i = 0; i < a.length(); i++) this.ignore_class_name.add(a.getString(i));
+ jsonString = ext.get(CrawlAttribute.SCRAPER.key);
this.scraper = jsonString == null || jsonString.length() == 0 ? new VocabularyScraper() : new VocabularyScraper(jsonString);
}
public Set<String> ignoreDivClassName() {
return this.ignore_class_name;
}
public VocabularyScraper scraper() {
return this.scraper;
}
@@ -798,4 +816,19 @@ public void putProfileEntry(
prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i);
}
public static void main(String[] args) {
// test to convert the key set from set to string and back
Set<String> a = new HashSet<>();
a.add("eins"); a.add("zwei"); a.add("drei");
JSONArray j = new JSONArray(a);
String s = j.toString();
System.out.println(s);
JSONTokener o = new JSONTokener(s);
j = new JSONArray(o);
System.out.println(j);
Set<String> h = new HashSet<String>();
for (int i = 0; i < j.length(); i++) h.add(j.getString(i));
System.out.println(h);
}
}
@@ -28,6 +28,7 @@
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import net.yacy.cora.document.analysis.Classification;
@@ -861,7 +862,7 @@ public EventOrigin processCase(final String mySeedHash) {
final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.getContentType());
if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
try {
- return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.getContentType(), this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
+ return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.getContentType(), this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(), new HashSet<String>(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
} catch(Parser.Failure e) {
throw e;
} catch (final Exception e) {
@@ -135,7 +135,7 @@ private static int importFromBookmarks(final BookmarksDB db, final DigestURL bas
final Set<String> tags = ListManager.string2set(tag); // this allows multiple default tags
try {
//load the links
- final ContentScraper scraper = new ContentScraper(baseURL, 10000, new VocabularyScraper(), 0);
+ final ContentScraper scraper = new ContentScraper(baseURL, 10000, new HashSet<String>(), new VocabularyScraper(), 0);
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(input,writer);
@@ -190,7 +190,7 @@ protected static String crawlStart(
CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
- null,
+ null, null,
0); // TODO: make this a default profile in CrawlSwitchboard
sb.crawler.putActive(pe.handle().getBytes(), pe);
return sb.crawlStacker.stackCrawl(new Request(