
Commit

- the webgraph shall store all links that appear on a web page, not just
the unique links! This made it necessary to adapt a large portion of the
parser and link-processing classes to carry a different type of link
collection, whose entries carry property attributes that are attached to
web anchors.
- introduction of a new URL class, AnchorURL
- the other URL classes, DigestURI and MultiProtocolURI, have been renamed
and refactored to fit into a new document package schema, document.id
- cleanup and refactoring of the net.yacy.cora.document package
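
For callers, the rename is mechanical. The following minimal before/after sketch is not from the commit itself; it uses only the constructor and method calls that appear in the diffs below (DigestURL(String), toNormalform(boolean), and the MalformedURLException the constructor throws):

import java.net.MalformedURLException;
import net.yacy.cora.document.id.DigestURL;

public class DigestURLMigrationSketch {
    public static void main(final String[] args) {
        // before this commit:
        //   import net.yacy.kelondro.data.meta.DigestURI;
        //   DigestURI url = new DigestURI("http://example.net/");
        try {
            // after: same constructor shape, new package and class name
            final DigestURL url = new DigestURL("http://example.net/");
            // normalized form, used the same way in CrawlCheck_p below
            System.out.println(url.toNormalform(true));
        } catch (final MalformedURLException e) {
            // the constructor still throws MalformedURLException, so the
            // existing try/catch blocks in the servlets survive the rename
            e.printStackTrace();
        }
    }
}

The same substitution applies to MultiProtocolURI → MultiProtocolURL, and the ASCII/UTF8 helpers move to net.yacy.cora.document.encoding, as the import changes below show.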
Orbiter committed Sep 14, 2013
1 parent 1a8c641 commit 5e31bad
Showing 313 changed files with 1,716 additions and 1,632 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -12,3 +12,4 @@ RELEASE/
 lib/yacy-cora.jar
 /DATA.bkp
 /DATA.1
+/gen
6 changes: 3 additions & 3 deletions htroot/BlacklistTest_p.java
@@ -31,8 +31,8 @@
 
 import java.net.MalformedURLException;
 
+import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.protocol.RequestHeader;
-import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.repository.Blacklist;
 import net.yacy.repository.Blacklist.BlacklistType;
 import net.yacy.search.Switchboard;
@@ -55,9 +55,9 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
 !urlstring.startsWith("ftp://") &&
 !urlstring.startsWith("smb://") &&
 !urlstring.startsWith("file://")) urlstring = "http://" + urlstring;
-DigestURI testurl = null;
+DigestURL testurl = null;
 try {
-testurl = new DigestURI(urlstring);
+testurl = new DigestURL(urlstring);
 } catch (final MalformedURLException e) {
 testurl = null;
 }
6 changes: 3 additions & 3 deletions htroot/Blacklist_p.java
@@ -35,12 +35,12 @@
 import java.util.Arrays;
 import java.util.List;
 
+import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.data.ListManager;
 import net.yacy.data.WorkTables;
-import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.repository.Blacklist;
 import net.yacy.repository.Blacklist.BlacklistType;
@@ -81,9 +81,9 @@ public static serverObjects respond(final RequestHeader header, final serverObje
 !urlstring.startsWith("file://")) {
 urlstring = "http://"+urlstring;
 }
-DigestURI testurl;
+DigestURL testurl;
 try {
-testurl = new DigestURI(urlstring);
+testurl = new DigestURL(urlstring);
 } catch (final MalformedURLException e) {
 testurl = null;
 }
2 changes: 1 addition & 1 deletion htroot/Blog.java
@@ -38,7 +38,7 @@
 import java.util.Locale;
 import java.util.Map;
 
-import net.yacy.cora.document.UTF8;
+import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
2 changes: 1 addition & 1 deletion htroot/BlogComments.java
@@ -35,7 +35,7 @@
 import java.util.Date;
 import java.util.Iterator;
 
-import net.yacy.cora.document.UTF8;
+import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
6 changes: 3 additions & 3 deletions htroot/Bookmarks.java
@@ -39,7 +39,8 @@
 import java.util.Set;
 
 import net.yacy.cora.date.ISO8601Formatter;
-import net.yacy.cora.document.ASCII;
+import net.yacy.cora.document.encoding.ASCII;
+import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.HeaderFramework;
@@ -53,7 +54,6 @@
 import net.yacy.data.BookmarksDB.Tag;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
-import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.peers.NewsPool;
 import net.yacy.search.Switchboard;
@@ -247,7 +247,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
 
 try {
 final File file = new File(post.get("htmlfile"));
-BookmarkHelper.importFromBookmarks(sb.bookmarksDB, new DigestURI(file), post.get("htmlfile$file"), tags, isPublic);
+BookmarkHelper.importFromBookmarks(sb.bookmarksDB, new DigestURL(file), post.get("htmlfile$file"), tags, isPublic);
 } catch (final MalformedURLException e) {}
 
 ConcurrentLog.info("BOOKMARKS", "success!!");
6 changes: 3 additions & 3 deletions htroot/CacheResource_p.java
@@ -24,13 +24,13 @@
 
 import java.net.MalformedURLException;
 
+import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.data.Cache;
 import net.yacy.document.ImageParser;
-import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.server.serverObjects;
 import net.yacy.server.serverSwitch;
 import net.yacy.server.servletProperties;
@@ -44,9 +44,9 @@ public static Object respond(final RequestHeader header, final serverObjects pos
 if (post == null) return prop;
 
 final String u = post.get("url", "");
-DigestURI url;
+DigestURL url;
 try {
-url = new DigestURI(u);
+url = new DigestURL(u);
 } catch (final MalformedURLException e) {
 ConcurrentLog.logException(e);
 return prop;
6 changes: 3 additions & 3 deletions htroot/Collage.java
@@ -24,7 +24,7 @@
 
 import java.util.Random;
 
-import net.yacy.cora.document.MultiProtocolURI;
+import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.ResultImages;
@@ -89,8 +89,8 @@ public static serverObjects respond(final RequestHeader header, final serverObje
 final int yOffset = embed ? 0 : 70;
 for (int i = 0; i < fifoSize; i++) {
 
-final MultiProtocolURI baseURL = origins[i].baseURL;
-final MultiProtocolURI imageURL = origins[i].imageEntry.url();
+final MultiProtocolURL baseURL = origins[i].baseURL;
+final MultiProtocolURL imageURL = origins[i].imageEntry.url();
 
 // check if this loads a page from localhost, which must be prevented to protect the server
 // against attacks to the administration interface when localhost access is granted
4 changes: 2 additions & 2 deletions htroot/ConfigAppearance_p.java
@@ -39,9 +39,9 @@
 import java.util.List;
 import java.util.Map;
 
+import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
-import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;
 import net.yacy.search.SwitchboardConstants;
@@ -101,7 +101,7 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
 
 final Iterator<String> it;
 try {
-final DigestURI u = new DigestURI(url);
+final DigestURL u = new DigestURL(url);
 it = FileUtils.strings(u.get(ClientIdentification.yacyInternetCrawlerAgent));
 } catch (final IOException e) {
 prop.put("status", "1");// unable to get URL
2 changes: 1 addition & 1 deletion htroot/ConfigHeuristics_p.java
@@ -239,7 +239,7 @@ private static void writeopensearchcfg(final Switchboard sb, final serverObjects
 
 // re-read config (and create/update work table)
 if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, true)) {
-OpenSearchConnector os = new OpenSearchConnector(sb, true);
+new OpenSearchConnector(sb, true);
 }
 }
 }
4 changes: 2 additions & 2 deletions htroot/ConfigLanguage_p.java
@@ -39,11 +39,11 @@
 import java.util.List;
 import java.util.Map;
 
+import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.data.Translator;
 import net.yacy.data.WorkTables;
-import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;
 import net.yacy.server.serverObjects;
@@ -100,7 +100,7 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
 final String url = post.get("url");
 Iterator<String> it;
 try {
-final DigestURI u = new DigestURI(url);
+final DigestURL u = new DigestURL(url);
 it = FileUtils.strings(u.get(ClientIdentification.yacyInternetCrawlerAgent));
 } catch(final IOException e) {
 prop.put("status", "1");//unable to get url
2 changes: 1 addition & 1 deletion htroot/ConfigNetwork_p.java
@@ -30,7 +30,7 @@
 import java.io.IOException;
 import java.util.Set;
 
-import net.yacy.cora.document.ASCII;
+import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.data.WorkTables;
 import net.yacy.kelondro.util.FileUtils;
5 changes: 3 additions & 2 deletions htroot/ConfigPortal.java
@@ -30,10 +30,11 @@
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.Properties;
+
+import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.data.WorkTables;
-import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.search.Switchboard;
 import net.yacy.search.SwitchboardConstants;
 import net.yacy.server.serverObjects;
@@ -98,7 +99,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
 
 String excludehosts = post.get("search.excludehosts", "");
 sb.setConfig("search.excludehosts", excludehosts);
-sb.setConfig("search.excludehosth", DigestURI.hosthashes(excludehosts));
+sb.setConfig("search.excludehosth", DigestURL.hosthashes(excludehosts));
 }
 if (post.containsKey("searchpage_default")) {
 // load defaults from defaults/yacy.init file
4 changes: 2 additions & 2 deletions htroot/ConfigUpdate_p.java
@@ -32,9 +32,9 @@
 import java.util.Set;
 import java.util.TreeSet;
 
+import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.OS;
 import net.yacy.peers.operation.yacyBuildProperties;
@@ -86,7 +86,7 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
 final String release = post.get("releasedownload", "");
 if (!release.isEmpty()) {
 try {
-yacyRelease versionToDownload = new yacyRelease(new DigestURI(release));
+yacyRelease versionToDownload = new yacyRelease(new DigestURL(release));
 
 // replace this version with version which contains public key
 final yacyRelease.DevAndMainVersions allReleases = yacyRelease.allReleases(false, false);
8 changes: 4 additions & 4 deletions htroot/CrawlCheck_p.java
@@ -24,14 +24,14 @@
 import java.util.Set;
 import java.util.regex.Pattern;
 
+import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.crawler.robots.RobotsTxtEntry;
-import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.repository.Blacklist.BlacklistType;
 import net.yacy.search.Switchboard;
 import net.yacy.server.serverObjects;
@@ -49,7 +49,7 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
 if (post.containsKey("crawlcheck")) {
 
 // get the list of rootURls for this crawl start
-Set<DigestURI> rootURLs = new HashSet<DigestURI>();
+Set<DigestURL> rootURLs = new HashSet<DigestURL>();
 String crawlingStart0 = post.get("crawlingURLs","").trim();
 String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
 for (String crawlingStart: rootURLs0) {
@@ -61,7 +61,7 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
 if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart;
 }
 try {
-DigestURI crawlingStartURL = new DigestURI(crawlingStart);
+DigestURL crawlingStartURL = new DigestURL(crawlingStart);
 rootURLs.add(crawlingStartURL);
 } catch (final MalformedURLException e) {
 ConcurrentLog.logException(e);
@@ -78,7 +78,7 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
 // and analyze the urls to make the table rows
 StringBuilder s = new StringBuilder(300);
 int row = 0;
-for (DigestURI u: rootURLs) {
+for (DigestURL u: rootURLs) {
 s.append(u.toNormalform(true)).append('\n');
 prop.put("table_list_" + row + "_url", u.toNormalform(true));
 
4 changes: 2 additions & 2 deletions htroot/CrawlResults.java
@@ -31,8 +31,8 @@
 import java.util.Locale;
 import java.util.Map;
 
-import net.yacy.cora.document.ASCII;
-import net.yacy.cora.document.UTF8;
+import net.yacy.cora.document.encoding.ASCII;
+import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.data.ResultURLs;
10 changes: 5 additions & 5 deletions htroot/CrawlStartScanner_p.java
@@ -31,6 +31,7 @@
 import java.util.TreeMap;
 import java.util.regex.Pattern;
 
+import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.protocol.Domains;
@@ -40,7 +41,6 @@
 import net.yacy.cora.sorting.ReversibleScoreMap;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.data.WorkTables;
-import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.search.Switchboard;
 import net.yacy.search.SwitchboardConstants;
 import net.yacy.search.query.SearchEventCache;
@@ -193,10 +193,10 @@ public static serverObjects respond(
 if ( post.containsKey("crawl") ) {
 // make a pk/url mapping
 final Iterator<Map.Entry<Scanner.Service, Scanner.Access>> se = Scanner.scancacheEntries();
-final Map<byte[], DigestURI> pkmap = new TreeMap<byte[], DigestURI>(Base64Order.enhancedCoder);
+final Map<byte[], DigestURL> pkmap = new TreeMap<byte[], DigestURL>(Base64Order.enhancedCoder);
 while (se.hasNext()) {
 final Scanner.Service u = se.next().getKey();
-DigestURI uu;
+DigestURL uu;
 try {
 uu = u.url();
 pkmap.put(uu.hash(), uu);
@@ -208,7 +208,7 @@
 for ( final Map.Entry<String, String> entry : post.entrySet() ) {
 if ( entry.getValue().startsWith("mark_") ) {
 final byte[] pk = entry.getValue().substring(5).getBytes();
-final DigestURI url = pkmap.get(pk);
+final DigestURL url = pkmap.get(pk);
 if ( url != null ) {
 String path = "/Crawler_p.html?createBookmark=off&xsstopw=off&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&xdstopw=off&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&cachePolicy=iffresh&indexText=on&crawlingMode=url&mustnotmatch=&crawlingDomFilterDepth=1&crawlingDomFilterCheck=off&crawlingstart=Start%20New%20Crawl&xpstopw=off&repeat_unit=seldays&crawlingDepth=99&directDocByURL=off";
 path += "&crawlingURL=" + url.toNormalform(true);
@@ -244,7 +244,7 @@ public static serverObjects respond(
 final Map<byte[], String> apiCommentCache = WorkTables.commentCache(sb);
 
 String urlString;
-DigestURI u;
+DigestURL u;
 try {
 final Iterator<Map.Entry<Scanner.Service, Scanner.Access>> se = Scanner.scancacheEntries();
 Map.Entry<Scanner.Service, Scanner.Access> host;
