Skip to content

Commit

Permalink
- enhanced parser: collection of audio, video, image and application links
Browse files Browse the repository at this point in the history

- enhanced condenser: better handling of utf-8 and pre-formatted texts


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3017 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Nov 28, 2006
1 parent 984285b commit ceb9e3a
Show file tree
Hide file tree
Showing 16 changed files with 263 additions and 203 deletions.
7 changes: 5 additions & 2 deletions htroot/CacheAdmin_p.java
Expand Up @@ -124,11 +124,14 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
info.append("<b>TITLE:</b><br>").append(scraper.getTitle()).append("<br>").append("<br>")
.append("<b>SECTION HEADLINES:</b><br>").append(formatTitles(document.getSectionTitles())).append("<br>")
.append("<b>HREF:</b><br>").append(formatAnchor(document.getHyperlinks())).append("<br>")
.append("<b>MEDIA:</b><br>").append(formatAnchor(document.getMedialinks())).append("<br>")
.append("<b>IMAGE:</b><br>").append(formatAnchor(document.getImagelinks())).append("<br>")
.append("<b>AUDIO:</b><br>").append(formatAnchor(document.getAudiolinks())).append("<br>")
.append("<b>VIDEO:</b><br>").append(formatAnchor(document.getVideolinks())).append("<br>")
.append("<b>APPS:</b><br>").append(formatAnchor(document.getApplinks())).append("<br>")
.append("<b>EMAIL:</b><br>").append(formatAnchor(document.getEmaillinks())).append("<br>")
.append("<b>TEXT:</b><br><span class=\"small\">").append(new String(scraper.getText())).append("</span><br>")
.append("<b>LINES:</b><br><span class=\"small\">");
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
final Enumeration sentences = document.getSentences(false);
if (sentences != null) while (sentences.hasMoreElements()) {
info.append((String) sentences.nextElement()).append("<br>");
}
Expand Down
11 changes: 7 additions & 4 deletions htroot/ViewFile.java
Expand Up @@ -57,6 +57,7 @@
import de.anomic.http.httpc;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
Expand Down Expand Up @@ -99,7 +100,8 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
URL url = null;
String descr = "";
int wordCount = 0;
int size = 0;
int size = 0;
boolean pre = false;

// getting the url hash from which the content should be loaded
String urlHash = post.get("urlHash","");
Expand All @@ -124,6 +126,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
descr = comp.descr();
urlEntry.wordCount();
size = urlEntry.size();
pre = urlEntry.flags().get(plasmaCondenser.flag_cat_indexof);
}

// alternatively, get the url simply from a url String
Expand All @@ -140,6 +143,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve

// define an url by post parameter
url = new URL(urlString);
pre = post.get("pre", "false").equals("true");
} catch (MalformedURLException e) {}


Expand Down Expand Up @@ -303,14 +307,13 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
prop.put("viewMode_parsedText", content);
} else {
prop.put("viewMode", VIEW_MODE_AS_PARSED_SENTENCES);
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
final Enumeration sentences = document.getSentences(pre);

boolean dark = true;
int i = 0;
if (sentences != null)
while (sentences.hasMoreElements()) {
String currentSentence = wikiCode
.replaceHTML((String) sentences.nextElement());
String currentSentence = wikiCode.replaceHTML((String) sentences.nextElement());

// Search word highlighting
String words = post.get("words", null);
Expand Down
4 changes: 2 additions & 2 deletions htroot/htdocsdefault/dir.java
Expand Up @@ -360,7 +360,7 @@ public static String yacyhURL(yacySeed seed, String filename, String md5) {
public static void indexPhrase(plasmaSwitchboard switchboard, String urlstring, String phrase, String descr, byte[] md5) {
try {
final URL url = new URL(urlstring);
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()), "UTF-8");
final indexURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry(
url,
"YaCyShare: " + descr,
Expand Down Expand Up @@ -395,7 +395,7 @@ public static void indexPhrase(plasmaSwitchboard switchboard, String urlstring,
public static void deletePhrase(plasmaSwitchboard switchboard, String urlstring, String phrase, String descr) {
try {
final String urlhash = plasmaURL.urlHash(new URL(urlstring));
final Iterator words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes("UTF-8"));
final Iterator words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes("UTF-8"), "UTF-8");
Map.Entry entry;
while (words.hasNext()) {
entry = (Map.Entry) words.next();
Expand Down
10 changes: 6 additions & 4 deletions htroot/xml/snippet.java
Expand Up @@ -28,6 +28,9 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
// if 'remove' is set to true, then RWI references to URLs that do not have the snippet are removed
boolean remove = post.get("remove", "false").equals("true");

// boolean line_end_with_punctuation
boolean pre = post.get("pre", "false").equals("true");

String querystring = post.get("search", "").trim();
if ((querystring.length() > 2) && (querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) {
querystring = querystring.substring(1, querystring.length() - 1).trim();
Expand All @@ -40,10 +43,9 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords);
}

// do the search
Set queryHashes = plasmaCondenser.words2hashes(query);

plasmaSnippetCache.Snippet snippet = switchboard.snippetCache.retrieveSnippet(url, queryHashes, true, 260, 10000);
// find snippet
Set queryHashes = plasmaCondenser.words2hashes(query);
plasmaSnippetCache.Snippet snippet = switchboard.snippetCache.retrieveSnippet(url, queryHashes, true, pre, 260, 10000);
prop.put("status",snippet.getSource());
if (snippet.getSource() < 11) {
//prop.put("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown");
Expand Down
3 changes: 2 additions & 1 deletion htroot/yacy/search.java
Expand Up @@ -56,6 +56,7 @@
import de.anomic.index.indexContainer;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
Expand Down Expand Up @@ -256,7 +257,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
while ((acc.hasMoreElements()) && (i < squery.wantedResults)) {
urlentry = (indexURLEntry) acc.nextElement();
if (includesnippet) {
snippet = sb.snippetCache.retrieveSnippet(urlentry.comp().url(), squery.queryHashes, false, 260, 1000);
snippet = sb.snippetCache.retrieveSnippet(urlentry.comp().url(), squery.queryHashes, false, urlentry.flags().get(plasmaCondenser.flag_cat_indexof), 260, 1000);
} else {
snippet = null;
}
Expand Down
2 changes: 2 additions & 0 deletions source/de/anomic/index/indexURLEntry.java
Expand Up @@ -31,6 +31,7 @@
import java.net.MalformedURLException;
import java.util.Date;

import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.index.indexRWIEntry;
Expand All @@ -49,6 +50,7 @@ public interface indexURLEntry {
public int size();
public int wordCount();
public String snippet();
public kelondroBitfield flags();
public indexRWIEntry word();
public boolean isOlder(indexURLEntry other);
public String toString(String snippet);
Expand Down
5 changes: 5 additions & 0 deletions source/de/anomic/index/indexURLEntryOld.java
Expand Up @@ -35,6 +35,7 @@
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaURL;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.crypt;
Expand Down Expand Up @@ -262,6 +263,10 @@ public int size() {
/**
 * Returns the value stored in this entry's {@code wordCount} field.
 *
 * NOTE(review): presumably the number of words counted in the referenced
 * document -- confirm against the code that assigns the field.
 *
 * @return the current value of {@code wordCount}
 */
public int wordCount() {
return wordCount;
}

/**
 * Returns the flag bitfield for this entry.
 *
 * This implementation does not read any per-entry state: it always returns
 * the {@code plasmaSearchQuery.empty_constraint} reference itself, without
 * copying it.
 *
 * NOTE(review): presumably the old entry format stores no flags, so every
 * flag lookup on the result is expected to be unset -- confirm. Because the
 * same object is handed to every caller, it should be treated as read-only.
 *
 * @return {@code plasmaSearchQuery.empty_constraint}
 */
public kelondroBitfield flags() {
return plasmaSearchQuery.empty_constraint;
}

public String snippet() {
// the snippet may appear here if the url was transported in a remote search
Expand Down

0 comments on commit ceb9e3a

Please sign in to comment.