Skip to content

Commit

Permalink
*) Avoid rejecting HTML content by the crawler when the file extension is not set properly
Browse files Browse the repository at this point in the history

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1074 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
theli committed Nov 14, 2005
1 parent e9d6def commit 445e3a6
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions source/de/anomic/plasma/plasmaParser.java
Expand Up @@ -258,6 +258,11 @@ public static boolean realtimeParsableMimeTypesContains(String mimeType) {
}

/**
 * Decides whether a resource is accepted, based on its MIME type and URL.
 *
 * The MIME type must always pass {@code supportedMimeTypesContains}. If the
 * MIME type is {@code text/html} (compared case-insensitively after trimming
 * surrounding whitespace), that check alone decides the result and the URL
 * is not inspected. For every other MIME type, including {@code null}, the
 * URL must additionally pass {@code supportedFileExt}.
 *
 * @param url      the location of the resource; only consulted for
 *                 non-HTML MIME types that passed the MIME type check
 * @param mimeType the MIME type reported for the resource, may be null
 * @return true if the resource passes the checks described above
 */
public static boolean supportedContent(URL url, String mimeType) {
    // TODO: we need some exceptions here to index URLs like this
    // http://www.musicabona.com/respighi/12668/cd/index.html.fr

    // Both outcomes require the MIME type check to succeed, so fail fast.
    final boolean mimeTypeAccepted = supportedMimeTypesContains(mimeType);
    if (!mimeTypeAccepted) {
        return false;
    }

    // HTML is accepted without looking at the file extension, which lets
    // pages with unusual URL suffixes (see the example above) through.
    final boolean declaredAsHtml =
            (mimeType != null) && "text/html".equalsIgnoreCase(mimeType.trim());
    if (declaredAsHtml) {
        return true;
    }

    return supportedFileExt(url);
}

Expand Down Expand Up @@ -715,8 +720,10 @@ public static void main(String[] args) {
//plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray());
//byte[] theText = document.getText();
//serverFileUtils.write(theText, out);
String[] sentences = document.getSentences();
for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]);
if (document != null) {
String[] sentences = document.getSentences();
if (sentences != null) for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]);
}
} catch (Exception e) {
e.printStackTrace();
}
Expand Down

0 comments on commit 445e3a6

Please sign in to comment.