Skip to content
Permalink
Browse files

Support parsing audio URLs without file extension

Also added a JUnit test for the audio tag parser
  • Loading branch information...
luccioman committed Apr 9, 2019
1 parent 42c8a25 commit e90405b6f042cdf19d1008cbc8b5b3ece217fc1d
@@ -63,9 +63,9 @@
public class audioTagParser extends AbstractParser implements Parser {

/**
* Enumeration of internet media types supported by the {@link audioTagParser}.
* Enumeration of audio formats supported by the {@link audioTagParser}.
*/
public enum SupportedAudioMediaType {
public enum SupportedAudioFormat {

AIF("audio/aiff", new String[] { "audio/x-aiff" }, new String[] { SupportedFileFormat.AIF.getFilesuffix(),
SupportedFileFormat.AIFC.getFilesuffix(), SupportedFileFormat.AIFF.getFilesuffix() }),
@@ -118,7 +118,7 @@
* @param mediaType the media type, formatted as "type/subtype"
* @param fileExtensions a set of file extensions matching the given media type
*/
private SupportedAudioMediaType(final String mediaType, final String[] fileExtensions) {
private SupportedAudioFormat(final String mediaType, final String[] fileExtensions) {
this(mediaType, new String[] {}, fileExtensions);
}

@@ -127,7 +127,7 @@ private SupportedAudioMediaType(final String mediaType, final String[] fileExten
* @param alternateMediaTypes alternate flavors of the main media type, all formatted as "type/subtype"
* @param fileExtensions a set of file extensions matching the given media type
*/
private SupportedAudioMediaType(final String mediaType, final String[] alternateMediaTypes, final String[] fileExtensions) {
private SupportedAudioFormat(final String mediaType, final String[] alternateMediaTypes, final String[] fileExtensions) {
this.mediaType = mediaType.toLowerCase(Locale.ROOT);
Set<String> alternates = new HashSet<>();
for (final String alternateMediaType : alternateMediaTypes) {
@@ -179,7 +179,7 @@ public String getMediaType() {
*/
public static Set<String> getAllMediaTypes() {
final Set<String> mediaTypes = new HashSet<>();
for(final SupportedAudioMediaType mediaType : SupportedAudioMediaType.values()) {
for(final SupportedAudioFormat mediaType : SupportedAudioFormat.values()) {
mediaTypes.add(mediaType.getMediaType());
for(final String mediaTypeString : mediaType.getAlternateMediaTypes()) {
mediaTypes.add(mediaTypeString);
@@ -193,15 +193,18 @@ public String getMediaType() {
*/
public static Set<String> getAllFileExtensions() {
final Set<String> extensions = new HashSet<>();
for(final SupportedAudioMediaType mediaType : SupportedAudioMediaType.values()) {
for(final SupportedAudioFormat mediaType : SupportedAudioFormat.values()) {
extensions.addAll(mediaType.getFileExtensions());
}
return extensions;
}
}

/** Map from each supported audio file extensions to a single audio media type */
private final Map<String, SupportedAudioMediaType> ext2NormalMediaType;
/** Map from each supported audio file extension to its audio format */
private final Map<String, SupportedAudioFormat> ext2Format;

/** Map from each supported audio media type to audio format */
private final Map<String, SupportedAudioFormat> mediaType2Format;

/** Space character */
private static final char SPACE_CHAR = ' ';
@@ -217,18 +220,25 @@ public String getMediaType() {
public audioTagParser() {
super("Audio File Meta-Tag Parser");

final Map<String, SupportedAudioMediaType> normalMap = new HashMap<>();
final Map<String, SupportedAudioFormat> ext2Formats = new HashMap<>();

final Map<String, SupportedAudioFormat> mediaType2Formats = new HashMap<>();

for(final SupportedAudioMediaType mediaType : SupportedAudioMediaType.values()) {
for(final SupportedAudioFormat mediaType : SupportedAudioFormat.values()) {
this.SUPPORTED_MIME_TYPES.add(mediaType.getMediaType());
this.SUPPORTED_MIME_TYPES.addAll(mediaType.getAlternateMediaTypes());
this.SUPPORTED_EXTENSIONS.addAll(mediaType.getFileExtensions());
for(final String fileExtension : mediaType.getFileExtensions()) {
normalMap.put(fileExtension, mediaType);
ext2Formats.put(fileExtension, mediaType);
}
mediaType2Formats.put(mediaType.getMediaType(), mediaType);
for(final String mediaTypeStr : mediaType.getAlternateMediaTypes()) {
mediaType2Formats.put(mediaTypeStr, mediaType);
}
}

this.ext2NormalMediaType = Collections.unmodifiableMap(normalMap);
this.ext2Format = Collections.unmodifiableMap(ext2Formats);
this.mediaType2Format = Collections.unmodifiableMap(mediaType2Formats);
}

@Override
@@ -246,9 +256,34 @@ public audioTagParser() {
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper,
final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)
throws UnsupportedOperationException, Failure, InterruptedException {
throws Failure, InterruptedException {
String filename = location.getFileName();
final String fileext = MultiProtocolURL.getFileExtension(filename);
String fileExt = MultiProtocolURL.getFileExtension(filename);


SupportedAudioFormat audioFormat = null;
if(fileExt != null) {
audioFormat = this.ext2Format.get(fileExt);
}
if(audioFormat == null) {
audioFormat = this.mediaType2Format.get(mimeType);
}

String normalizedMediaType = mimeType;
if(audioFormat != null) {
/* normalize to a single Media Type. Advantages :
* - index document with the right media type when HTTP response header "Content-Type" is missing or has a wrong value
* - for easier search by CollectionSchema.content_type in the index
*/
normalizedMediaType = audioFormat.getMediaType();

if(fileExt.isEmpty() || !ext2Format.containsKey(fileExt)) {
/* Normalize extension to a one known by jaudiotagger */
fileExt = audioFormat.getFileExtensions().iterator().next();
}
}


filename = filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename);

File tempFile = null;
@@ -260,7 +295,7 @@ public audioTagParser() {
f = AudioFileIO.read(location.getFSFile());
} else {
// create a temporary file, as jaudiotagger requires a file rather than an input stream
tempFile = File.createTempFile(filename, "." + fileext);
tempFile = File.createTempFile(filename, "." + fileExt);
long bytesCopied = FileUtils.copy(source, tempFile, maxBytes);
partiallyParsed = bytesCopied == maxBytes && source.read() != -1;
f = AudioFileIO.read(tempFile);
@@ -316,21 +351,9 @@ public audioTagParser() {
detectedUrls = Collections.emptySet();
}

/* normalize to a single Media Type. Advantages :
* - index document with the right media type when HTTP response header "Content-Type" is missing or has a wrong value
* - for easier search by CollectionSchema.content_type in the index
*/
String mime = mimeType;
if(fileext != null && !fileext.isEmpty() ) {
final SupportedAudioMediaType mediaType = this.ext2NormalMediaType.get(fileext);
if(mediaType != null) {
mime = mediaType.getMediaType();
}
}

final Document doc = new Document(
location,
mime,
normalizedMediaType,
charset,
this,
lang, // languages
@@ -441,11 +441,11 @@ public static void migrateDisabledAudioFormats(final Switchboard sb) {
* All old audio file extensions and media types are denied : we add newly
* supported ones to these deny lists
*/
deniedExtensions.addAll(audioTagParser.SupportedAudioMediaType.getAllFileExtensions());
deniedExtensions.addAll(audioTagParser.SupportedAudioFormat.getAllFileExtensions());

sb.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, deniedExtensions);

deniedMediaTypes.addAll(audioTagParser.SupportedAudioMediaType.getAllMediaTypes());
deniedMediaTypes.addAll(audioTagParser.SupportedAudioFormat.getAllMediaTypes());

sb.setConfig(SwitchboardConstants.PARSER_MIME_DENY, deniedMediaTypes);

@@ -887,8 +887,8 @@ public void run() {

/* audioTagParser is disabled by default as it needs a temporary file (because of the JAudiotagger implementation) for each parsed document */
if (!enableAudioTags) {
denyExt.addAll(audioTagParser.SupportedAudioMediaType.getAllFileExtensions());
denyMime.addAll(audioTagParser.SupportedAudioMediaType.getAllMediaTypes());
denyExt.addAll(audioTagParser.SupportedAudioFormat.getAllFileExtensions());
denyMime.addAll(audioTagParser.SupportedAudioFormat.getAllMediaTypes());

setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, denyExt);
setConfig(SwitchboardConstants.PARSER_MIME_DENY, denyMime);
@@ -0,0 +1,142 @@
// audioTagParserTest.java
// ---------------------------
// Copyright 2019 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.document.parser;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collection;

import org.junit.Before;
import org.junit.Test;

import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.document.VocabularyScraper;

/**
 * Unit tests for the {@link audioTagParser} class.
 * <p>
 * Every test reads a sample audio file from the {@code test/parsertest} folder
 * and feeds it to a fresh parser instance through
 * {@link audioTagParser#parse(DigestURL, String, String, VocabularyScraper, int, java.io.InputStream)}.
 */
public class audioTagParserTest {

    /** Folder containing test files */
    private static final File TEST_FOLDER = new File("test", "parsertest");

    /** Media type handed to the parser by the tests that do not vary it */
    private static final String OGG_MEDIA_TYPE = "audio/ogg";

    /** The parser under test, re-created before each test method */
    private audioTagParser parser;

    /**
     * Creates a new parser instance so that tests do not share state.
     */
    @Before
    public void before() {
        this.parser = new audioTagParser();
    }

    /**
     * Opens the given test file and runs the parser under test on its content.
     * The stream is always closed, whether parsing succeeds or fails.
     *
     * @param location     the URL announced to the parser as the document location
     * @param mediaType    the media type announced to the parser; may be null
     * @param testFileName name of the file to read from {@link #TEST_FOLDER}
     * @return the documents produced by the parser
     * @throws Failure              when the file could not be parsed
     * @throws InterruptedException when parsing was interrupted
     * @throws IOException          when the test file could not be read
     */
    private Document[] parseTestFile(final DigestURL location, final String mediaType, final String testFileName)
            throws Failure, InterruptedException, IOException {
        try (FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, testFileName))) {
            return this.parser.parse(location, mediaType, StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0,
                    inStream);
        }
    }

    /**
     * Unit test for the
     * {@link audioTagParser#parse(DigestURL, String, String, VocabularyScraper, int, java.io.InputStream)}
     * function with some small (1 second length) test files.
     * <p>
     * Note that "audio/ogg" is announced as media type for every file, including
     * the ones that are not Ogg files: only the file name in the URL differs.
     *
     * @throws Failure              when a file could not be parsed
     * @throws InterruptedException when the test was interrupted before its
     *                              termination
     * @throws IOException          when a read/write error occurred
     */
    @Test
    public void testParse() throws Failure, InterruptedException, IOException {
        final String[] fileNames = { "umlaute_windows.aiff", "umlaute_windows.flac", "umlaute_windows.m4a",
                "umlaute_windows.mp3", "umlaute_windows.ogg", "umlaute_windows.wav" };

        for (final String fileName : fileNames) {
            final DigestURL location = new DigestURL("http://localhost/" + fileName);
            final Document[] documents = parseTestFile(location, OGG_MEDIA_TYPE, fileName);

            assertNotNull("Parser result must not be null for file " + fileName, documents);
            /* Fail with an explicit message rather than an ArrayIndexOutOfBoundsException below */
            assertTrue("Parser result must not be empty for file " + fileName, documents.length > 0);

            final String text = documents[0].getTextString();
            assertNotNull("Parsed text must not be null for file " + fileName, text);
            assertTrue("Parsed text must contain test word with umlaut char for file " + fileName,
                    text.contains("Maßkrügen"));

            final Collection<AnchorURL> anchors = documents[0].getAnchors();
            assertNotNull("Detected URLS must not be null for file " + fileName, anchors);
            assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
            /* assertEquals reports the actual URL on failure, unlike assertTrue(a.equals(b)) */
            assertEquals("Unexpected detected URL for file " + fileName, "https://yacy.net/",
                    anchors.iterator().next().toString());
        }
    }

    /**
     * Test support for parsing audio document with proper Media Type but without
     * extension or unrelated extension in its file name.
     *
     * @throws Failure              when the file could not be parsed
     * @throws InterruptedException when the test was interrupted before its
     *                              termination
     * @throws IOException          when a read/write error occurred
     */
    @Test
    public void testParseDocUrlWithoutFileExt() throws Failure, InterruptedException, IOException {
        final String testFileName = "umlaute_windows.ogg";
        final String[] locations = { "http://localhost/audioTrack", "http://localhost/example.audio" };

        for (final String locationStr : locations) {
            final DigestURL location = new DigestURL(locationStr);
            final Document[] documents = parseTestFile(location, OGG_MEDIA_TYPE, testFileName);
            assertNotNull("Parser result must not be null for URL " + location, documents);
        }
    }

    /**
     * Test support for parsing audio document with unknown or generic Media Type:
     * the URL carries the ".ogg" file extension while the announced media type is
     * either null or "application/octet-stream".
     *
     * @throws Failure              when the file could not be parsed
     * @throws InterruptedException when the test was interrupted before its
     *                              termination
     * @throws IOException          when a read/write error occurred
     */
    @Test
    public void testParseUnkownMediaType() throws Failure, InterruptedException, IOException {
        final String testFileName = "umlaute_windows.ogg";
        final DigestURL location = new DigestURL("http://localhost/" + testFileName);
        final String[] mediaTypes = { null, "application/octet-stream" };

        for (final String mediaType : mediaTypes) {
            final Document[] documents = parseTestFile(location, mediaType, testFileName);
            assertNotNull("Parser result must not be null for Media Type " + mediaType, documents);
        }
    }

}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit e90405b

Please sign in to comment.
You can’t perform that action at this time.