Skip to content

Commit

Permalink
Updated the list of audio file formats supported by the audioTagParser
Browse files Browse the repository at this point in the history
Follows upgrade to Jaudiotagger dependency to version 2.2.5.
  • Loading branch information
luccioman committed Feb 27, 2018
1 parent 5753ce0 commit c3ff50c
Show file tree
Hide file tree
Showing 4 changed files with 241 additions and 48 deletions.
2 changes: 2 additions & 0 deletions defaults/yacy.init
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,8 @@ releases = DATA/RELEASE
# parser.mime.deny: specifies mime-types that shall not be indexed
parser.mime.deny=
parser.extensions.deny=
# The audioTagParser is disabled by default as it needs to create a temporary file each time an audio resource is parsed
# Audio file extensions and media types can be enabled in the ConfigParser_p.html page if this is not a problem with your install
parser.enableAudioTags=false

# experimental single-page parser for pdf files: split one pdf into individual pages;
Expand Down
210 changes: 177 additions & 33 deletions source/net/yacy/document/parser/audioTagParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,21 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.jaudiotagger.audio.AudioFile;
import org.jaudiotagger.audio.AudioFileIO;
import org.jaudiotagger.audio.SupportedFileFormat;
import org.jaudiotagger.tag.FieldKey;
import org.jaudiotagger.tag.Tag;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.ConcurrentLog;
Expand All @@ -43,30 +53,168 @@
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;

import org.jaudiotagger.audio.AudioFile;
import org.jaudiotagger.audio.AudioFileIO;
import org.jaudiotagger.tag.FieldKey;
import org.jaudiotagger.tag.Tag;

/**
* This parser extracts meta-data tags (such as the id3 tags of mp3 files) from audio files in the formats listed by {@link SupportedAudioMediaType}
*/
public class audioTagParser extends AbstractParser implements Parser {

public static String EXTENSIONS = "mp3,ogg,oga,m4a,m4p,flac,wma";
public static String MIME_TYPES = "audio/mpeg,audio/MPA,audio/mpa-robust,audio/mp4,audio/flac,audio/x-flac,audio/x-ms-wma,audio/x-ms-asf";
public static String SEPERATOR = ",";
/**
 * Internet media types handled by the {@link audioTagParser}. Each constant holds
 * one main media type, its equivalent alternate media types (possibly none) and
 * the file extensions associated with it (possibly none).
 */
public enum SupportedAudioMediaType {

    AIF("audio/aiff", new String[] { "audio/x-aiff" },
            new String[] { SupportedFileFormat.AIF.getFilesuffix(), SupportedFileFormat.AIFC.getFilesuffix(),
                    SupportedFileFormat.AIFF.getFilesuffix() }),

    /** @see <a href="https://www.iana.org/assignments/media-types/audio/mpeg">mpeg assignment at IANA</a> */
    MPEG("audio/mpeg", new String[] { "audio/MPA" }, new String[] { SupportedFileFormat.MP3.getFilesuffix() }),

    /** @see <a href="https://www.iana.org/assignments/media-types/audio/MPA">MPA assignment at IANA</a> */
    MPA("audio/MPA", new String[] {}),

    /** @see <a href="https://www.iana.org/assignments/media-types/audio/mpa-robust">mpa-robust assignment at IANA</a> */
    MPA_ROBUST("audio/mpa-robust", new String[] {}),

    /** @see <a href="https://www.iana.org/assignments/media-types/audio/mp4">mp4 assignment at IANA</a> */
    MP4("audio/mp4",
            new String[] { SupportedFileFormat.M4A.getFilesuffix() /* Audio-only MPEG-4 */,
                    SupportedFileFormat.M4B.getFilesuffix() /* Audio book (Apple) */,
                    SupportedFileFormat.M4P.getFilesuffix() /* Apple iTunes */,
                    SupportedFileFormat.MP4.getFilesuffix() /* Standard extension */ }),

    /** @see <a href="https://xiph.org/flac/index.html">FLAC home page</a> */
    FLAC("audio/flac", new String[] { "audio/x-flac" }, new String[] { SupportedFileFormat.FLAC.getFilesuffix() }),

    /** @see <a href="https://www.iana.org/assignments/media-types/audio/ogg">ogg assignment at IANA</a> */
    OGG("audio/ogg", new String[] { SupportedFileFormat.OGG.getFilesuffix() }),

    WMA("audio/x-ms-wma", new String[] { "audio/x-ms-asf" },
            new String[] { SupportedFileFormat.WMA.getFilesuffix() }),

    REAL_AUDIO("audio/vnd.rn-realaudio", new String[] { "audio/x-pn-realaudio" },
            new String[] { SupportedFileFormat.RA.getFilesuffix(), SupportedFileFormat.RM.getFilesuffix() }),

    /** @see <a href="https://tools.ietf.org/html/rfc2361">RFC 2361 memo (not a standard)</a> */
    WAV("audio/vnd.wave", new String[] { "audio/wav", "audio/wave", "audio/x-wav" },
            new String[] { SupportedFileFormat.WAV.getFilesuffix() });

    /**
     * Main media type, stored in lower case.
     * When possible the subtype without the "x-" prefix is the preferred one.
     * @see <a href="https://tools.ietf.org/html/rfc6648">RFC 6648 about Deprecating the "X-" Prefix</a>
     */
    private final String mediaType;

    /** Alternate flavors of the main media type, stored in lower case */
    private final Set<String> alternateMediaTypes;

    /** File extensions associated with the media type, stored in lower case */
    private final Set<String> fileExtensions;

    /**
     * Creates a media type without any alternate flavor.
     * @param mediaType the media type, formatted as "type/subtype"
     * @param fileExtensions the file extensions matching the given media type
     */
    SupportedAudioMediaType(final String mediaType, final String[] fileExtensions) {
        this(mediaType, new String[0], fileExtensions);
    }

    /**
     * @param mediaType the main media type, formatted as "type/subtype"
     * @param alternateMediaTypes alternate flavors of the main media type, all formatted as "type/subtype"
     * @param fileExtensions the file extensions matching the given media type
     */
    SupportedAudioMediaType(final String mediaType, final String[] alternateMediaTypes,
            final String[] fileExtensions) {
        this.mediaType = mediaType.toLowerCase(Locale.ROOT);
        this.alternateMediaTypes = toLowerCaseSet(alternateMediaTypes);
        this.fileExtensions = toLowerCaseSet(fileExtensions);
    }

    /**
     * Converts an array of strings to a read-only set of their lower cased values.
     * @param values the strings to convert
     * @return the shared immutable empty set when no value is given, otherwise an unmodifiable set
     */
    private static Set<String> toLowerCaseSet(final String[] values) {
        if (values.length == 0) {
            return Collections.emptySet();
        }
        final Set<String> lowerCased = new HashSet<>();
        for (final String value : values) {
            lowerCased.add(value.toLowerCase(Locale.ROOT));
        }
        return Collections.unmodifiableSet(lowerCased);
    }

    /**
     * @return the lower cased main media type, in the form "type/subtype"
     */
    public String getMediaType() {
        return this.mediaType;
    }

    /**
     * @return the unmodifiable set of lower cased alternate media types, in the form
     *         "type/subtype", equivalent to the main media type. May be empty.
     */
    public Set<String> getAlternateMediaTypes() {
        return this.alternateMediaTypes;
    }

    /**
     * @return the unmodifiable set of lower cased file extensions related to this
     *         media type. May be empty.
     */
    public Set<String> getFileExtensions() {
        return this.fileExtensions;
    }

    /**
     * @return a new modifiable set holding the main and alternate media types of every constant
     */
    public static Set<String> getAllMediaTypes() {
        final Set<String> allMediaTypes = new HashSet<>();
        for (final SupportedAudioMediaType supported : values()) {
            allMediaTypes.add(supported.mediaType);
            allMediaTypes.addAll(supported.alternateMediaTypes);
        }
        return allMediaTypes;
    }

    /**
     * @return a new modifiable set holding the file extensions of every constant
     */
    public static Set<String> getAllFileExtensions() {
        final Set<String> allExtensions = new HashSet<>();
        for (final SupportedAudioMediaType supported : values()) {
            allExtensions.addAll(supported.fileExtensions);
        }
        return allExtensions;
    }
}

/** Map from each supported audio file extension to a single audio media type */
private final Map<String, SupportedAudioMediaType> ext2NormalMediaType;


public audioTagParser() {
super("Audio File Meta-Tag Parser");
final String[] extArray = EXTENSIONS.split(SEPERATOR);
for (final String ext : extArray) {
this.SUPPORTED_EXTENSIONS.add(ext);
}
final String[] mimeArray = MIME_TYPES.split(SEPERATOR);
for (final String mime : mimeArray) {
this.SUPPORTED_MIME_TYPES.add(mime);

final Map<String, SupportedAudioMediaType> normalMap = new HashMap<>();

for(final SupportedAudioMediaType mediaType : SupportedAudioMediaType.values()) {
this.SUPPORTED_MIME_TYPES.add(mediaType.getMediaType());
this.SUPPORTED_MIME_TYPES.addAll(mediaType.getAlternateMediaTypes());
this.SUPPORTED_EXTENSIONS.addAll(mediaType.getFileExtensions());
for(final String fileExtension : mediaType.getFileExtensions()) {
normalMap.put(fileExtension, mediaType);
}
}

this.ext2NormalMediaType = Collections.unmodifiableMap(normalMap);
}

@Override
Expand All @@ -80,25 +228,9 @@ public Document[] parse(
throws Parser.Failure, InterruptedException {

String filename = location.getFileName();
final String fileext = '.' + MultiProtocolURL.getFileExtension(filename);
final String fileext = MultiProtocolURL.getFileExtension(filename);
filename = filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename);
String mime = mimeType;

// fix mimeType
if(!this.SUPPORTED_MIME_TYPES.contains(mimeType)) {
if(fileext.equals("mp3")) {
mime = "audio/mpeg";
} else if(fileext.equals("ogg")) {
mime = "audio/ogg";
} else if(fileext.equals("flac")) {
mime = "audio/flac";
} else if(fileext.equals("wma")) {
mime = "audio/x-ms-wma";
} else if(fileext.startsWith("m4")) {
mime = "audio/mp4";
}
}

Document[] docs;
BufferedOutputStream fout = null;
File tempFile = null;
Expand All @@ -109,7 +241,7 @@ public Document[] parse(
f = AudioFileIO.read(location.getFSFile());
} else {
// create a temporary file, as jaudiotagger requires a file rather than an input stream
tempFile = File.createTempFile(filename,fileext);
tempFile = File.createTempFile(filename, "." + fileext);
fout = new BufferedOutputStream(new FileOutputStream(tempFile));
int c;
while ((c = source.read()) != -1) {
Expand Down Expand Up @@ -159,6 +291,18 @@ public Document[] parse(
// dc:subject
final String[] subject = new String[1];
subject[0] = tag.getFirst(FieldKey.GENRE);

/* normalize to a single Media Type. Advantages :
* - index document with the right media type when HTTP response header "Content-Type" is missing or has a wrong value
* - for easier search by CollectionSchema.content_type in the index
*/
String mime = mimeType;
if(fileext != null && !fileext.isEmpty() ) {
final SupportedAudioMediaType mediaType = this.ext2NormalMediaType.get(fileext);
if(mediaType != null) {
mime = mediaType.getMediaType();
}
}

docs = new Document[]{new Document(
location,
Expand Down
55 changes: 55 additions & 0 deletions source/net/yacy/migration.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
Expand All @@ -42,6 +43,8 @@
import net.yacy.cora.protocol.TimeoutRequest;
import net.yacy.cora.storage.Configuration.Entry;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.TextParser;
import net.yacy.document.parser.audioTagParser;
import net.yacy.kelondro.workflow.BusyThread;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
Expand All @@ -57,6 +60,9 @@ public class migration {

/** Removal of deprecated IPAccessHandler for white list implementation (serverClient setting) */
public static final double NEW_IPPATTERNS = 1.92109489;

/** Addition of supplementary audio file formats supported by the audioTagParser */
public static final double ADDITIONAL_AUDIO_TAG_FORMATS = 1.92109589;

/**
* Migrates older configuration to current version
Expand All @@ -75,6 +81,9 @@ public static void migrate(final Switchboard sb, final double fromVer, final dou
if (fromVer < NEW_IPPATTERNS) {
migrateServerClientSetting(sb);
}
if (fromVer < ADDITIONAL_AUDIO_TAG_FORMATS) {
migrateDisabledAudioFormats(sb);
}
// use String.format to cut-off small rounding errors
ConcurrentLog.info("MIGRATION", "Migrating from "+ String.format(Locale.US, "%.8f",fromVer) + " to " + String.format(Locale.US, "%.8f",toVer));
if (fromVer < 0.47d) {
Expand Down Expand Up @@ -398,6 +407,52 @@ protected static boolean migrateIPAddressPatterns(final String patternSeparator,
}
return hasDeprecated;
}

/**
 * Adds the audio formats newly supported by the audioTagParser to the parser
 * deny lists. This parser is disabled by default, so its supported file
 * extensions and media types are added to the deny lists at first install.
 * On an existing install where every formerly supported extension and media
 * type is still denied, the complete current lists of supported extensions and
 * media types are added to the deny lists. Otherwise nothing is modified.
 *
 * @param sb
 *            the main Switchboard instance. Must not be null.
 */
public static void migrateDisabledAudioFormats(final Switchboard sb) {
    /* File extensions formerly listed in the audioTagParser.EXTENSIONS constant */
    final Set<String> formerExtensions = new HashSet<>();
    Collections.addAll(formerExtensions, "mp3", "ogg", "oga", "m4a", "m4p", "flac", "wma");

    /* Media types formerly listed in the audioTagParser.MIME_TYPES constant */
    final Set<String> formerMediaTypes = new HashSet<>();
    Collections.addAll(formerMediaTypes, "audio/mpeg", "audio/MPA", "audio/mpa-robust", "audio/mp4",
            "audio/flac", "audio/x-flac", "audio/x-ms-wma", "audio/x-ms-asf");

    final Set<String> extensionsDenyList = sb.getConfigSet(SwitchboardConstants.PARSER_EXTENSIONS_DENY);
    final Set<String> mediaTypesDenyList = sb.getConfigSet(SwitchboardConstants.PARSER_MIME_DENY);

    if (!extensionsDenyList.containsAll(formerExtensions) || !mediaTypesDenyList.containsAll(formerMediaTypes)) {
        /* At least one formerly supported format is not denied : the deny lists are left untouched */
        return;
    }

    /*
     * All old audio file extensions and media types are denied : the currently
     * supported ones are added to these deny lists
     */
    extensionsDenyList.addAll(audioTagParser.SupportedAudioMediaType.getAllFileExtensions());
    mediaTypesDenyList.addAll(audioTagParser.SupportedAudioMediaType.getAllMediaTypes());

    sb.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, extensionsDenyList);
    sb.setConfig(SwitchboardConstants.PARSER_MIME_DENY, mediaTypesDenyList);

    /* Apply the stored configuration values to the TextParser deny lists */
    TextParser.setDenyMime(sb.getConfig(SwitchboardConstants.PARSER_MIME_DENY, ""));
    TextParser.setDenyExtension(sb.getConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, ""));
}

/**
* Reindex embedded solr index
Expand Down
22 changes: 7 additions & 15 deletions source/net/yacy/search/Switchboard.java
Original file line number Diff line number Diff line change
Expand Up @@ -867,24 +867,16 @@ public void run() {

final boolean enableAudioTags = getConfigBool("parser.enableAudioTags", false);
log.config("Parser: parser.enableAudioTags= "+enableAudioTags);
final StringBuilder denyExt = new StringBuilder(256);
final StringBuilder denyMime = new StringBuilder(256);
denyExt.append(getConfig(SwitchboardConstants.PARSER_MIME_DENY, ""));
denyMime.append(getConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, ""));
final Set<String> denyExt = getConfigSet(SwitchboardConstants.PARSER_EXTENSIONS_DENY);
final Set<String> denyMime = getConfigSet(SwitchboardConstants.PARSER_MIME_DENY);

/* audioTagParser is disabled by default as it needs a temporary file (because of the JAudiotagger implementation) for each parsed document */
if (!enableAudioTags) {
if(denyExt.length()>0) {
denyExt.append(audioTagParser.SEPERATOR);
}
denyExt.append(audioTagParser.EXTENSIONS);

if(denyMime.length()>0) {
denyMime.append(audioTagParser.SEPERATOR);
}
denyMime.append(audioTagParser.MIME_TYPES);
denyExt.addAll(audioTagParser.SupportedAudioMediaType.getAllFileExtensions());
denyMime.addAll(audioTagParser.SupportedAudioMediaType.getAllMediaTypes());

setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, denyExt.toString());
setConfig(SwitchboardConstants.PARSER_MIME_DENY, denyMime.toString());
setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, denyExt);
setConfig(SwitchboardConstants.PARSER_MIME_DENY, denyMime);
setConfig("parser.enableAudioTags", true);
}

Expand Down

0 comments on commit c3ff50c

Please sign in to comment.