Skip to content

Commit

Permalink
Added a mediafilter to extract text from Word/PowerPoint/Excel includ…
Browse files Browse the repository at this point in the history
…ing their XML format
  • Loading branch information
zuki committed Apr 22, 2014
1 parent 454061f commit 302de5d
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 11 deletions.
@@ -0,0 +1,112 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.mediafilter;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.IOException;

import org.apache.log4j.Logger;

import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;

/*
*
* TODO: report helpful error messages when mediafilter.cfg cannot be
* found, when the filter cannot be instantiated, or when the bitstream
* format does not exist.
*
*/
/**
 * Media filter that extracts plain text from Microsoft Office documents
 * using Apache POI.
 *
 * The concrete extractor is chosen by POI's {@link ExtractorFactory} from the
 * content of the input stream. PowerPoint presentations (both the binary and
 * the XML based extractor types) are handled specially so that slide text and
 * notes text are both requested; every other document type uses the
 * extractor's default {@code getText()}.
 */
public class MSOfficeFilter extends MediaFilter
{

    private static final Logger log = Logger.getLogger(MSOfficeFilter.class);

    /** Charset used to encode the extracted text into the result stream. */
    private static final String OUTPUT_CHARSET = "UTF-8";

    /**
     * @param oldFilename
     *            name of the bitstream being filtered
     *
     * @return String the original name with ".txt" appended
     */
    public String getFilteredName(String oldFilename)
    {
        return oldFilename + ".txt";
    }

    /**
     * @return String bundle name
     */
    public String getBundleName()
    {
        return "TEXT";
    }

    /**
     * @return String bitstreamformat
     */
    public String getFormatString()
    {
        return "Text";
    }

    /**
     * @return String description
     */
    public String getDescription()
    {
        return "Extracted text";
    }

    /**
     * Extracts the text of an Office document and returns it as a stream.
     *
     * @param source
     *            source input stream
     *
     * @return InputStream the extracted text, encoded as UTF-8
     *
     * @throws Exception
     *             any failure raised while creating the extractor or reading
     *             the text; it is logged here and then rethrown unchanged
     */
    public InputStream getDestinationStream(InputStream source)
            throws Exception
    {
        try
        {
            POITextExtractor extractor = ExtractorFactory.createExtractor(source);

            String extractedText;
            if (extractor instanceof PowerPointExtractor)
            {
                // request both slide text and notes text
                extractedText = ((PowerPointExtractor) extractor)
                        .getText(true, true);
            }
            else if (extractor instanceof XSLFPowerPointExtractor)
            {
                // request both slide text and notes text
                extractedText = ((XSLFPowerPointExtractor) extractor)
                        .getText(true, true);
            }
            else
            {
                extractedText = extractor.getText();
            }

            // if verbose flag is set, print out extracted text
            // to STDOUT
            if (MediaFilterManager.isVerbose)
            {
                System.out.println(extractedText);
            }

            // Encode with an explicit charset: the no-argument getBytes()
            // uses the JVM's platform default, which makes the stored text
            // depend on the host and can mangle non-ASCII characters.
            // NOTE(review): assumes consumers of the resulting text
            // bitstream read it as UTF-8 - confirm against the indexer.
            byte[] textBytes = extractedText.getBytes(OUTPUT_CHARSET);

            // The stream keeps its own reference to the array, so the bytes
            // stay reachable after this method returns.
            return new ByteArrayInputStream(textBytes);
        }
        catch (Exception e)
        {
            System.out.println("MSOfficeFilter Exception");
            log.error("Error detected : " + e.getMessage(), e);
            throw e;
        }
    }
}
13 changes: 5 additions & 8 deletions dspace/config/dspace.cfg
Expand Up @@ -431,26 +431,23 @@ http.proxy.port = ${http.proxy.port}

#Names of the enabled MediaFilter or FormatFilter plugins
filter.plugins = PDF Text Extractor, HTML Text Extractor, \
PowerPoint Text Extractor, \
Word Text Extractor, JPEG Thumbnail
MSOffice Text Extractor, JPEG Thumbnail
# [To enable Branded Preview]: remove last line above, and uncomment 2 lines below
# Word Text Extractor, JPEG Thumbnail, \
# Branded Preview JPEG
# MSOffice Text Extractor, JPEG Thumbnail, \
# Branded Preview JPEG

#Assign 'human-understandable' names to each filter
plugin.named.org.dspace.app.mediafilter.FormatFilter = \
org.dspace.app.mediafilter.PDFFilter = PDF Text Extractor, \
org.dspace.app.mediafilter.HTMLFilter = HTML Text Extractor, \
org.dspace.app.mediafilter.WordFilter = Word Text Extractor, \
org.dspace.app.mediafilter.PowerPointFilter = PowerPoint Text Extractor, \
org.dspace.app.mediafilter.MSOfficeFilter = MSOffice Text Extractor, \
org.dspace.app.mediafilter.JPEGFilter = JPEG Thumbnail, \
org.dspace.app.mediafilter.BrandedPreviewJPEGFilter = Branded Preview JPEG

#Configure each filter's input format(s)
filter.org.dspace.app.mediafilter.PDFFilter.inputFormats = Adobe PDF
filter.org.dspace.app.mediafilter.HTMLFilter.inputFormats = HTML, Text
filter.org.dspace.app.mediafilter.WordFilter.inputFormats = Microsoft Word
filter.org.dspace.app.mediafilter.PowerPointFilter.inputFormats = Microsoft Powerpoint, Microsoft Powerpoint XML
filter.org.dspace.app.mediafilter.MSOfficeFilter.inputFormats = Microsoft Word, Microsoft Word XML, Microsoft Powerpoint, Microsoft Powerpoint XML, Microsoft Excel, Microsoft Excel XML
filter.org.dspace.app.mediafilter.JPEGFilter.inputFormats = BMP, GIF, JPEG, image/png
filter.org.dspace.app.mediafilter.BrandedPreviewJPEGFilter.inputFormats = BMP, GIF, JPEG, image/png

Expand Down
6 changes: 3 additions & 3 deletions pom.xml
Expand Up @@ -897,17 +897,17 @@
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.6</version>
<version>3.10-FINAL</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.6</version>
<version>3.10-FINAL</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.6</version>
<version>3.10-FINAL</version>
</dependency>
<dependency>
<groupId>rome</groupId>
Expand Down

0 comments on commit 302de5d

Please sign in to comment.