
Fixes to StartOffsetDocumentCollection.

Cleaned up Maven dependencies and assembly config.
Moved to Git.
1 parent ff10643 · commit 09de3bafeecf7f6b31c5daabe337cb585127d49f · Timpy committed Dec 5, 2012
.gitignore
@@ -0,0 +1,4 @@
+.settings/
+.classpath
+.project
+target/
pom.xml
@@ -12,7 +12,7 @@
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.build.resourceEncoding>UTF-8</project.build.resourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
- <hadoop.version>0.23.4.3.1210310949</hadoop.version>
+ <hadoop.version>0.23.4</hadoop.version>
</properties>
<build>
@@ -29,9 +29,9 @@
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
- <descriptorRefs>
- <descriptorRef>jar-with-dependencies</descriptorRef>
- </descriptorRefs>
+ <descriptors>
+ <descriptor>src/main/assembly/jar-for-hadoop.xml</descriptor>
+ </descriptors>
</configuration>
</plugin>
<plugin>
@@ -60,7 +60,7 @@
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
- <scope>compile</scope>
+ <scope>provided</scope>
</dependency>
<dependency>
<groupId>org.semanticweb.yars</groupId>
@@ -80,39 +80,21 @@
<artifactId>servlet-api</artifactId>
<groupId>javax.servlet</groupId>
</exclusion>
+ <exclusion>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-core</artifactId>
+ </exclusion>
</exclusions>
</dependency>
<dependency>
- <artifactId>servlet-api</artifactId>
- <groupId>javax.servlet</groupId>
- <version>2.5</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>it.unimi.dsi</groupId>
- <artifactId>fastutil</artifactId>
- <version>6.4.4</version>
- </dependency>
- <dependency>
<groupId>it.unimi.dsi</groupId>
<artifactId>dsiutils</artifactId>
- <version>2.0.6</version>
+ <version>2.0.9</version>
</dependency>
<dependency>
<groupId>it.unimi.dsi</groupId>
<artifactId>sux4j</artifactId>
- <version>3.0.4</version>
- </dependency>
- <dependency>
- <groupId>it.unimi.dsi</groupId>
- <artifactId>webgraph</artifactId>
- <version>3.0.7</version>
- </dependency>
- <dependency>
- <groupId>colt</groupId>
- <artifactId>colt</artifactId>
- <version>1.2.0</version>
- <scope>compile</scope>
+ <version>3.0.5</version>
</dependency>
<dependency>
<groupId>com.martiansoftware</groupId>
@@ -125,62 +107,6 @@
<version>1.2.16</version>
</dependency>
<dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>2.2</version>
- </dependency>
- <dependency>
- <groupId>commons-lang</groupId>
- <artifactId>commons-lang</artifactId>
- <version>2.6</version>
- </dependency>
- <dependency>
- <groupId>commons-codec</groupId>
- <artifactId>commons-codec</artifactId>
- <version>1.5</version>
- </dependency>
- <dependency>
- <groupId>commons-collections</groupId>
- <artifactId>commons-collections</artifactId>
- <version>3.2.1</version>
- </dependency>
- <dependency>
- <groupId>commons-configuration</groupId>
- <artifactId>commons-configuration</artifactId>
- <version>1.8</version>
- </dependency>
- <dependency>
- <groupId>commons-digester</groupId>
- <artifactId>commons-digester</artifactId>
- <version>2.1</version>
- </dependency>
- <dependency>
- <groupId>commons-httpclient</groupId>
- <artifactId>commons-httpclient</artifactId>
- <version>3.1</version>
- <scope>compile</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-core</artifactId>
- <version>1.1</version>
- </dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parsers</artifactId>
- <version>1.1</version>
- </dependency>
- <dependency>
- <groupId>javax.mail</groupId>
- <artifactId>mail</artifactId>
- <version>1.4.5</version>
- </dependency>
- <dependency>
- <groupId>com.google.guava</groupId>
- <artifactId>guava</artifactId>
- <version>11.0.2</version>
- </dependency>
- <dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.1</version>
@@ -194,6 +120,7 @@
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.10</version>
+ <scope>test</scope>
</dependency>
<dependency>
<groupId>org.jmock</groupId>
@@ -52,7 +52,7 @@ LOCAL_BUILD_DIR="${HOME}/tmp/index-${BUILD_NAME}"
QUEUE=${QUEUE:-default}
-PROJECT_JAR="../target/Glimmer-0.0.1-SNAPSHOT-jar-with-dependencies.jar"
+JAR_FOR_HADOOP="../target/Glimmer-0.0.1-SNAPSHOT-jar-for-hadoop.jar"
HADOOP_CACHE_FILES="../target/classes/blacklist.txt"
COMPRESSION_CODEC="org.apache.hadoop.io.compress.BZip2Codec"
@@ -63,8 +63,8 @@ HASH_EXTENSION=".smap"
#INDEX_FILE_EXTENSIONS="frequencies index offsets positions posnumbits properties stats termmap terms"
INDEX_FILE_EXTENSIONS="counts countsoffsets frequencies occurrencies pointers pointersoffsets positions positionsoffsets properties sumsmaxpos terms"
-if [ ! -f ${PROJECT_JAR} ] ; then
- echo "Projects jar file missing!! ${PROJECT_JAR}"
+if [ ! -f ${JAR_FOR_HADOOP} ] ; then
+ echo "Projects jar file missing!! ${JAR_FOR_HADOOP}"
exit 1
fi
@@ -161,7 +161,7 @@ function groupBySubject () {
HADOOP_FILES="-files ${PREP_FILTER_FILE}#FilterXml"
fi
- local CMD="${HADOOP_CMD} jar ${PROJECT_JAR} com.yahoo.glimmer.indexing.preprocessor.PrepTool \
+ local CMD="${HADOOP_CMD} jar ${JAR_FOR_HADOOP} com.yahoo.glimmer.indexing.preprocessor.PrepTool \
-Dio.compression.codecs=${COMPRESSION_CODECS} \
-Dmapreduce.map.speculative=true \
-Dmapred.child.java.opts=-Xmx800m \
@@ -198,7 +198,7 @@ function computeHashes () {
echo " If you get out of heap errors try setting hadoop's HADOOP_HEAPSIZE or HADOOP_CLIENT_OPTS=\"-Xmx3500m\""
echo
# Generate Hashes for subjects, predicates and objects and all
- CMD="$HADOOP_CMD jar ${PROJECT_JAR} com.yahoo.glimmer.util.ComputeHashTool \
+ CMD="$HADOOP_CMD jar ${JAR_FOR_HADOOP} com.yahoo.glimmer.util.ComputeHashTool \
-Dio.compression.codecs=${COMPRESSION_CODECS} \
-sui ${FILES}"
echo ${CMD}; ${CMD}
@@ -256,7 +256,7 @@ function generateIndex () {
fi
echo Generating index..
- local CMD="${HADOOP_CMD} jar ${PROJECT_JAR} com.yahoo.glimmer.indexing.generator.TripleIndexGenerator \
+ local CMD="${HADOOP_CMD} jar ${JAR_FOR_HADOOP} com.yahoo.glimmer.indexing.generator.TripleIndexGenerator \
-Dio.compression.codecs=${COMPRESSION_CODECS} \
-Dmapreduce.map.speculative=true \
-Dmapreduce.job.reduces=${SUBINDICES} \
@@ -352,7 +352,7 @@ function mergeSubIndexes() {
NO_COUNTS_OPTIONS="-cCOUNTS:NONE -cPOSITIONS:NONE"
fi
- CMD="java -Xmx2G -cp ${PROJECT_JAR} it.unimi.di.mg4j.tool.Merge ${NO_COUNTS_OPTIONS} ${INDEX_DIR}/${INDEX_NAME} ${SUB_INDEXES}"
+ CMD="java -Xmx2G -cp ${JAR_FOR_HADOOP} it.unimi.di.mg4j.tool.Merge ${NO_COUNTS_OPTIONS} ${INDEX_DIR}/${INDEX_NAME} ${SUB_INDEXES}"
echo ${CMD}
${CMD}
@@ -367,7 +367,7 @@ function mergeSubIndexes() {
rm ${PART_DIR}/${INDEX_NAME}.*
done
- CMD="java -cp ${PROJECT_JAR} it.unimi.dsi.util.ImmutableExternalPrefixMap ${INDEX_DIR}/${INDEX_NAME}.termmap -o ${INDEX_DIR}/${INDEX_NAME}.terms"
+ CMD="java -cp ${JAR_FOR_HADOOP} it.unimi.dsi.util.ImmutableExternalPrefixMap ${INDEX_DIR}/${INDEX_NAME}.termmap -o ${INDEX_DIR}/${INDEX_NAME}.terms"
echo ${CMD}
${CMD}
@@ -393,7 +393,7 @@ function generateDocSizes () {
DFS_SIZES_DIR="${DFS_BUILD_DIR}/${METHOD}.sizes"
REDUCE_TASKS=$(( 1 + ${NUMBER_OF_DOCS} / 10000000 ))
- CMD="${HADOOP_CMD} jar ${PROJECT_JAR} com.yahoo.glimmer.indexing.DocSizesGenerator \
+ CMD="${HADOOP_CMD} jar ${JAR_FOR_HADOOP} com.yahoo.glimmer.indexing.DocSizesGenerator \
-Dmapreduce.map.failures.maxpercent=1 \
-Dmapreduce.map.speculative=true \
-Dmapreduce.job.reduces=${REDUCE_TASKS} \
@@ -441,7 +441,7 @@ function buildCollection () {
# wasn't partitioned.
# Increasing the mapreduce.input.fileinputformat.split.minsize reduces the number of mappers.
# Probably best to keep the number of mappers low (5-20) at the expense of runtime.
- CMD="${HADOOP_CMD} jar ${PROJECT_JAR} com.yahoo.glimmer.indexing.BySubjectCollectionBuilder \
+ CMD="${HADOOP_CMD} jar ${JAR_FOR_HADOOP} com.yahoo.glimmer.indexing.BySubjectCollectionBuilder \
-Dmapreduce.map.maxattempts=2 \
-Dmapreduce.map.speculative=false \
-Dmapred.child.java.opts=-Xmx900m \
@@ -21,7 +21,7 @@ LOCAL_BUILD_DIR="${HOME}/tmp/index-${BUILD_NAME}"
INDEX_DIR="${LOCAL_BUILD_DIR}/${METHOD}"
-PROJECT_JAR="../target/Glimmer-0.0.1-SNAPSHOT-jar-with-dependencies.jar"
+JAR_FOR_HADOOP="../target/Glimmer-0.0.1-SNAPSHOT-jar-for-hadoop.jar"
RLWRAP=$(which rlwrap)
if [ -z ${RLWRAP} ] ; then
@@ -50,4 +50,4 @@ do
fi
done
echo $BASENAMES
-${RLWRAP} java -Xmx3500m -cp $PROJECT_JAR it.unimi.di.mg4j.query.Query -n -v -T ${LOCAL_BUILD_DIR}/all.txt $BASENAMES
+${RLWRAP} java -Xmx3500m -cp $JAR_FOR_HADOOP it.unimi.di.mg4j.query.Query -n -v -T ${LOCAL_BUILD_DIR}/all.txt $BASENAMES
src/main/assembly/jar-for-hadoop.xml
@@ -0,0 +1,37 @@
+<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
+ <!-- TODO: a jarjar format would be better -->
+ <id>jar-for-hadoop</id>
+ <formats>
+ <format>jar</format>
+ </formats>
+ <includeBaseDirectory>false</includeBaseDirectory>
+ <dependencySets>
+ <dependencySet>
+ <outputDirectory>/</outputDirectory>
+ <useProjectArtifact>false</useProjectArtifact>
+ <unpack>true</unpack>
+ <scope>runtime</scope>
+ <excludes>
+ <exclude>org.apache.hadoop:*</exclude>
+ <exclude>org.springframework:*</exclude>
+ <exclude>commons-httpclient:commons-httpclient</exclude>
+ <exclude>javax.validation:validation-api</exclude>
+ <exclude>velocity:velocity</exclude>
+ <exclude>oro:oro</exclude>
+ <exclude>tomcat:jasper-compiler</exclude>
+ <exclude>tomcat:jasper-runtime</exclude>
+ <exclude>org.codehaus.jackson:*</exclude>
+ <exclude>org.slf4j:*</exclude>
+ </excludes>
+ </dependencySet>
+ </dependencySets>
+ <fileSets>
+ <fileSet>
+ <directory>target/classes</directory>
+ <outputDirectory>/</outputDirectory>
+ <includes>
+ <include>**/*</include>
+ </includes>
+ </fileSet>
+ </fileSets>
+</assembly>
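
The custom descriptor replaces the stock jar-with-dependencies assembly: it unpacks runtime dependencies into a single jar while excluding everything the Hadoop runtime already ships (org.apache.hadoop:*, Jackson, slf4j, and so on), matching the hadoop-client dependency's move to provided scope in the pom. A quick sanity check is sketched below; it assumes the jar was built from the project root with mvn package, using the jar path the build scripts reference:

    // Sketch only: verifies that the assembled jar really excludes Hadoop
    // classes (the cluster provides them at runtime). The jar path assumes
    // the build was run from the project root with `mvn package`.
    import java.util.jar.JarFile;

    public class JarForHadoopCheck {
        public static void main(String[] args) throws Exception {
            try (JarFile jar = new JarFile("target/Glimmer-0.0.1-SNAPSHOT-jar-for-hadoop.jar")) {
                long hadoopEntries = jar.stream()
                        .filter(e -> e.getName().startsWith("org/apache/hadoop/"))
                        .count();
                // Expected to print 0, given the org.apache.hadoop:* exclusion above.
                System.out.println("Hadoop entries in jar: " + hadoopEntries);
            }
        }
    }
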
src/main/java/com/yahoo/glimmer/indexing/BySubjectCollectionBuilder.java
@@ -40,12 +40,14 @@
public class BySubjectCollectionBuilder extends Configured implements Tool {
static class BuilderOutputWriter extends RecordWriter<LongWritable, Text> {
private static final String COLLECTION_PREFIX = "collection-";
- private static int count;
+ private static final MutableString TAB_WORD = new MutableString("\t");
private BySubjectRecord bySubjectRecord = new BySubjectRecord();
private final MutableString word = new MutableString();
private final MutableString nonWord = new MutableString();
FastBufferedReader fbr = new FastBufferedReader();
+
+ private int count;
private final DocumentCollectionBuilder builder;
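
This hunk also moves count from static to per-instance (and introduces a reusable TAB_WORD separator, used below). A static field is shared by every BuilderOutputWriter loaded in the same JVM, so two writers in one task JVM would, presumably, interleave their numbering of collection-* output files; a per-instance field gives each writer its own counter. A minimal sketch of the distinction, in plain Java and not Glimmer code:

    // Sketch only: the static/instance distinction behind the `count` change.
    public class CounterSketch {
        static class Writer {
            private static int sharedCount; // one counter for all writers in the JVM
            private int ownCount;           // one counter per writer

            void write() {
                sharedCount++;
                ownCount++;
            }
        }

        public static void main(String[] args) {
            Writer a = new Writer();
            Writer b = new Writer();
            a.write();
            b.write();
            // sharedCount is now 2, but each writer's ownCount is 1.
            System.out.println(a.ownCount + " " + Writer.sharedCount); // prints "1 2"
        }
    }
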
@@ -114,6 +116,10 @@ public void write(LongWritable key, Text value) throws IOException, InterruptedException
builder.startDocument(bySubjectRecord.getSubject(), bySubjectRecord.getSubject());
builder.startTextField();
+ addField(Integer.toString(bySubjectRecord.getId()));
+ addField(Integer.toString(bySubjectRecord.getPreviousId()));
+ addField(bySubjectRecord.getSubject());
+
fbr.setReader(bySubjectRecord.getRelationsReader());
while (fbr.next(word, nonWord)) {
builder.add(word, nonWord);
@@ -123,6 +129,12 @@ public void write(LongWritable key, Text value) throws IOException, InterruptedException
builder.endTextField();
builder.endDocument();
}
+
+ private void addField(CharSequence value) throws IOException {
+ word.setLength(0);
+ word.append(value);
+ builder.add(word, TAB_WORD);
+ }
@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
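
With the added addField calls, each by-subject document's text field now begins with three tab-separated metadata fields, the record id, the previous record id, and the subject, ahead of the tokenized relations. A rough sketch of the resulting layout, with StringBuilder standing in for MG4J's DocumentCollectionBuilder and hypothetical field values:

    // Sketch only: mirrors BuilderOutputWriter.addField(), which emits the
    // field text followed by a tab separator via builder.add(word, TAB_WORD).
    public class BySubjectLayoutSketch {
        public static void main(String[] args) {
            StringBuilder doc = new StringBuilder();
            addField(doc, Integer.toString(42));         // bySubjectRecord.getId() (hypothetical)
            addField(doc, Integer.toString(41));         // bySubjectRecord.getPreviousId()
            addField(doc, "http://example.org/subject"); // bySubjectRecord.getSubject()
            doc.append("relation tokens follow here");
            // prints: 42<TAB>41<TAB>http://example.org/subject<TAB>relation tokens follow here
            System.out.println(doc);
        }

        private static void addField(StringBuilder doc, CharSequence value) {
            doc.append(value).append('\t');
        }
    }
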