Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvements to Pig support #48

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -28,6 +28,7 @@
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import org.apache.log4j.Logger;
import voldemort.VoldemortException;
import voldemort.client.protocol.admin.AdminClient;
import voldemort.client.protocol.admin.AdminClientConfig;
Expand All @@ -39,6 +40,8 @@

public class VoldemortInputFormat extends InputFormat<ByteArray, Versioned<byte[]>> {

private final Logger logger = Logger.getLogger(VoldemortInputFormat.class);

/**
* Create a new connection to admin client and give it to RecordReader.
* Called on the TaskTracker
Expand Down Expand Up @@ -77,15 +80,15 @@ public List<InputSplit> getSplits(JobContext context) throws IOException, Interr
throw new VoldemortException("Store '" + storeName + "' not found");
}

// Generate splits
// Generate one split per node.
// Should consider a config setting allowing one split per partition.
Iterator<Node> nodeIter = cluster.getNodes().iterator();
List<InputSplit> splits = new ArrayList<InputSplit>();
while(nodeIter.hasNext()) {
Node currentNode = nodeIter.next();
VoldemortInputSplit split = new VoldemortInputSplit(storeName, currentNode);
splits.add(split);
}

adminClient.stop();
return splits;
}
Expand Down
Expand Up @@ -40,12 +40,15 @@ public VoldemortInputSplit(String storeName, Node node) {
}

/**
* Is used to order the splits so that the largest get processed first, in
* an attempt to minimize the job runtime...Voldemort doesn't care!
* Pig will order the splits so that the largest get processed first. This has no
* consequence for Voldemort, but newer versions of Pig will also try to combine
* splits if their sizes are less than the config <i>pig.maxCombinedSplitSize</i>.
* This does not map well to Voldemort storage. To avoid combining altogether,
* we return Long.MAX_VALUE.
*/
@Override
public long getLength() throws IOException, InterruptedException {
return 0;
return Long.MAX_VALUE;
}

public String getStoreName() {
Expand Down
Expand Up @@ -25,6 +25,7 @@
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import org.apache.log4j.Logger;
import voldemort.client.protocol.admin.AdminClient;
import voldemort.client.protocol.admin.AdminClientConfig;
import voldemort.utils.ByteArray;
Expand All @@ -33,6 +34,8 @@

public class VoldemortRecordReader extends RecordReader<ByteArray, Versioned<byte[]>> {

private final Logger logger = Logger.getLogger(VoldemortRecordReader.class);

private AdminClient adminClient;
private Iterator<Pair<ByteArray, Versioned<byte[]>>> iter = null;
private Pair<ByteArray, Versioned<byte[]>> currentPair = null;
Expand Down Expand Up @@ -67,6 +70,7 @@ public void initialize(InputSplit split, TaskAttemptContext context) throws IOEx
partitionIds.addAll(adminClient.getAdminClientCluster()
.getNodeById(voldemortSplit.getNodeId())
.getPartitionIds());
logger.info("Initializing split for node " + voldemortSplit.getNodeId() + ", partitions " + partitionIds);
this.iter = adminClient.fetchEntries(voldemortSplit.getNodeId(),
voldemortSplit.getStoreName(),
partitionIds,
Expand Down
@@ -1,49 +1,47 @@
/*
* Copyright 2010 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package voldemort.hadoop.pig;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

import voldemort.hadoop.VoldemortHadoopConfig;
import voldemort.hadoop.VoldemortInputFormat;
import voldemort.utils.ByteArray;
import voldemort.versioning.Versioned;

@SuppressWarnings("unchecked")
public class VoldemortStore extends LoadFunc {
import java.io.IOException;

private RecordReader reader;
/**
* Superclass for Voldemort Pig Stores. The Tuple format is specified by subclasses.
*/
public abstract class AbstractVoldemortStore extends LoadFunc {
protected RecordReader reader;

@Override
public InputFormat getInputFormat() throws IOException {
VoldemortInputFormat inputFormat = new VoldemortInputFormat();
return inputFormat;
}

@Override
public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
this.reader = reader;
}

@Override
public void setLocation(String location, Job job) throws IOException {
if(!location.startsWith("tcp://"))
throw new IOException("The correct format is tcp://<url:port>/storeName");
String[] subParts = location.split("/+");
Configuration conf = job.getConfiguration();
VoldemortHadoopConfig.setVoldemortURL(conf, subParts[0] + "//" + subParts[1]);
VoldemortHadoopConfig.setVoldemortStoreName(conf, subParts[2]);
}

@Override
public Tuple getNext() throws IOException {
ByteArray key = null;
Expand All @@ -64,24 +62,8 @@ public Tuple getNext() throws IOException {
return null;
}

Tuple tuple = TupleFactory.getInstance().newTuple(2);
tuple.set(0, new DataByteArray(key.get()));
tuple.set(1, new String(value.getValue()));
return tuple;
}

@Override
public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
this.reader = reader;
return extractTuple(key, value);
}

@Override
public void setLocation(String location, Job job) throws IOException {
if(!location.startsWith("tcp://"))
throw new IOException("The correct format is tcp://<url:port>/storeName");
String[] subParts = location.split("/+");
Configuration conf = job.getConfiguration();
VoldemortHadoopConfig.setVoldemortURL(conf, subParts[0] + "//" + subParts[1]);
VoldemortHadoopConfig.setVoldemortStoreName(conf, subParts[2]);
}
protected abstract Tuple extractTuple(ByteArray key, Versioned<byte[]> value) throws ExecException;
}
@@ -0,0 +1,57 @@
/*
* Copyright 2010 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package voldemort.hadoop.pig;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import voldemort.hadoop.VoldemortHadoopConfig;
import voldemort.hadoop.VoldemortInputFormat;
import voldemort.utils.ByteArray;
import voldemort.versioning.Versioned;

import java.io.IOException;

/**
* Voldemort store which exposes values as DataByteArrays. Useful for loading
* binary-format data (e.g. protobufs, thrift).
*
* To use with Twitter's Elephant-Bird:
*
* <pre>
* dataset = LOAD 'tcp://localhost:6666/storename' USING BinaryVoldemortStore();
* DEFINE XProtoFormat x.x.x.pig.piggybank.XProtobufBytesToTuple();
* result = FOREACH dataset GENERATE $0 as key, XProtoFormat($1).fieldName as fieldName;
* </pre>
*/
public class BinaryVoldemortStore extends AbstractVoldemortStore {

    /**
     * Builds a two-field (key, value) tuple, exposing both the key and the
     * value as raw bytes wrapped in {@link DataByteArray}.
     */
    @Override
    protected Tuple extractTuple(ByteArray key, Versioned<byte[]> value) throws ExecException {
        Tuple result = TupleFactory.getInstance().newTuple(2);
        result.set(0, new DataByteArray(key.get()));
        result.set(1, new DataByteArray(value.getValue()));
        return result;
    }
}
@@ -0,0 +1,50 @@
/*
* Copyright 2010 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package voldemort.hadoop.pig;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

import voldemort.hadoop.VoldemortHadoopConfig;
import voldemort.hadoop.VoldemortInputFormat;
import voldemort.utils.ByteArray;
import voldemort.versioning.Versioned;

/**
* Voldemort store which exposes values as Strings.
*/
public class StringVoldemortStore extends AbstractVoldemortStore {

    /**
     * Builds a two-field (key, value) tuple: the raw key bytes wrapped in a
     * {@link DataByteArray} and the value decoded as a UTF-8 string.
     *
     * @param key the store key, exposed as raw bytes
     * @param value the versioned store value, decoded to a String
     * @throws ExecException if the tuple fields cannot be set
     */
    @Override
    protected Tuple extractTuple(ByteArray key, Versioned<byte[]> value) throws ExecException {
        Tuple tuple = TupleFactory.getInstance().newTuple(2);
        tuple.set(0, new DataByteArray(key.get()));
        // Decode with an explicit charset: new String(byte[]) uses the JVM's
        // platform-default charset, which makes results environment-dependent.
        tuple.set(1, new String(value.getValue(), StandardCharsets.UTF_8));
        return tuple;
    }

}
1 change: 1 addition & 0 deletions src/java/voldemort/server/ServiceType.java
Expand Up @@ -10,6 +10,7 @@ public enum ServiceType {
SOCKET("socket-service"),
ADMIN("admin-service"),
JMX("jmx-service"),
JMX_REMOTE("jmx-remote-service"),
SCHEDULER("scheduler-service"),
STORAGE("storage-service"),
VOLDEMORT("voldemort-server"),
Expand Down