Skip to content

Commit

Permalink
added index splitter
Browse files Browse the repository at this point in the history
  • Loading branch information
javasoze committed Jan 10, 2013
1 parent a2b16a8 commit 7ea0bea
Show file tree
Hide file tree
Showing 3 changed files with 237 additions and 0 deletions.
8 changes: 8 additions & 0 deletions sensei-tools/bin/split.sh
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
#
# Launches com.senseidb.tools.IndexSplitter.
#
# Usage: split.sh <src-idx-dir> <max partition id> <target-dir> <p1,p2,p3...>
#
# The classpath is built from the module's target/ and target/lib/ directories,
# so the module has to be packaged before this script is run.

# Absolute path of the directory that contains this script.
bin=`dirname "$0"`
bin=`cd "$bin"; pwd`

# Module root: one level above the script directory.
home=`cd "$bin/..";pwd`

# The classpath is anchored at $home so the script works from any working
# directory, and it is quoted so the '*' wildcards reach the JVM unexpanded
# (the java launcher expands them to the jar files in each directory).
# "$@" forwards every argument intact, including paths that contain spaces.
java -classpath "$home/target/lib/*:$home/target/*" com.senseidb.tools.IndexSplitter "$@"
133 changes: 133 additions & 0 deletions sensei-tools/pom.xml
@@ -0,0 +1,133 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

<!-- Build descriptor for the sensei-tools module: a jar artifact whose parent POM is sensei-parent. -->
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.senseidb</groupId>
<artifactId>sensei-parent</artifactId>
<version>1.5.1-SNAPSHOT</version>
<relativePath>../sensei-parent/pom.xml</relativePath>
</parent>

<artifactId>sensei-tools</artifactId>
<packaging>jar</packaging>
<name>sensei tools</name>
<description>sensei tools</description>

<!-- Set the compiler to java6 -->
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.1</version>
<configuration>
<source>1.6</source>
<target>1.6</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<!-- The maven-resources-plugin configuration below is disabled (commented out). -->
<!--
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<version>2.2</version>
<configuration>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
-->

<!-- Runs the source plugin's jar goal (execution id: attach-sources). -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>2.1.2</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Runs the javadoc plugin's jar goal (execution id: attach-javadocs). -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.8.1</version>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Copies the module's dependencies into target/lib during the package phase;
     bin/split.sh puts target/lib/* on its classpath. -->
<plugin>
<artifactId>maven-dependency-plugin</artifactId>
<version>2.4</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<outputDirectory>${project.build.directory}/lib</outputDirectory>
</configuration>
</execution>
</executions>
</plugin>

<!-- Runs the git-commit-id-plugin revision goal against the .git directory one level above
     this module, with git.properties generation enabled.
     NOTE(review): the configured file name points into src/main/resources rather than the
     build directory; confirm that writing a generated file into the source tree is intended. -->
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>1.9</version>
<executions>
<execution>
<goals>
<goal>revision</goal>
</goals>
</execution>
</executions>
<configuration>
<prefix>git</prefix>
<dateFormat>dd.MM.yyyy '@' HH:mm:ss z</dateFormat>
<verbose>true</verbose>
<dotGitDirectory>${project.basedir}/../.git</dotGitDirectory>
<generateGitPropertiesFile>true</generateGitPropertiesFile>
<generateGitPropertiesFilename>src/main/resources/git.properties</generateGitPropertiesFilename>
</configuration>
</plugin>


</plugins>
</build>


<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>

<!-- zoie-core is declared with its fastutil:fastutil dependency excluded, and
     it.unimi.dsi:fastutil 6.4.3 is declared directly instead.
     NOTE(review): presumably both coordinates provide the same library and the exclusion
     avoids having two copies on the classpath; confirm. -->
<dependencies>
<dependency>
<groupId>com.linkedin.zoie</groupId>
<artifactId>zoie-core</artifactId>
<version>3.2.1-SNAPSHOT</version>
<exclusions>
<exclusion>
<groupId>fastutil</groupId>
<artifactId>fastutil</artifactId>
</exclusion>
</exclusions>
</dependency>

<dependency>
<groupId>it.unimi.dsi</groupId>
<artifactId>fastutil</artifactId>
<version>6.4.3</version>
<type>jar</type>
</dependency>
</dependencies>
</project>
96 changes: 96 additions & 0 deletions sensei-tools/src/main/java/com/senseidb/tools/IndexSplitter.java
@@ -0,0 +1,96 @@
package com.senseidb.tools;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import proj.zoie.api.ZoieIndexReader;

/**
 * Command-line tool that splits one Zoie/Lucene index into per-shard indexes.
 *
 * <p>For every requested partition {@code k} the tool marks as deleted, on the source reader,
 * each live document whose {@code uid % maxShardId} differs from {@code k}; adds what is left
 * to a new index under {@code <targetDir>/shard<k>}; copies the source index's
 * {@code index.directory} file next to it; re-opens the new shard to verify its contents; and
 * finally undoes the deletions so the next partition starts from the full source again.
 *
 * <p>NOTE(review): the source reader is opened writable ({@code readOnly = false}). Confirm
 * that closing it after {@code undeleteAll()} cannot change deletions that already existed in
 * the source index before the tool ran.
 */
public class IndexSplitter {

  /** Name of the file copied from the source index directory into every shard directory. */
  private static final String INDEX_DIRECTORY_FILE = "index.directory";

  private final File srcIdx;
  private final File targetDir;
  private final int maxShardId;

  /**
   * @param srcIdx     directory of the index to split
   * @param maxShardId modulus applied to each document uid to obtain its shard; despite the
   *                   name it is used as a shard count, and must not be zero
   * @param targetDir  directory under which the {@code shard<k>} directories are created
   */
  public IndexSplitter(File srcIdx, int maxShardId, File targetDir) {
    this.srcIdx = srcIdx;
    this.targetDir = targetDir;
    this.maxShardId = maxShardId;
  }

  /**
   * Builds one shard index per entry of {@code targetPartitions}.
   *
   * <p>The source readers are always closed, and the temporary deletions used to filter a
   * shard are always undone before that happens, even when building a shard fails.
   *
   * @param targetPartitions shard ids to materialise
   * @throws Exception if reading the source index or writing a shard fails
   */
  public void splitTo(int[] targetPartitions) throws Exception {
    IndexReader reader = IndexReader.open(FSDirectory.open(srcIdx), false);
    try {
      ZoieIndexReader<?> zreader = ZoieIndexReader.open(reader);
      try {
        int maxdoc = reader.maxDoc();
        int numdoc = reader.numDocs();

        System.out.println("total doccount: " + numdoc);
        System.out.println("total numdel: " + reader.numDeletedDocs());

        for (int k : targetPartitions) {
          File idxDir = new File(targetDir, "shard" + k);
          try {
            deleteDocsOutsideShard(reader, zreader, maxdoc, k);
            writeShard(reader, idxDir);
            copyIndexDirectoryFile(idxDir);
            verifyShard(idxDir, k);
          } finally {
            // Undo the filtering deletions on every path, so that a failure never leaves
            // them pending on the writable source reader when it is closed below.
            zreader.undeleteAll();
          }
        }
      } finally {
        zreader.close();
      }
    } finally {
      reader.close();
    }
  }

  /** Marks every live document that does not map to {@code shardId} as deleted on {@code reader}. */
  private void deleteDocsOutsideShard(IndexReader reader, ZoieIndexReader<?> zreader, int maxdoc, int shardId)
      throws IOException {
    for (int docId = 0; docId < maxdoc; ++docId) {
      if (zreader.isDeleted(docId)) {
        continue;
      }
      // NOTE(review): a negative uid yields a negative remainder here; confirm uids are non-negative.
      long uid = zreader.getUID(docId);
      int shard = (int) (uid % maxShardId);
      if (shardId != shard) {
        reader.deleteDocument(docId);
      }
    }
  }

  /** Writes the documents still live in {@code reader} into a new index at {@code idxDir}. */
  private void writeShard(IndexReader reader, File idxDir) throws IOException {
    IndexWriterConfig writerConf =
        new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35));
    IndexWriter writer = new IndexWriter(FSDirectory.open(idxDir), writerConf);
    boolean written = false;
    try {
      writer.addIndexes(reader);
      writer.commit();
      written = true;
    } finally {
      if (written) {
        writer.close();
      } else {
        // Release the writer without committing a partially built shard.
        writer.rollback();
      }
    }
  }

  /**
   * Copies {@code index.directory} from the source index directory into {@code idxDir}.
   * Done in-process rather than by shelling out to {@code cp}, so paths containing spaces
   * work, failures surface as exceptions, and the copy has finished when this method returns.
   * A missing source file is reported and skipped.
   */
  private void copyIndexDirectoryFile(File idxDir) throws IOException {
    File from = new File(srcIdx, INDEX_DIRECTORY_FILE);
    if (!from.isFile()) {
      System.out.println("warning: " + from.getAbsolutePath() + " not found, not copied to "
          + idxDir.getAbsolutePath());
      return;
    }
    FileInputStream in = new FileInputStream(from);
    try {
      FileOutputStream out = new FileOutputStream(new File(idxDir, INDEX_DIRECTORY_FILE));
      try {
        byte[] buffer = new byte[8192];
        int read;
        while ((read = in.read(buffer)) != -1) {
          out.write(buffer, 0, read);
        }
      } finally {
        out.close();
      }
    } finally {
      in.close();
    }
  }

  /**
   * Re-opens the shard at {@code idxDir} and reports the first document, if any, whose uid
   * does not map to {@code shardId}. The readers opened here are closed before returning.
   */
  private void verifyShard(File idxDir, int shardId) throws IOException {
    IndexReader tmpReader = IndexReader.open(FSDirectory.open(idxDir));
    try {
      ZoieIndexReader<?> zTmpReader = ZoieIndexReader.open(tmpReader);
      try {
        System.out.println("verifying shard: " + shardId + ", numdocs: " + tmpReader.numDocs());
        int maxdoc = tmpReader.maxDoc();
        for (int docId = 0; docId < maxdoc; ++docId) {
          long uid = zTmpReader.getUID(docId);
          int shard = (int) (uid % maxShardId);
          if (shard != shardId) {
            System.out.println("error: " + uid + " did not belong to shard: " + shardId
                + ", instead it has shard: " + shard);
            break;
          }
        }
      } finally {
        zTmpReader.close();
      }
    } finally {
      tmpReader.close();
    }
  }

  /** Prints the expected command-line arguments. */
  static void usage() {
    System.out.println("Usage: <src-idx-dir> <max partition id> <target-dir> partitions, e.g. p1,p2,p3...");
  }

  /**
   * Entry point. Expects four arguments: source index directory, max partition id (a positive
   * integer), target directory, and a comma-separated list of partition ids. Prints the usage
   * line and exits with status 1 when the arguments are missing or malformed.
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 4) {
      usage();
      System.exit(1);
    }

    File srcDir = new File(args[0]);
    File targetDir = new File(args[2]);
    int maxPartitionId = 0;
    int[] partitions = null;
    try {
      maxPartitionId = Integer.parseInt(args[1].trim());
      if (maxPartitionId <= 0) {
        // Zero would fail with a division by zero half-way through the split.
        throw new NumberFormatException("max partition id must be positive: " + maxPartitionId);
      }
      String[] parts = args[3].split(",");
      partitions = new int[parts.length];
      for (int i = 0; i < partitions.length; ++i) {
        partitions[i] = Integer.parseInt(parts[i].trim());
      }
    } catch (NumberFormatException e) {
      usage();
      System.exit(1);
    }

    IndexSplitter splitter = new IndexSplitter(srcDir, maxPartitionId, targetDir);
    splitter.splitTo(partitions);
  }
}

0 comments on commit 7ea0bea

Please sign in to comment.