Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

build works now

minor fixes
added short cmd option names
  • Loading branch information...
commit f7eb4973e46626b2c7e6f7b72701867a42c87bb0 1 parent 7874886
@vadimzalunin authored
Showing with 979 additions and 537 deletions.
  1. +3 −2 .classpath
  2. +3 −3 .settings/org.eclipse.jdt.core.prefs
  3. +2 −2 build.number
  4. +5 −5 build/build.xml
  5. BIN  cramone.jar
  6. BIN  cramtools-1.0.jar
  7. BIN  lib/junit-4.5.jar
  8. BIN  lib/picard-1.79.jar
  9. BIN  lib/picard-tools-1.79.zip
  10. +14 −14 src/main/java/net/sf/cram/BLOCK_PROTO.java
  11. +8 −8 src/main/java/net/sf/cram/Bam2Cram.java
  12. +3 −3 src/main/java/net/sf/cram/Cram2Bam.java
  13. +41 −5 src/main/java/net/sf/cram/CramFileIterator.java
  14. +2 −2 src/main/java/net/sf/cram/CramNormalizer.java
  15. +8 −0 src/main/java/net/sf/cram/Preservation.java
  16. +3 −3 src/main/java/net/sf/cram/ReadWrite.java
  17. +9 −9 src/main/java/net/sf/cram/SCSTest.java
  18. +7 −8 src/main/java/net/sf/cram/ValidateCramFile.java
  19. +4 −4 src/main/java/net/sf/cram/encoding/EncodingFactory.java
  20. +8 −8 src/main/java/net/sf/cram/encoding/Reader.java
  21. +26 −26 src/main/java/net/sf/cram/io/ByteBufferUtils.java
  22. +1 −1  src/main/java/net/sf/cram/lossy/PreservationPolicy.java
  23. +2 −2 src/main/java/net/sf/cram/lossy/QualityScorePreservation.java
  24. +10 −10 src/main/java/net/sf/cram/stats/CompressionHeaderFactory.java
  25. +820 −0 src/main/java/net/sf/picard/sam/SamFileValidator.java
  26. +0 −422 src/main/java/net/sf/samtools/BinaryTagCodec.java
View
5 .classpath
@@ -4,7 +4,8 @@
<classpathentry kind="src" path="src/test/java"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7"/>
<classpathentry kind="lib" path="lib/jcommander-1.7.jar"/>
+ <classpathentry kind="lib" path="lib/sam-1.79.jar"/>
+ <classpathentry kind="lib" path="lib/picard-1.79.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
- <classpathentry combineaccessrules="false" kind="src" path="/picard"/>
- <classpathentry kind="output" path="target"/>
+ <classpathentry kind="output" path="bin"/>
</classpath>
View
6 .settings/org.eclipse.jdt.core.prefs
@@ -1,11 +1,11 @@
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
-org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
-org.eclipse.jdt.core.compiler.compliance=1.7
+org.eclipse.jdt.core.compiler.compliance=1.6
org.eclipse.jdt.core.compiler.debug.lineNumber=generate
org.eclipse.jdt.core.compiler.debug.localVariable=generate
org.eclipse.jdt.core.compiler.debug.sourceFile=generate
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
-org.eclipse.jdt.core.compiler.source=1.7
+org.eclipse.jdt.core.compiler.source=1.6
View
4 build.number
@@ -1,3 +1,3 @@
#Build Number for ANT. Do not edit!
-#Wed Aug 29 11:46:55 BST 2012
-build.number=0
+#Thu Nov 08 16:45:34 GMT 2012
+build.number=11
View
10 build/build.xml
@@ -10,7 +10,7 @@
<property name="target.dir" value="${basedir}/target" />
<property name="classes" value="${target.dir}/classes" />
<property name="crammerClasses" value="target/classes" />
- <property name="jar.file.name" value="cramone.jar" />
+ <property name="jar.file.name" value="cramtools-1.0.jar" />
<path id="classpath">
<fileset dir="${lib.home}" includes="*.jar" />
@@ -42,10 +42,10 @@
<format property="TODAY" pattern="yyyy-MM-dd HH:mm:ss" />
</tstamp>
- <jar destfile="${basedir}/cramtools.jar" filesetmanifest="skip" duplicate="preserve">
+ <jar destfile="${basedir}/${jar.file.name}" filesetmanifest="skip" duplicate="preserve">
<fileset dir="${classes}" />
<zipgroupfileset dir="lib" includes="*.jar" >
- <exclude name="**/SAMFileReader.class" />
+ <exclude name="**/SamFileValidator.class" />
</zipgroupfileset>
<manifest>
@@ -86,11 +86,11 @@
<target name="clean">
<delete dir="${target.dir}" includeEmptyDirs="true" />
- <delete file="cramtools.jar" />
+ <delete file="${jar.file.name}" />
</target>
<target name="compile" depends="setup">
- <javac debug="true" fork="true" destdir="${classes}" srcdir="${src}/main/java:${src}/test/java:../picard/src/java">
+ <javac debug="true" fork="true" destdir="${classes}" srcdir="${src}/main/java:${src}/test/java">
<include name="**/*.java" />
<classpath refid="classpath" />
</javac>
View
BIN  cramone.jar
Binary file not shown
View
BIN  cramtools-1.0.jar
Binary file not shown
View
BIN  lib/junit-4.5.jar
Binary file not shown
View
BIN  lib/picard-1.79.jar
Binary file not shown
View
BIN  lib/picard-tools-1.79.zip
Binary file not shown
View
28 src/main/java/net/sf/cram/BLOCK_PROTO.java
@@ -59,7 +59,7 @@
SAMFileHeader fileHeader) throws IllegalArgumentException,
IllegalAccessException, IOException {
long time1 = System.nanoTime();
- List<CramRecord> records = new ArrayList<>();
+ List<CramRecord> records = new ArrayList<CramRecord>();
for (Slice s : c.slices)
records.addAll(records(s, h, fileHeader));
@@ -78,7 +78,7 @@
SAMSequenceRecord sequence = fileHeader.getSequence(s.sequenceId);
String seqName = sequence.getSequenceName();
DataReaderFactory f = new DataReaderFactory();
- Map<Integer, InputStream> inputMap = new HashMap<>();
+ Map<Integer, InputStream> inputMap = new HashMap<Integer, InputStream>();
for (Integer exId : s.external.keySet()) {
inputMap.put(exId, new ByteArrayInputStream(
s.external.get(exId).content));
@@ -87,7 +87,7 @@
Reader reader = f.buildReader(new DefaultBitInputStream(
new ByteArrayInputStream(s.coreBlock.content)), inputMap, h);
- List<CramRecord> records = new ArrayList<>();
+ List<CramRecord> records = new ArrayList<CramRecord>();
for (int i = 0; i < s.nofRecords; i++) {
CramRecord r = new CramRecord();
r.setSequenceName(seqName);
@@ -107,7 +107,7 @@
}
static Container writeContainer(List<CramRecord> records,
- SAMFileHeader fileHeader) throws IllegalArgumentException,
+ SAMFileHeader fileHeader, boolean preserveReadNames) throws IllegalArgumentException,
IllegalAccessException, IOException {
// get stats, create compression header and slices
long time1 = System.nanoTime();
@@ -116,11 +116,11 @@ static Container writeContainer(List<CramRecord> records,
h.mappedQualityScoreIncluded = true;
h.unmappedQualityScoreIncluded = true;
- h.readNamesIncluded = true;
+ h.readNamesIncluded = preserveReadNames;
int recordsPerSlice = 10000;
- List<Slice> slices = new ArrayList<>();
+ List<Slice> slices = new ArrayList<Slice>();
Container c = new Container();
c.h = h;
@@ -159,7 +159,7 @@ private static Slice writeSlice(List<CramRecord> records,
CompressionHeader h, SAMFileHeader fileHeader)
throws IllegalArgumentException, IllegalAccessException,
IOException {
- Map<Integer, ExposedByteArrayOutputStream> map = new HashMap<>();
+ Map<Integer, ExposedByteArrayOutputStream> map = new HashMap<Integer, ExposedByteArrayOutputStream>();
for (int id : h.externalIds) {
map.put(id, new ExposedByteArrayOutputStream());
}
@@ -193,7 +193,7 @@ private static Slice writeSlice(List<CramRecord> records,
slice.coreBlock.contentType = BlockContentType.CORE;
bos.close();
- slice.external = new HashMap<>();
+ slice.external = new HashMap<Integer, Block>();
for (Integer i : map.keySet()) {
ExposedByteArrayOutputStream os = map.get(i);
@@ -215,7 +215,7 @@ private static void randomStressTest() throws IOException,
long baseCount = 0;
Random random = new Random();
- List<CramRecord> records = new ArrayList<>();
+ List<CramRecord> records = new ArrayList<CramRecord>();
for (int i = 0; i < 100000; i++) {
int len = random.nextInt(100) + 50;
byte[] bases = new byte[len];
@@ -310,7 +310,7 @@ private static void randomStressTest() throws IOException,
}
long time1 = System.nanoTime();
- Container c = writeContainer(records, samFileHeader);
+ Container c = writeContainer(records, samFileHeader, true);
long time2 = System.nanoTime();
System.out.println("Container written in " + (time2 - time1) / 1000000
+ " milli seconds");
@@ -345,7 +345,7 @@ private static void randomStressTest() throws IOException,
baos.size() * 8f / baseCount);
}
- public static void main(String[] args) throws IllegalArgumentException,
+ public static void main(String[] args, boolean preserveReadNames) throws IllegalArgumentException,
IllegalAccessException, IOException {
File bamFile = new File(
"c:/temp/HG00096.mapped.illumina.mosaik.GBR.exome.20110411.chr20.bam");
@@ -365,7 +365,7 @@ public static void main(String[] args) throws IllegalArgumentException,
}
int maxRecords = 100000;
- List<SAMRecord> samRecords = new ArrayList<>(maxRecords);
+ List<SAMRecord> samRecords = new ArrayList<SAMRecord>(maxRecords);
int alStart = Integer.MAX_VALUE;
int alEnd = 0;
@@ -400,7 +400,7 @@ public static void main(String[] args) throws IllegalArgumentException,
Sam2CramRecordFactory f = new Sam2CramRecordFactory(sequence.getBases());
f.captureUnmappedBases = true;
f.captureUnmappedScores = true;
- List<CramRecord> cramRecords = new ArrayList<>(maxRecords);
+ List<CramRecord> cramRecords = new ArrayList<CramRecord>(maxRecords);
int prevAlStart = samRecords.get(0).getAlignmentStart();
int index = 0;
QualityScorePreservation preservation = new QualityScorePreservation(
@@ -476,7 +476,7 @@ public static void main(String[] args) throws IllegalArgumentException,
System.out.println();
long time1 = System.nanoTime();
- Container c = writeContainer(cramRecords, samFileReader.getFileHeader());
+ Container c = writeContainer(cramRecords, samFileReader.getFileHeader(), preserveReadNames);
long time2 = System.nanoTime();
System.out.println("Container written in " + (time2 - time1) / 1000000
+ " milli seconds");
View
16 src/main/java/net/sf/cram/Bam2Cram.java
@@ -50,7 +50,7 @@
Sam2CramRecordFactory f = new Sam2CramRecordFactory(ref);
f.captureUnmappedBases = true;
f.captureUnmappedScores = true;
- List<CramRecord> cramRecords = new ArrayList<>();
+ List<CramRecord> cramRecords = new ArrayList<CramRecord>() ;
int prevAlStart = samRecords.get(0).getAlignmentStart();
int index = 0;
for (SAMRecord samRecord : samRecords) {
@@ -188,7 +188,7 @@ public static void main(String[] args) throws IOException,
sequence = referenceSequenceFile.getSequence(seqName);
}
- List<SAMRecord> samRecords = new ArrayList<>(params.maxContainerSize);
+ List<SAMRecord> samRecords = new ArrayList<SAMRecord>(params.maxContainerSize);
QualityScorePreservation preservation = new QualityScorePreservation(
params.qsSpec);
@@ -214,7 +214,7 @@ public static void main(String[] args) throws IOException,
samFileReader.getFileHeader(), ref, preservation);
samRecords.clear();
Container container = BLOCK_PROTO.writeContainer(records,
- samFileReader.getFileHeader());
+ samFileReader.getFileHeader(), params.preserveReadNames);
records.clear();
ReadWrite.writeContainer(container, os);
@@ -244,7 +244,7 @@ public static void main(String[] args) throws IOException,
samFileReader.getFileHeader(), ref, preservation);
samRecords.clear();
Container container = BLOCK_PROTO.writeContainer(records,
- samFileReader.getFileHeader());
+ samFileReader.getFileHeader(), params.preserveReadNames);
records.clear();
ReadWrite.writeContainer(container, os);
for (Slice s : container.slices) {
@@ -276,13 +276,13 @@ public static void main(String[] args) throws IOException,
@Parameters(commandDescription = "BAM to CRAM converter. ")
static class Params {
- @Parameter(names = { "--input-bam-file" }, converter = FileConverter.class, description = "Path to a BAM file to be converted to CRAM. Omit if standard input (pipe).")
+ @Parameter(names = { "--input-bam-file", "-I" }, converter = FileConverter.class, description = "Path to a BAM file to be converted to CRAM. Omit if standard input (pipe).")
File bamFile;
- @Parameter(names = { "--reference-fasta-file" }, converter = FileConverter.class, description = "The reference fasta file, uncompressed and indexed (.fai file, use 'samtools faidx'). ")
+ @Parameter(names = { "--reference-fasta-file", "-R" }, converter = FileConverter.class, description = "The reference fasta file, uncompressed and indexed (.fai file, use 'samtools faidx'). ")
File referenceFasta;
- @Parameter(names = { "--output-cram-file" }, converter = FileConverter.class, description = "The path for the output CRAM file. Omit if standard output (pipe).")
+ @Parameter(names = { "--output-cram-file", "-O" }, converter = FileConverter.class, description = "The path for the output CRAM file. Omit if standard output (pipe).")
File outputCramFile = null;
@Parameter(names = { "--max-records" }, description = "Stop after compressing this many records. ")
@@ -315,7 +315,7 @@ public static void main(String[] args) throws IOException,
@Parameter(names = { "--preserve-read-names" }, description = "Preserve all read names.")
boolean preserveReadNames = false;
- @Parameter(names = { "--lossy-quality-score-spec" }, description = "A string specifying what quality scores should be preserved.")
+ @Parameter(names = { "--lossy-quality-score-spec", "-L" }, description = "A string specifying what quality scores should be preserved.")
String qsSpec = "";
}
}
View
6 src/main/java/net/sf/cram/Cram2Bam.java
@@ -144,13 +144,13 @@ public static void main(String[] args) throws IOException,
@Parameters(commandDescription = "CRAM to BAM conversion. ")
static class Params {
- @Parameter(names = { "--input-cram-file" }, converter = FileConverter.class, description = "The path to the CRAM file to uncompress. Omit if standard input (pipe).")
+ @Parameter(names = { "--input-cram-file", "-I" }, converter = FileConverter.class, description = "The path to the CRAM file to uncompress. Omit if standard input (pipe).")
File cramFile;
- @Parameter(names = { "--reference-fasta-file" }, converter = FileConverter.class, description = "Path to the reference fasta file, it must be uncompressed and indexed (use 'samtools faidx' for example).")
+ @Parameter(names = { "--reference-fasta-file", "-R" }, converter = FileConverter.class, description = "Path to the reference fasta file, it must be uncompressed and indexed (use 'samtools faidx' for example).")
File reference;
- @Parameter(names = { "--output-bam-file" }, converter = FileConverter.class, description = "The path to the output BAM file.")
+ @Parameter(names = { "--output-bam-file", "-O" }, converter = FileConverter.class, description = "The path to the output BAM file.")
File outputFile;
@Parameter(names = { "-h", "--help" }, description = "Print help and quit")
View
46 src/main/java/net/sf/cram/CramFileIterator.java
@@ -1,9 +1,13 @@
package net.sf.cram;
+import java.io.BufferedInputStream;
import java.io.EOFException;
+import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
+import java.util.Iterator;
import java.util.List;
import net.sf.cram.ReadWrite.CramHeader;
@@ -11,12 +15,14 @@
import net.sf.picard.reference.ReferenceSequence;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.util.Log;
+import net.sf.samtools.SAMFileHeader.SortOrder;
import net.sf.samtools.SAMRecord;
+import net.sf.samtools.SAMRecordIterator;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.util.CloseableIterator;
import net.sf.samtools.util.RuntimeEOFException;
-public class CramFileIterator implements CloseableIterator<SAMRecord> {
+public class CramFileIterator implements SAMRecordIterator {
private static Log log = Log.getInstance(CramFileIterator.class);
private InputStream is;
private CramHeader cramHeader;
@@ -30,9 +36,9 @@ public CramFileIterator(InputStream is,
this.is = is;
this.referenceSequenceFile = referenceSequenceFile;
cramHeader = ReadWrite.readCramHeader(is);
- records = new ArrayList<>(100000) ;
+ records = new ArrayList<SAMRecord>(100000);
}
-
+
public CramHeader getCramHeader() {
return cramHeader;
}
@@ -40,7 +46,7 @@ public CramHeader getCramHeader() {
private void nextContainer() throws IOException, IllegalArgumentException,
IllegalAccessException {
if (records == null)
- records = new ArrayList<>(100000);
+ records = new ArrayList<SAMRecord>(100000);
records.clear();
recordCounter = 0;
@@ -91,7 +97,8 @@ public boolean hasNext() {
if (recordCounter + 1 >= records.size()) {
try {
nextContainer();
- if (records.isEmpty()) return false ;
+ if (records.isEmpty())
+ return false;
} catch (Exception e) {
throw new RuntimeEOFException(e);
}
@@ -120,4 +127,33 @@ public void close() {
}
}
+ public static class CramFileIterable implements Iterable<SAMRecord> {
+ private ReferenceSequenceFile referenceSequenceFile;
+ private File cramFile;
+
+ public CramFileIterable(File cramFile, ReferenceSequenceFile referenceSequenceFile) {
+ this.referenceSequenceFile = referenceSequenceFile;
+ this.cramFile = cramFile;
+ }
+
+ @Override
+ public Iterator<SAMRecord> iterator() {
+ try {
+ FileInputStream fis = new FileInputStream(cramFile);
+ BufferedInputStream bis = new BufferedInputStream(fis);
+ CramFileIterator iterator = new CramFileIterator(bis,
+ referenceSequenceFile);
+ return iterator;
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ }
+
+ @Override
+ public SAMRecordIterator assertSorted(SortOrder sortOrder) {
+ throw new RuntimeException("Not implemented.") ;
+ }
+
}
View
4 src/main/java/net/sf/cram/CramNormalizer.java
@@ -23,7 +23,7 @@
private int alignmentStart = 1;
private byte defaultQualityScore = '?' - '!';
- private Map<Integer, CramRecord> pairingByIndexMap = new HashMap<>();
+ private Map<Integer, CramRecord> pairingByIndexMap = new HashMap<Integer, CramRecord>();
private byte[] ref;
public CramNormalizer(SAMFileHeader header, byte[] ref, int alignmentStart) {
@@ -98,7 +98,7 @@ public void normalize(List<CramRecord> records, boolean resetPairing) {
// assign some read names if needed:
for (CramRecord r : records) {
if (r.getReadName() == null) {
- String name = readNamePrefix + readCounter;
+ String name = readNamePrefix + r.index;
r.setReadName(name);
if (r.next != null)
r.next.setReadName(name);
View
8 src/main/java/net/sf/cram/Preservation.java
@@ -0,0 +1,8 @@
+package net.sf.cram;
+
+public class Preservation {
+ public boolean mappedQualityScoreIncluded;
+ public boolean unmappedQualityScoreIncluded;
+ public boolean unmappedPlacedQualityScoreIncluded;
+ public boolean readNamesIncluded;
+}
View
6 src/main/java/net/sf/cram/ReadWrite.java
@@ -407,7 +407,7 @@ else if ("RN".equals(key))
{ // tag encoding map:
int byteSize = ByteBufferUtils.readUnsignedITF8(buf);
int mapSize = ByteBufferUtils.readUnsignedITF8(buf);
- h.tMap = new TreeMap<>();
+ h.tMap = new TreeMap<String, EncodingParams>();
for (int i = 0; i < mapSize; i++) {
String key = new String(new byte[] { buf.get(), buf.get(),
buf.get() });
@@ -485,14 +485,14 @@ public static Container readContainer(SAMFileHeader samFileHeader,
// System.err.println(c.toString());
- LinkedList<Block> blocks = new LinkedList<>();
+ LinkedList<Block> blocks = new LinkedList<Block>();
for (int i = 0; i < c.blockCount; i++) {
blocks.add(readBlock(is));
}
c.h = readCompressionHeader(blocks.removeFirst());
- List<Slice> slices = new ArrayList<>();
+ List<Slice> slices = new ArrayList<Slice>();
while (!blocks.isEmpty()) {
slices.add(readMappedSlice(blocks));
}
View
18 src/main/java/net/sf/cram/SCSTest.java
@@ -43,8 +43,8 @@ public static void main(String[] args) throws IOException,
}
int maxRecords = 100000;
- List<SAMRecord> samRecords = new ArrayList<>(maxRecords);
- Map<String, List<SAMRecord>> origSamMap = new TreeMap<>();
+ List<SAMRecord> samRecords = new ArrayList<SAMRecord>(maxRecords);
+ Map<String, List<SAMRecord>> origSamMap = new TreeMap<String, List<SAMRecord>>();
int alStart = Integer.MAX_VALUE;
int alEnd = 0;
@@ -60,7 +60,7 @@ public static void main(String[] args) throws IOException,
samRecords.add(samRecord);
List<SAMRecord> list = origSamMap.get(samRecord.getReadName());
if (list == null) {
- list = new ArrayList<>(2);
+ list = new ArrayList<SAMRecord>(2);
origSamMap.put(samRecord.getReadName(), list);
}
list.add(samRecord);
@@ -83,7 +83,7 @@ public static void main(String[] args) throws IOException,
Sam2CramRecordFactory f = new Sam2CramRecordFactory(sequence.getBases());
f.captureUnmappedBases = true;
f.captureUnmappedScores = true;
- List<CramRecord> cramRecords = new ArrayList<>(maxRecords);
+ List<CramRecord> cramRecords = new ArrayList<CramRecord>(maxRecords);
int prevAlStart = samRecords.get(0).getAlignmentStart();
int index = 0;
QualityScorePreservation preservation = new QualityScorePreservation(
@@ -165,14 +165,14 @@ public static void main(String[] args) throws IOException,
}
List<CramRecord> old = cramRecords ;
- Container c = BLOCK_PROTO.writeContainer(cramRecords, samFileReader.getFileHeader()) ;
+ Container c = BLOCK_PROTO.writeContainer(cramRecords, samFileReader.getFileHeader(), true) ;
System.err.println("Written " + Writer.detachedCount + " detached records.");
try {
cramRecords = BLOCK_PROTO.records(c.h, c, samFileReader.getFileHeader()) ;
} catch (Exception e1) {
System.err.println("Read " + Reader.detachedCount + " detached records.");
- throw e1 ;
+ throw new RuntimeException(e1) ;
}
for (int i=0; i<cramRecords.size(); i++) {
@@ -199,15 +199,15 @@ public static void main(String[] args) throws IOException,
Cram2BamRecordFactory c2sFactory = new Cram2BamRecordFactory(
samFileReader.getFileHeader());
- List<SAMRecord> newSAMRecords = new ArrayList<>();
- Map<String, List<SAMRecord>> newSamMap = new TreeMap<>();
+ List<SAMRecord> newSAMRecords = new ArrayList<SAMRecord>();
+ Map<String, List<SAMRecord>> newSamMap = new TreeMap<String, List<SAMRecord>>();
for (CramRecord r : cramRecords) {
SAMRecord s = c2sFactory.create(r);
newSAMRecords.add(s);
List<SAMRecord> list = newSamMap.get(s.getReadName());
if (list == null) {
- list = new ArrayList<>(2);
+ list = new ArrayList<SAMRecord>(2);
newSamMap.put(s.getReadName(), list);
}
list.add(s);
View
15 src/main/java/net/sf/cram/ValidateCramFile.java
@@ -15,7 +15,6 @@
import net.sf.picard.util.Log;
import net.sf.picard.util.Log.LogLevel;
import net.sf.picard.util.ProgressLogger;
-import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMValidationError;
import net.sf.samtools.SAMValidationError.Type;
@@ -80,29 +79,29 @@ public static void main(String[] args) throws IOException,
referenceSequenceFile);
CramHeader cramHeader = iterator.getCramHeader();
+ iterator.close();
ProgressLogger progress = new ProgressLogger(log, 100000,
"Validated Read");
SamFileValidator v = new SamFileValidator(new PrintWriter(System.out),
1);
- List<SAMValidationError.Type> errors = new ArrayList<>();
+ List<SAMValidationError.Type> errors = new ArrayList<SAMValidationError.Type>();
errors.add(Type.MATE_NOT_FOUND);
errors.add(Type.MISSING_TAG_NM);
v.setErrorsToIgnore(errors);
v.init(referenceSequenceFile, cramHeader.samFileHeader);
- while (iterator.hasNext()) {
- SAMRecord s = iterator.next();
- v.validateRecord(progress, s, cramHeader.samFileHeader);
- }
+ v.validateSamRecords(new CramFileIterator.CramFileIterable(
+ params.cramFile, referenceSequenceFile),
+ cramHeader.samFileHeader);
log.info("Elapsed seconds: " + progress.getElapsedSeconds());
}
@Parameters(commandDescription = "CRAM to BAM conversion. ")
static class Params {
- @Parameter(names = { "--input-cram-file" }, converter = FileConverter.class, description = "The path to the CRAM file to uncompress. Omit if standard input (pipe).")
+ @Parameter(names = { "--input-cram-file", "-I" }, converter = FileConverter.class, description = "The path to the CRAM file to uncompress. Omit if standard input (pipe).")
File cramFile;
- @Parameter(names = { "--reference-fasta-file" }, converter = FileConverter.class, description = "Path to the reference fasta file, it must be uncompressed and indexed (use 'samtools faidx' for example).")
+ @Parameter(names = { "--reference-fasta-file", "-R" }, converter = FileConverter.class, description = "Path to the reference fasta file, it must be uncompressed and indexed (use 'samtools faidx' for example).")
File reference;
@Parameter(names = { "-h", "--help" }, description = "Print help and quit")
View
8 src/main/java/net/sf/cram/encoding/EncodingFactory.java
@@ -15,7 +15,7 @@
case HUFFMAN:
return (Encoding<T>) new HuffmanByteEncoding();
case NULL:
- return new NullEncoding<>();
+ return new NullEncoding<T>();
default:
break;
@@ -28,7 +28,7 @@
case HUFFMAN:
return (Encoding<T>) new HuffmanIntegerEncoding();
case NULL:
- return new NullEncoding<>();
+ return new NullEncoding<T>();
case EXTERNAL:
return (Encoding<T>) new ExternalIntegerEncoding();
case GOLOMB:
@@ -50,7 +50,7 @@
case LONG:
switch (id) {
case NULL:
- return new NullEncoding<>();
+ return new NullEncoding<T>();
case GOLOMB:
return (Encoding<T>) new GolombLongEncoding();
case EXTERNAL:
@@ -64,7 +64,7 @@
case BYTE_ARRAY:
switch (id) {
case NULL:
- return new NullEncoding<>();
+ return new NullEncoding<T>();
case BYTE_ARRAY_LEN:
return (Encoding<T>) new ByteArrayLenEncoding();
case EXTERNAL:
View
16 src/main/java/net/sf/cram/encoding/Reader.java
@@ -130,7 +130,7 @@ public void read(CramRecord r) throws IOException {
// tag records:
if (r.tags != null) {
int tagCount = tagCountC.readData();
- r.tags = new ArrayList<>();
+ r.tags = new ArrayList<ReadTag>();
for (int i = 0; i < tagCount; i++) {
byte[] name = tagNameAndTypeC.readData();
String tagId = new String(new byte[] { name[0], name[1], ':',
@@ -146,7 +146,7 @@ public void read(CramRecord r) throws IOException {
if (!r.segmentUnmapped) {
// writing read features:
- java.util.List<ReadFeature> rf = new ArrayList<>();
+ java.util.List<ReadFeature> rf = new ArrayList<ReadFeature>();
r.setReadFeatures(rf);
int size = nfc.readData();
int prevPos = 0;
@@ -220,12 +220,12 @@ public void read(CramRecord r) throws IOException {
}
}
- int mark = testC.readData() ;
- if (Writer.TEST_MARK != mark) {
- System.err.println("Record counter=" + recordCount);
- System.err.println(r.toString());
- throw new RuntimeException("Test mark not found.") ;
- }
+// int mark = testC.readData() ;
+// if (Writer.TEST_MARK != mark) {
+// System.err.println("Record counter=" + recordCount);
+// System.err.println(r.toString());
+// throw new RuntimeException("Test mark not found.") ;
+// }
recordCount++ ;
}
View
52 src/main/java/net/sf/cram/io/ByteBufferUtils.java
@@ -12,24 +12,24 @@ public static final int readUnsignedITF8(InputStream is) throws IOException {
int b1 = is.read();
if (b1 == -1) throw new EOFException() ;
- if ((b1 & 0b10000000) == 0)
+ if ((b1 & 128) == 0)
return b1;
- if ((b1 & 0b01000000) == 0)
- return ((b1 & 0b01111111) << 8) | is.read();
+ if ((b1 & 64) == 0)
+ return ((b1 & 127) << 8) | is.read();
- if ((b1 & 0b00100000) == 0) {
+ if ((b1 & 32) == 0) {
int b2 = is.read();
int b3 = is.read();
- return ((b1 & 0b00111111) << 16) | b2 << 8 | b3;
+ return ((b1 & 63) << 16) | b2 << 8 | b3;
}
- if ((b1 & 0b00010000) == 0)
- return ((b1 & 0b00011111) << 24) | is.read() << 16 | is.read() << 8
+ if ((b1 & 16) == 0)
+ return ((b1 & 31) << 24) | is.read() << 16 | is.read() << 8
| is.read();
- return ((b1 & 0b00001111) << 28) | is.read() << 20 | is.read() << 12
- | is.read() << 4 | (0b00001111 & is.read());
+ return ((b1 & 15) << 28) | is.read() << 20 | is.read() << 12
+ | is.read() << 4 | (15 & is.read());
}
public static final int writeUnsignedITF8(int value, OutputStream os) throws IOException {
@@ -39,27 +39,27 @@ public static final int writeUnsignedITF8(int value, OutputStream os) throws IOE
}
if ((value >>> 14) == 0) {
- os.write( ((value >> 8) | 0b10000000));
+ os.write( ((value >> 8) | 128));
os.write( (value & 0xFF));
return 16;
}
if ((value >>> 21) == 0) {
- os.write( ((value >> 16) | 0b11000000));
+ os.write( ((value >> 16) | 192));
os.write( ((value >> 8) & 0xFF));
os.write( (value & 0xFF));
return 24;
}
if ((value >>> 28) == 0) {
- os.write( ((value >> 24) | 0b11100000));
+ os.write( ((value >> 24) | 224));
os.write( ((value >> 16) & 0xFF));
os.write( ((value >> 8) & 0xFF));
os.write( (value & 0xFF));
return 32 ;
}
- os.write( ((value >> 28) | 0b11110000));
+ os.write( ((value >> 28) | 240));
os.write( ((value >> 20) & 0xFF));
os.write( ((value >> 12) & 0xFF));
os.write( ((value >> 4) & 0xFF));
@@ -90,25 +90,25 @@ public static final int readUnsignedITF8(byte[] data) {
public static final int readUnsignedITF8(ByteBuffer buf) {
int b1 = 0xFF & buf.get();
- if ((b1 & 0b10000000) == 0)
+ if ((b1 & 128) == 0)
return b1;
- if ((b1 & 0b01000000) == 0)
- return ((b1 & 0b01111111) << 8) | (0xFF & buf.get());
+ if ((b1 & 64) == 0)
+ return ((b1 & 127) << 8) | (0xFF & buf.get());
- if ((b1 & 0b00100000) == 0) {
+ if ((b1 & 32) == 0) {
int b2 = 0xFF & buf.get();
int b3 = 0xFF & buf.get();
- return ((b1 & 0b00111111) << 16) | b2 << 8 | b3;
+ return ((b1 & 63) << 16) | b2 << 8 | b3;
}
- if ((b1 & 0b00010000) == 0)
- return ((b1 & 0b00011111) << 24) | (0xFF & buf.get()) << 16
+ if ((b1 & 16) == 0)
+ return ((b1 & 31) << 24) | (0xFF & buf.get()) << 16
| (0xFF & buf.get()) << 8 | (0xFF & buf.get());
- return ((b1 & 0b00001111) << 28) | (0xFF & buf.get()) << 20
+ return ((b1 & 15) << 28) | (0xFF & buf.get()) << 20
| (0xFF & buf.get()) << 12 | (0xFF & buf.get()) << 4
- | (0b00001111 & buf.get());
+ | (15 & buf.get());
}
public static final void writeUnsignedITF8(int value, ByteBuffer buf) {
@@ -118,27 +118,27 @@ public static final void writeUnsignedITF8(int value, ByteBuffer buf) {
}
if ((value >>> 14) == 0) {
- buf.put((byte) ((value >> 8) | 0b10000000));
+ buf.put((byte) ((value >> 8) | 128));
buf.put((byte) (value & 0xFF));
return;
}
if ((value >>> 21) == 0) {
- buf.put((byte) ((value >> 16) | 0b11000000));
+ buf.put((byte) ((value >> 16) | 192));
buf.put((byte) ((value >> 8) & 0xFF));
buf.put((byte) (value & 0xFF));
return;
}
if ((value >>> 28) == 0) {
- buf.put((byte) ((value >> 24) | 0b11100000));
+ buf.put((byte) ((value >> 24) | 224));
buf.put((byte) ((value >> 16) & 0xFF));
buf.put((byte) ((value >> 8) & 0xFF));
buf.put((byte) (value & 0xFF));
return;
}
- buf.put((byte) ((value >> 28) | 0b11110000));
+ buf.put((byte) ((value >> 28) | 240));
buf.put((byte) ((value >> 20) & 0xFF));
buf.put((byte) ((value >> 12) & 0xFF));
buf.put((byte) ((value >> 4) & 0xFF));
View
2  src/main/java/net/sf/cram/lossy/PreservationPolicy.java
@@ -5,7 +5,7 @@
public class PreservationPolicy {
public ReadCategory readCategory;
- public List<BaseCategory> baseCategories = new ArrayList<>();
+ public List<BaseCategory> baseCategories = new ArrayList<BaseCategory>();
public QualityScoreTreatment treatment;
View
4 src/main/java/net/sf/cram/lossy/QualityScorePreservation.java
@@ -21,7 +21,7 @@
public QualityScorePreservation(String specification) {
this.specification = specification;
- policyList = new ArrayList<>();
+ policyList = new ArrayList<PreservationPolicy>();
for (String s : specification.split("-")) {
if (s.length() == 0) continue ;
PreservationPolicy policy = parseSinglePolicy(s);
@@ -83,7 +83,7 @@ private static final QualityScoreTreatment readTreament(
private static final PreservationPolicy parseSinglePolicy(String spec) {
PreservationPolicy p = new PreservationPolicy();
- LinkedList<Character> list = new LinkedList<>();
+ LinkedList<Character> list = new LinkedList<Character>();
for (char b : spec.toCharArray())
list.add(b);
View
20 src/main/java/net/sf/cram/stats/CompressionHeaderFactory.java
@@ -39,7 +39,7 @@
public CompressionHeader build(List<CramRecord> records) {
CompressionHeader h = new CompressionHeader();
- h.externalIds = new ArrayList<>();
+ h.externalIds = new ArrayList<Integer>();
int exCounter = 0;
int baseID = exCounter++;
@@ -59,11 +59,11 @@ public CompressionHeader build(List<CramRecord> records) {
log.debug("Assigned external id to read names: " + readNameID);
log.debug("Assigned external id to mate info: " + mateInfoID);
- h.eMap = new TreeMap<>();
+ h.eMap = new TreeMap<EncodingKey, EncodingParams>();
for (EncodingKey key : EncodingKey.values())
h.eMap.put(key, NullEncoding.toParam());
- h.tMap = new TreeMap<>();
+ h.tMap = new TreeMap<String, EncodingParams>();
{ // bit flags encoding:
HuffmanParamsCalculator calculator = new HuffmanParamsCalculator();
@@ -280,8 +280,8 @@ public CompressionHeader build(List<CramRecord> records) {
}
{ // test mark
- h.eMap.put(EncodingKey.TM_TestMark,
- BetaIntegerEncoding.toParam(0, 32));
+// h.eMap.put(EncodingKey.TM_TestMark,
+// BetaIntegerEncoding.toParam(0, 32));
}
return h;
@@ -310,7 +310,7 @@ public int compareTo(BitCode o) {
}
public static class HuffmanParamsCalculator {
- private HashMap<Integer, MutableInt> countMap = new HashMap<>();
+ private HashMap<Integer, MutableInt> countMap = new HashMap<Integer, MutableInt>();
private int[] values = new int[] {};
private int[] bitLens = new int[] {};
@@ -380,8 +380,8 @@ public void calculate() {
tree = HuffmanCode.buildTree(freqs, Utils.autobox(values));
}
- List<Integer> valueList = new ArrayList<>();
- List<Integer> lens = new ArrayList<>();
+ List<Integer> valueList = new ArrayList<Integer>();
+ List<Integer> lens = new ArrayList<Integer>();
HuffmanCode.getValuesAndBitLengths(valueList, lens, tree);
// the following sorting is not really required, but whatever:
@@ -426,11 +426,11 @@ public long len() {
}
private static class IntegerEncodingCalculator {
- private List<EncodingLengthCalculator> calcs = new ArrayList<>();
+ private List<EncodingLengthCalculator> calcs = new ArrayList<EncodingLengthCalculator>();
private int max = 0;
private int count = 0;
private String name;
- private HashMap<Integer, MutableInt> dictionary = new HashMap<>();
+ private HashMap<Integer, MutableInt> dictionary = new HashMap<Integer, MutableInt>();
private int dictionaryThreshold = 100;
public IntegerEncodingCalculator(String name, int dictionaryThreshold) {
View
820 src/main/java/net/sf/picard/sam/SamFileValidator.java
@@ -0,0 +1,820 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2009 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+package net.sf.picard.sam;
+
+import net.sf.picard.PicardException;
+import net.sf.picard.metrics.MetricBase;
+import net.sf.picard.metrics.MetricsFile;
+import net.sf.picard.reference.ReferenceSequence;
+import net.sf.picard.reference.ReferenceSequenceFile;
+import net.sf.picard.reference.ReferenceSequenceFileWalker;
+import net.sf.picard.util.Histogram;
+import net.sf.picard.util.Log;
+import net.sf.picard.util.ProgressLogger;
+import net.sf.samtools.*;
+import net.sf.samtools.SAMFileReader.ValidationStringency;
+import net.sf.samtools.SAMValidationError.Type;
+import net.sf.samtools.util.*;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * Validates SAM files as follows:
+ * <ul>
+ * <li>checks sam file header for sequence dictionary</li>
+ * <li>checks sam file header for read groups</li>
+ * <li>for each sam record
+ * <ul>
+ * <li>reports error detected by SAMRecord.isValid()</li>
+ * <li>validates NM (nucleotide differences) exists and matches reality</li>
+ * <li>validates mate fields agree with data in the mate record</li>
+ * </ul>
+ * </li>
+ * </ul>
+ *
+ * @see SAMRecord#isValid()
+ * @author Doug Voet
+ */
+public class SamFileValidator {
+ private Histogram<Type> errorsByType = new Histogram<Type>();
+ private final PrintWriter out;
+ private PairEndInfoMap pairEndInfoByName;
+ private ReferenceSequenceFileWalker refFileWalker = null;
+ private boolean verbose = false;
+ private int maxVerboseOutput = 100;
+ private SAMSortOrderChecker orderChecker;
+ private Set<Type> errorsToIgnore = EnumSet.noneOf(Type.class);
+ private boolean ignoreWarnings = false;
+ private boolean bisulfiteSequenced = false;
+ private boolean validateIndex = false;
+ private boolean sequenceDictionaryEmptyAndNoWarningEmitted = false;
+ private final int maxTempFiles;
+
+ private final static Log log = Log.getInstance(SamFileValidator.class);
+
+ public SamFileValidator(final PrintWriter out, final int maxTempFiles) {
+ this.out = out;
+ this.maxTempFiles = maxTempFiles;
+ }
+
+ /**
+ * Sets the error types that should not be reported on, replacing any
+ * previously configured set.
+ *
+ * @param types
+ * error types to suppress; an empty collection clears the ignore
+ * set so that every error type is reported again
+ */
+ public void setErrorsToIgnore(final Collection<Type> types) {
+ // EnumSet.copyOf rejects an empty collection that is not itself an
+ // EnumSet, so the empty case is handled explicitly rather than skipped
+ // (skipping it left a previously configured ignore set in place).
+ if (types.isEmpty()) {
+ this.errorsToIgnore = EnumSet.noneOf(Type.class);
+ } else {
+ this.errorsToIgnore = EnumSet.copyOf(types);
+ }
+ }
+
+ public void setIgnoreWarnings(final boolean ignoreWarnings) {
+ this.ignoreWarnings = ignoreWarnings;
+ }
+
+ /**
+ * Outputs validation summary report to out.
+ * <p>
+ * Initialises the validator from the reader's header, validates the file,
+ * and, when any errors were counted, writes a histogram of counts keyed by
+ * each error type's histogram string to out through a MetricsFile. The
+ * validator's per-run state is released via cleanup() before returning.
+ * <p>
+ * NOTE(review): the bin and value labels are set on errorsByType, while the
+ * histogram handed to the metrics file is errorsAndWarningsByType, which
+ * already receives the same labels through its constructor -- confirm the
+ * two setter calls are intentional.
+ *
+ * @param samReader
+ * records to validate
+ * @param reference
+ * if null, NM tag validation is skipped
+ * @return boolean true if there are no validation errors, otherwise false
+ */
+ public boolean validateSamFileSummary(final SAMFileReader samReader,
+ final ReferenceSequenceFile reference) {
+ init(reference, samReader.getFileHeader());
+
+ validateSamFile(samReader, out);
+
+ boolean result = errorsByType.isEmpty();
+
+ if (errorsByType.getCount() > 0) {
+ // Convert to a histogram with String IDs so that WARNING: or ERROR:
+ // can be prepended to the error type.
+ final Histogram<String> errorsAndWarningsByType = new Histogram<String>(
+ "Error Type", "Count");
+ for (final Histogram<SAMValidationError.Type>.Bin bin : errorsByType
+ .values()) {
+ errorsAndWarningsByType.increment(bin.getId()
+ .getHistogramString(), bin.getValue());
+ }
+ final MetricsFile<ValidationMetrics, String> metricsFile = new MetricsFile<ValidationMetrics, String>();
+ errorsByType.setBinLabel("Error Type");
+ errorsByType.setValueLabel("Count");
+ metricsFile.setHistogram(errorsAndWarningsByType);
+ metricsFile.write(out);
+ }
+ cleanup();
+ return result;
+ }
+
+ /**
+ * Outputs validation error details to out.
+ * <p>
+ * If a MaxOutputExceededException is raised while validating, a message
+ * quoting maxVerboseOutput is printed to out and validation stops early.
+ * The validator's per-run state is released via cleanup() before
+ * returning.
+ *
+ * @param samReader
+ * records to validate
+ * @param reference
+ * if null, NM tag validation is skipped
+ * @return boolean true if there are no validation errors, otherwise false
+ */
+ public boolean validateSamFileVerbose(final SAMFileReader samReader,
+ final ReferenceSequenceFile reference) {
+ init(reference, samReader.getFileHeader());
+
+ try {
+ validateSamFile(samReader, out);
+ } catch (MaxOutputExceededException e) {
+ out.println("Maximum output of [" + maxVerboseOutput
+ + "] errors reached.");
+ }
+ boolean result = errorsByType.isEmpty();
+ cleanup();
+ return result;
+ }
+
+ public void validateBamFileTermination(final File inputFile) {
+ BufferedInputStream inputStream = null;
+ try {
+ inputStream = IOUtil
+ .toBufferedStream(new FileInputStream(inputFile));
+ if (!BlockCompressedInputStream.isValidFile(inputStream)) {
+ return;
+ }
+ final BlockCompressedInputStream.FileTermination terminationState = BlockCompressedInputStream
+ .checkTermination(inputFile);
+ if (terminationState
+ .equals(BlockCompressedInputStream.FileTermination.DEFECTIVE)) {
+ addError(new SAMValidationError(Type.TRUNCATED_FILE,
+ "BAM file has defective last gzip block",
+ inputFile.getPath()));
+ } else if (terminationState
+ .equals(BlockCompressedInputStream.FileTermination.HAS_HEALTHY_LAST_BLOCK)) {
+ addError(new SAMValidationError(
+ Type.BAM_FILE_MISSING_TERMINATOR_BLOCK,
+ "Older BAM file -- does not have terminator block",
+ inputFile.getPath()));
+
+ }
+ } catch (IOException e) {
+ throw new PicardException("IOException", e);
+ } finally {
+ if (inputStream != null) {
+ CloserUtil.close(inputStream);
+ }
+ }
+ }
+
+ private void validateSamFile(final SAMFileReader samReader,
+ final PrintWriter out) {
+ try {
+ samReader.setValidationStringency(ValidationStringency.SILENT);
+ validateHeader(samReader.getFileHeader());
+ orderChecker = new SAMSortOrderChecker(samReader.getFileHeader()
+ .getSortOrder());
+ validateSamRecords(samReader, samReader.getFileHeader());
+ validateUnmatchedPairs();
+ if (validateIndex) {
+ try {
+ BamIndexValidator.exhaustivelyTestIndex(samReader);
+ } catch (Exception e) {
+ addError(new SAMValidationError(
+ Type.INVALID_INDEX_FILE_POINTER, e.getMessage(),
+ null));
+ }
+ }
+
+ if (errorsByType.isEmpty()) {
+ out.println("No errors found");
+ }
+ } finally {
+ out.flush();
+ }
+ }
+
+ /**
+ * Report on reads marked as paired, for which the mate was not found.
+ */
+ private void validateUnmatchedPairs() {
+ final InMemoryPairEndInfoMap inMemoryPairMap;
+ if (pairEndInfoByName instanceof CoordinateSortedPairEndInfoMap) {
+ // For the coordinate-sorted map, need to detect mate pairs in which
+ // the mateReferenceIndex on one end
+ // does not match the readReference index on the other end, so the
+ // pairs weren't united and validated.
+ inMemoryPairMap = new InMemoryPairEndInfoMap();
+ CloseableIterator<Map.Entry<String, PairEndInfo>> it = ((CoordinateSortedPairEndInfoMap) pairEndInfoByName)
+ .iterator();
+ while (it.hasNext()) {
+ Map.Entry<String, PairEndInfo> entry = it.next();
+ PairEndInfo pei = inMemoryPairMap.remove(
+ entry.getValue().readReferenceIndex, entry.getKey());
+ if (pei != null) {
+ // Found a mismatch btw read.mateReferenceIndex and
+ // mate.readReferenceIndex
+ List<SAMValidationError> errors = pei.validateMates(
+ entry.getValue(), entry.getKey());
+ for (final SAMValidationError error : errors) {
+ addError(error);
+ }
+ } else {
+ // Mate not found.
+ inMemoryPairMap.put(entry.getValue().mateReferenceIndex,
+ entry.getKey(), entry.getValue());
+ }
+ }
+ it.close();
+ } else {
+ inMemoryPairMap = (InMemoryPairEndInfoMap) pairEndInfoByName;
+ }
+ // At this point, everything in InMemoryMap is a read marked as a pair,
+ // for which a mate was not found.
+ for (final Map.Entry<String, PairEndInfo> entry : inMemoryPairMap) {
+ addError(new SAMValidationError(Type.MATE_NOT_FOUND,
+ "Mate not found for paired read", entry.getKey()));
+ }
+ }
+
+ /**
+ * Validates every record produced by samRecords, reporting each problem
+ * through addError.
+ * <p>
+ * Per record this runs SAMRecord.isValid(), then the mate-field, sort-order,
+ * read-group, CIGAR, NM-tag (only when the CIGAR is valid), secondary base
+ * call and tag checks. A missing sequence dictionary is reported once, on
+ * the first mapped record.
+ *
+ * @param samRecords
+ * records to validate; if the iterator it returns is a
+ * CloseableIterator, that iterator is closed when validation
+ * ends
+ * @param header
+ * header used to look up read groups
+ * @throws PicardException
+ * if a SAMFormatException is raised while iterating
+ */
+ public void validateSamRecords(final Iterable<SAMRecord> samRecords,
+ final SAMFileHeader header) {
+ // Hold the iterator by its general type: casting to SAMRecordIterator
+ // would fail for any other Iterable passed to this public method.
+ final Iterator<SAMRecord> iter = samRecords.iterator();
+ final ProgressLogger progress = new ProgressLogger(log, 10000000,
+ "Validated Read");
+ try {
+ while (iter.hasNext()) {
+ final SAMRecord record = iter.next();
+
+ final long recordNumber = progress.getCount() + 1;
+ final Collection<SAMValidationError> errors = record.isValid();
+ if (errors != null) {
+ for (final SAMValidationError error : errors) {
+ error.setRecordNumber(recordNumber);
+ addError(error);
+ }
+ }
+
+ validateMateFields(record, recordNumber);
+ validateSortOrder(record, recordNumber);
+ validateReadGroup(record, header);
+ final boolean cigarIsValid = validateCigar(record, recordNumber);
+ if (cigarIsValid) {
+ validateNmTag(record, recordNumber);
+ }
+ validateSecondaryBaseCalls(record, recordNumber);
+ validateTags(record, recordNumber);
+ if (sequenceDictionaryEmptyAndNoWarningEmitted
+ && !record.getReadUnmappedFlag()) {
+ addError(new SAMValidationError(
+ Type.MISSING_SEQUENCE_DICTIONARY,
+ "Sequence dictionary is empty", null));
+ sequenceDictionaryEmptyAndNoWarningEmitted = false;
+
+ }
+ progress.record(record);
+ }
+ } catch (SAMFormatException e) {
+ // increment record number because the iterator behind the
+ // SAMFileReader
+ // reads one record ahead so we will get this failure one record
+ // ahead
+ // The parentheses matter: without them "+ 1" is string
+ // concatenation and appends the character '1' to the count.
+ final String msg = "SAMFormatException on record "
+ + (progress.getCount() + 1);
+ out.println(msg);
+ throw new PicardException(msg, e);
+ } catch (FileTruncatedException e) {
+ addError(new SAMValidationError(Type.TRUNCATED_FILE,
+ "File is truncated", null));
+ } finally {
+ if (iter instanceof CloseableIterator) {
+ ((CloseableIterator<?>) iter).close();
+ }
+ }
+ }
+
+ private void validateReadGroup(final SAMRecord record,
+ final SAMFileHeader header) {
+ SAMReadGroupRecord rg = record.getReadGroup();
+ if (rg == null) {
+ addError(new SAMValidationError(Type.MISSING_READ_GROUP,
+ "A record is missing a read group", record.getReadName()));
+ } else if (!header.getReadGroups().contains(rg)) {
+ addError(new SAMValidationError(Type.READ_GROUP_NOT_FOUND,
+ "A record has a read group not found in the header: ",
+ record.getReadName() + ", " + rg.getReadGroupId()));
+ }
+ }
+
+ /**
+ * Report error if a tag value is a Long.
+ */
+ private void validateTags(final SAMRecord record, final long recordNumber) {
+ for (final SAMRecord.SAMTagAndValue tagAndValue : record
+ .getAttributes()) {
+ if (tagAndValue.value instanceof Long) {
+ addError(new SAMValidationError(Type.TAG_VALUE_TOO_LARGE,
+ "Numeric value too large for tag " + tagAndValue.tag,
+ record.getReadName(), recordNumber));
+ }
+ }
+ }
+
+ private void validateSecondaryBaseCalls(final SAMRecord record,
+ final long recordNumber) {
+ final String e2 = (String) record.getAttribute(SAMTag.E2.name());
+ if (e2 != null) {
+ if (e2.length() != record.getReadLength()) {
+ addError(new SAMValidationError(
+ Type.MISMATCH_READ_LENGTH_AND_E2_LENGTH, String.format(
+ "E2 tag length (%d) != read length (%d)",
+ e2.length(), record.getReadLength()),
+ record.getReadName(), recordNumber));
+ }
+ final byte[] bases = record.getReadBases();
+ final byte[] secondaryBases = StringUtil.stringToBytes(e2);
+ for (int i = 0; i < Math.min(bases.length, secondaryBases.length); ++i) {
+ if (SequenceUtil.isNoCall(bases[i])
+ || SequenceUtil.isNoCall(secondaryBases[i])) {
+ continue;
+ }
+ if (SequenceUtil.basesEqual(bases[i], secondaryBases[i])) {
+ addError(new SAMValidationError(
+ Type.E2_BASE_EQUALS_PRIMARY_BASE,
+ String.format(
+ "Secondary base call (%c) == primary base call (%c)",
+ (char) secondaryBases[i], (char) bases[i]),
+ record.getReadName(), recordNumber));
+ break;
+ }
+ }
+ }
+ final String u2 = (String) record.getAttribute(SAMTag.U2.name());
+ if (u2 != null && u2.length() != record.getReadLength()) {
+ addError(new SAMValidationError(
+ Type.MISMATCH_READ_LENGTH_AND_U2_LENGTH, String.format(
+ "U2 tag length (%d) != read length (%d)",
+ u2.length(), record.getReadLength()),
+ record.getReadName(), recordNumber));
+ }
+ }
+
+ private boolean validateCigar(final SAMRecord record,
+ final long recordNumber) {
+ if (record.getReadUnmappedFlag()) {
+ return true;
+ }
+ final ValidationStringency savedStringency = record
+ .getValidationStringency();
+ record.setValidationStringency(ValidationStringency.LENIENT);
+ final List<SAMValidationError> errors = record
+ .validateCigar(recordNumber);
+ record.setValidationStringency(savedStringency);
+ if (errors == null) {
+ return true;
+ }
+ boolean valid = true;
+ for (final SAMValidationError error : errors) {
+ addError(error);
+ valid = false;
+ }
+ return valid;
+ }
+
+ /**
+ * Reports RECORD_OUT_OF_ORDER when the order checker says the record does
+ * not follow the previous one in the header's declared sort order.
+ *
+ * @param record
+ * record to check against its predecessor
+ * @param recordNumber
+ * position of the record, attached to any reported error
+ */
+ private void validateSortOrder(final SAMRecord record,
+ final long recordNumber) {
+ // Fetch the previous record before calling isSorted(); presumably
+ // isSorted() advances the checker's "previous" record -- TODO confirm.
+ final SAMRecord prev = orderChecker.getPreviousRecord();
+ if (!orderChecker.isSorted(record)) {
+ addError(new SAMValidationError(
+ Type.RECORD_OUT_OF_ORDER,
+ String.format(
+ "The record is out of [%s] order, prior read name [%s], prior coordinates [%d:%d]",
+ record.getHeader().getSortOrder().name(),
+ prev.getReadName(), prev.getReferenceIndex(),
+ prev.getAlignmentStart()), record.getReadName(),
+ recordNumber));
+ }
+ }
+
+ public void init(final ReferenceSequenceFile reference,
+ final SAMFileHeader header) {
+ if (orderChecker == null)
+ orderChecker = new SAMSortOrderChecker(header.getSortOrder());
+ if (header.getSortOrder() == SAMFileHeader.SortOrder.coordinate) {
+ this.pairEndInfoByName = new CoordinateSortedPairEndInfoMap();
+ } else {
+ this.pairEndInfoByName = new InMemoryPairEndInfoMap();
+ }
+ if (reference != null) {
+ this.refFileWalker = new ReferenceSequenceFileWalker(reference);
+ }
+ }
+
+ private void cleanup() {
+ this.errorsByType = null;
+ this.pairEndInfoByName = null;
+ this.refFileWalker = null;
+ }
+
+ private void validateNmTag(final SAMRecord record, final long recordNumber) {
+ if (!record.getReadUnmappedFlag()) {
+ final Integer tagNucleotideDiffs = record
+ .getIntegerAttribute(ReservedTagConstants.NM);
+ if (tagNucleotideDiffs == null) {
+ addError(new SAMValidationError(Type.MISSING_TAG_NM,
+ "NM tag (nucleotide differences) is missing",
+ record.getReadName(), recordNumber));
+ } else if (refFileWalker != null) {
+ final ReferenceSequence refSequence = refFileWalker.get(record
+ .getReferenceIndex());
+ final int actualNucleotideDiffs = SequenceUtil
+ .calculateSamNmTag(record, refSequence.getBases(), 0,
+ isBisulfiteSequenced());
+
+ if (!tagNucleotideDiffs.equals(actualNucleotideDiffs)) {
+ addError(new SAMValidationError(Type.INVALID_TAG_NM,
+ "NM tag (nucleotide differences) in file ["
+ + tagNucleotideDiffs
+ + "] does not match reality ["
+ + actualNucleotideDiffs + "]",
+ record.getReadName(), recordNumber));
+ }
+ }
+ }
+ }
+
+ private void validateMateFields(final SAMRecord record,
+ final long recordNumber) {
+ if (!record.getReadPairedFlag() || record.getNotPrimaryAlignmentFlag()) {
+ return;
+ }
+
+ final PairEndInfo pairEndInfo = pairEndInfoByName.remove(
+ record.getReferenceIndex(), record.getReadName());
+ if (pairEndInfo == null) {
+ pairEndInfoByName
+ .put(record.getMateReferenceIndex(), record.getReadName(),
+ new PairEndInfo(record, recordNumber));
+ } else {
+ final List<SAMValidationError> errors = pairEndInfo
+ .validateMates(new PairEndInfo(record, recordNumber),
+ record.getReadName());
+ for (final SAMValidationError error : errors) {
+ addError(error);
+ }
+ }
+ }
+
+ private void validateHeader(final SAMFileHeader fileHeader) {
+ for (final SAMValidationError error : fileHeader.getValidationErrors()) {
+ addError(error);
+ }
+ if (fileHeader.getVersion() == null) {
+ addError(new SAMValidationError(Type.MISSING_VERSION_NUMBER,
+ "Header has no version number", null));
+ } else if (!SAMFileHeader.ACCEPTABLE_VERSIONS.contains(fileHeader
+ .getVersion())) {
+ addError(new SAMValidationError(
+ Type.INVALID_VERSION_NUMBER,
+ "Header version: "
+ + fileHeader.getVersion()
+ + " does not match any of the acceptable versions: "
+ + StringUtil.join(", ",
+ SAMFileHeader.ACCEPTABLE_VERSIONS
+ .toArray(new String[0])), null));
+ }
+ if (fileHeader.getSequenceDictionary().isEmpty()) {
+ sequenceDictionaryEmptyAndNoWarningEmitted = true;
+ }
+ if (fileHeader.getReadGroups().isEmpty()) {
+ addError(new SAMValidationError(Type.MISSING_READ_GROUP,
+ "Read groups is empty", null));
+ }
+ List<SAMProgramRecord> pgs = fileHeader.getProgramRecords();
+ for (int i = 0; i < pgs.size() - 1; i++) {
+ for (int j = i + 1; j < pgs.size(); j++) {
+ if (pgs.get(i).getProgramGroupId()
+ .equals(pgs.get(j).getProgramGroupId())) {
+ addError(new SAMValidationError(
+ Type.DUPLICATE_PROGRAM_GROUP_ID, "Duplicate "
+ + "program group id: "
+ + pgs.get(i).getProgramGroupId(), null));
+ }
+ }
+ }
+
+ List<SAMReadGroupRecord> rgs = fileHeader.getReadGroups();
+ for (int i = 0; i < rgs.size() - 1; i++) {
+ for (int j = i + 1; j < rgs.size(); j++) {
+ if (rgs.get(i).getReadGroupId()
+ .equals(rgs.get(j).getReadGroupId())) {
+ addError(new SAMValidationError(
+ Type.DUPLICATE_READ_GROUP_ID, "Duplicate "
+ + "read group id: "
+ + rgs.get(i).getReadGroupId(), null));
+ }
+ }
+ }
+
+ }
+
+ private void addError(final SAMValidationError error) {
+ // Just ignore an error if it's of a type we're not interested in
+ if (this.errorsToIgnore.contains(error.getType()))
+ return;
+
+ if (this.ignoreWarnings
+ && error.getType().severity == SAMValidationError.Severity.WARNING)
+ return;
+
+ this.errorsByType.increment(error.getType());
+ if (verbose) {
+ out.println(error);
+ out.flush();
+ if (this.errorsByType.getCount() >= maxVerboseOutput) {
+ throw new MaxOutputExceededException();
+ }
+ }
+ }
+
+ /**
+ * Control verbosity
+ *
+ * @param verbose
+ * True in order to emit a message per error or warning.
+ * @param maxVerboseOutput
+ * If verbose, emit no more than this many messages. Ignored if
+ * !verbose.
+ */
+ public void setVerbose(final boolean verbose, final int maxVerboseOutput) {
+ this.verbose = verbose;
+ this.maxVerboseOutput = maxVerboseOutput;
+ }
+
+ public boolean isBisulfiteSequenced() {
+ return bisulfiteSequenced;
+ }
+
+ public void setBisulfiteSequenced(boolean bisulfiteSequenced) {
+ this.bisulfiteSequenced = bisulfiteSequenced;
+ }
+
+ public SamFileValidator setValidateIndex(boolean validateIndex) {
+ // The SAMFileReader must also have IndexCaching enabled to have the
+ // index validated,
+ // samReader.enableIndexCaching(true);
+ this.validateIndex = validateIndex;
+ return this;
+ }
+
+ public static class ValidationMetrics extends MetricBase {
+ }
+
+ /**
+ * This class is used so we don't have to store the entire SAMRecord in
+ * memory while we wait to find a record's mate and also to store the record
+ * number.
+ */
+ private static class PairEndInfo {
+ private final int readAlignmentStart;
+ private final int readReferenceIndex;
+ private final boolean readNegStrandFlag;
+ private final boolean readUnmappedFlag;
+
+ private final int mateAlignmentStart;
+ private final int mateReferenceIndex;
+ private final boolean mateNegStrandFlag;
+ private final boolean mateUnmappedFlag;
+
+ private final boolean firstOfPairFlag;
+
+ private final long recordNumber;
+
+ public PairEndInfo(final SAMRecord record, final long recordNumber) {
+ this.recordNumber = recordNumber;
+
+ this.readAlignmentStart = record.getAlignmentStart();
+ this.readNegStrandFlag = record.getReadNegativeStrandFlag();
+ this.readReferenceIndex = record.getReferenceIndex();
+ this.readUnmappedFlag = record.getReadUnmappedFlag();
+
+ this.mateAlignmentStart = record.getMateAlignmentStart();
+ this.mateNegStrandFlag = record.getMateNegativeStrandFlag();
+ this.mateReferenceIndex = record.getMateReferenceIndex();
+ this.mateUnmappedFlag = record.getMateUnmappedFlag();
+
+ this.firstOfPairFlag = record.getFirstOfPairFlag();
+ }
+
+ private PairEndInfo(int readAlignmentStart, int readReferenceIndex,
+ boolean readNegStrandFlag, boolean readUnmappedFlag,
+ int mateAlignmentStart, int mateReferenceIndex,
+ boolean mateNegStrandFlag, boolean mateUnmappedFlag,
+ boolean firstOfPairFlag, long recordNumber) {
+ this.readAlignmentStart = readAlignmentStart;
+ this.readReferenceIndex = readReferenceIndex;
+ this.readNegStrandFlag = readNegStrandFlag;
+ this.readUnmappedFlag = readUnmappedFlag;
+ this.mateAlignmentStart = mateAlignmentStart;
+ this.mateReferenceIndex = mateReferenceIndex;
+ this.mateNegStrandFlag = mateNegStrandFlag;
+ this.mateUnmappedFlag = mateUnmappedFlag;
+ this.firstOfPairFlag = firstOfPairFlag;
+ this.recordNumber = recordNumber;
+ }
+
+ public List<SAMValidationError> validateMates(final PairEndInfo mate,
+ final String readName) {
+ final List<SAMValidationError> errors = new ArrayList<SAMValidationError>();
+ validateMateFields(this, mate, readName, errors);
+ validateMateFields(mate, this, readName, errors);
+ // Validations that should not be repeated on both ends
+ if (this.firstOfPairFlag == mate.firstOfPairFlag) {
+ final String whichEnd = this.firstOfPairFlag ? "first"
+ : "second";
+ errors.add(new SAMValidationError(Type.MATES_ARE_SAME_END,
+ "Both mates are marked as " + whichEnd + " of pair",
+ readName, this.recordNumber));
+ }
+ return errors;
+ }
+
+ private void validateMateFields(final PairEndInfo end1,
+ final PairEndInfo end2, final String readName,
+ final List<SAMValidationError> errors) {
+ if (end1.mateAlignmentStart != end2.readAlignmentStart) {
+ errors.add(new SAMValidationError(
+ Type.MISMATCH_MATE_ALIGNMENT_START,
+ "Mate alignment does not match alignment start of mate",
+ readName, end1.recordNumber));
+ }
+ if (end1.mateNegStrandFlag != end2.readNegStrandFlag) {
+ errors.add(new SAMValidationError(
+ Type.MISMATCH_FLAG_MATE_NEG_STRAND,
+ "Mate negative strand flag does not match read negative strand flag of mate",
+ readName, end1.recordNumber));
+ }
+ if (end1.mateReferenceIndex != end2.readReferenceIndex) {
+ errors.add(new SAMValidationError(
+ Type.MISMATCH_MATE_REF_INDEX,
+ "Mate reference index (MRNM) does not match reference index of mate",
+ readName, end1.recordNumber));
+ }
+ if (end1.mateUnmappedFlag != end2.readUnmappedFlag) {
+ errors.add(new SAMValidationError(
+ Type.MISMATCH_FLAG_MATE_UNMAPPED,
+ "Mate unmapped flag does not match read unmapped flag of mate",
+ readName, end1.recordNumber));
+ }
+ }
+ }
+
+ /**
+ * Thrown in addError indicating that maxVerboseOutput has been exceeded and
+ * processing should stop
+ */
+ private static class MaxOutputExceededException extends PicardException {
+ MaxOutputExceededException() {
+ super("maxVerboseOutput exceeded.");
+ }
+ }
+
+ interface PairEndInfoMap extends Iterable<Map.Entry<String, PairEndInfo>> {
+ void put(int mateReferenceIndex, String key, PairEndInfo value);
+
+ PairEndInfo remove(int mateReferenceIndex, String key);
+
+ CloseableIterator<Map.Entry<String, PairEndInfo>> iterator();
+ }
+
+ private class CoordinateSortedPairEndInfoMap implements PairEndInfoMap {
+ private final CoordinateSortedPairInfoMap<String, PairEndInfo> onDiskMap = new CoordinateSortedPairInfoMap<String, PairEndInfo>(
+ maxTempFiles, new Codec());
+
+ public void put(int mateReferenceIndex, String key, PairEndInfo value) {
+ onDiskMap.put(mateReferenceIndex, key, value);
+ }
+
+ public PairEndInfo remove(int mateReferenceIndex, String key) {
+ return onDiskMap.remove(mateReferenceIndex, key);
+ }
+
+ public CloseableIterator<Map.Entry<String, PairEndInfo>> iterator() {
+ return onDiskMap.iterator();
+ }
+
+ private class Codec implements
+ CoordinateSortedPairInfoMap.Codec<String, PairEndInfo> {
+ private DataInputStream in;
+ private DataOutputStream out;
+
+ public void setOutputStream(final OutputStream os) {
+ this.out = new DataOutputStream(os);
+ }
+
+ public void setInputStream(final InputStream is) {
+ this.in = new DataInputStream(is);
+ }
+
+ public void encode(String key, PairEndInfo record) {
+ try {
+ out.writeUTF(key);
+ out.writeInt(record.readAlignmentStart);
+ out.writeInt(record.readReferenceIndex);
+ out.writeBoolean(record.readNegStrandFlag);
+ out.writeBoolean(record.readUnmappedFlag);
+ out.writeInt(record.mateAlignmentStart);
+ out.writeInt(record.mateReferenceIndex);
+ out.writeBoolean(record.mateNegStrandFlag);
+ out.writeBoolean(record.mateUnmappedFlag);
+ out.writeBoolean(record.firstOfPairFlag);
+ out.writeLong(record.recordNumber);
+ } catch (IOException e) {
+ throw new PicardException(
+ "Error spilling PairInfo to disk", e);
+ }
+ }
+
+ /**
+ * Reads one key/PairEndInfo pair from the input stream, consuming the
+ * fields in the same order that encode writes them.
+ *
+ * @return the decoded read name and its PairEndInfo
+ * @throws PicardException
+ * if the underlying stream raises an IOException
+ */
+ public Map.Entry<String, PairEndInfo> decode() {
+ try {
+ final String key = in.readUTF();
+ final int readAlignmentStart = in.readInt();
+ final int readReferenceIndex = in.readInt();
+ final boolean readNegStrandFlag = in.readBoolean();
+ final boolean readUnmappedFlag = in.readBoolean();
+
+ final int mateAlignmentStart = in.readInt();
+ final int mateReferenceIndex = in.readInt();
+ final boolean mateNegStrandFlag = in.readBoolean();
+ final boolean mateUnmappedFlag = in.readBoolean();
+
+ final boolean firstOfPairFlag = in.readBoolean();
+
+ final long recordNumber = in.readLong();
+ final PairEndInfo rec = new PairEndInfo(readAlignmentStart,
+ readReferenceIndex, readNegStrandFlag,
+ readUnmappedFlag, mateAlignmentStart,
+ mateReferenceIndex, mateNegStrandFlag,
+ mateUnmappedFlag, firstOfPairFlag, recordNumber);
+ // Explicit type arguments (no raw type, no diamond) keep the
+ // return checked while staying compatible with pre-Java-7 source.
+ return new AbstractMap.SimpleEntry<String, PairEndInfo>(key, rec);
+ } catch (IOException e) {
+ throw new PicardException(
+ "Error reading PairInfo from disk", e);
+ }
+ }
+ }
+ }
+
+ private static class InMemoryPairEndInfoMap implements PairEndInfoMap {
+ private final Map<String, PairEndInfo> map = new HashMap<String, PairEndInfo>();
+
+ public void put(int mateReferenceIndex, String key, PairEndInfo value) {
+ if (mateReferenceIndex != value.mateReferenceIndex)
+ throw new IllegalArgumentException(
+ "mateReferenceIndex does not agree with PairEndInfo");
+ map.put(key, value);
+ }
+
+ public PairEndInfo remove(int mateReferenceIndex, String key) {
+ return map.remove(key);
+ }
+
+ public CloseableIterator<Map.Entry<String, PairEndInfo>> iterator() {
+ final Iterator<Map.Entry<String, PairEndInfo>> it = map.entrySet()
+ .iterator();
+ return new CloseableIterator<Map.Entry<String, PairEndInfo>>() {
+ public void close() {
+ // do nothing
+ }
+
+ public boolean hasNext() {
+ return it.hasNext();
+ }
+
+ public Map.Entry<String, PairEndInfo> next() {
+ return it.next();
+ }
+
+ public void remove() {
+ it.remove();
+ }
+ };
+ }
+ }
+}
View
422 src/main/java/net/sf/samtools/BinaryTagCodec.java
@@ -1,422 +0,0 @@
-/*
- * The MIT License
- *
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-package net.sf.samtools;
-
-import java.lang.reflect.Array;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-
-import net.sf.samtools.util.BinaryCodec;
-import net.sf.samtools.util.StringUtil;
-
-/**
- * Converter between disk and in-memory representation of a SAMRecord tag.
- */
-public class BinaryTagCodec {
- // Size in bytes of the fixed part of the disk representation of a tag,
- // i.e. the number of bytes occupied by the tag name and tag type fields.
- private static final int FIXED_TAG_SIZE = 3; // 2 bytes for the tag (written as a short) + 1 byte for the type code
-
- // Size in bytes of the fixed part of the value of a binary array,
- // i.e. the number of bytes occupied by the array type and the array length.
- private static final int FIXED_BINARY_ARRAY_TAG_SIZE = 5; // 1 byte element type code + 4 byte element count
-
- // Integers are stored in the smallest size that will hold them. The bounds are kept as longs so the unsigned maxima fit and can be compared with long tag values.
- private static final long MAX_INT = Integer.MAX_VALUE;
- private static final long MAX_UINT = MAX_INT * 2 + 1; // 4294967295, largest unsigned 32-bit value
- private static final long MAX_SHORT = Short.MAX_VALUE;
- private static final long MAX_USHORT = MAX_SHORT * 2 + 1; // 65535, largest unsigned 16-bit value
- private static final long MAX_BYTE = Byte.MAX_VALUE;
- private static final long MAX_UBYTE = MAX_BYTE * 2 + 1; // 255, largest unsigned 8-bit value
-
- // Source or sink for disk representation. Package-private; assigned once in the constructor.
- final BinaryCodec binaryCodec;
-
- /**
- * Creates a codec for writing tags to the given BinaryCodec.
- * For reading tags, a BinaryCodec is not used. See readTags() below, which is static and decodes from a byte array.
- * @param binaryCodec where to write the file rep of the tags; stored as-is, without a null check
- */
- public BinaryTagCodec(final BinaryCodec binaryCodec) {
- this.binaryCodec = binaryCodec;
- }
-
- /**
-  * Computes how many bytes the value part of a tag occupies on disk.
-  * The size is driven by the type code that getTagValueType() assigns
-  * to the value.
-  *
-  * @param attributeValue In-memory representation of a tag value.
-  * @return Size in bytes to store the value on disk.
-  * @throws IllegalArgumentException if the value has no supported tag type.
-  */
- private static int getBinaryValueSize(final Object attributeValue) {
-     final char typeCode = getTagValueType(attributeValue);
-     switch (typeCode) {
-         // single-byte values
-         case 'A':
-         case 'c':
-         case 'C':
-             return 1;
-         // two-byte values
-         case 's':
-         case 'S':
-             return 2;
-         // four-byte values
-         case 'I':
-         case 'i':
-         case 'f':
-             return 4;
-         case 'Z':
-             // one byte per character plus one trailing byte
-             return ((String) attributeValue).length() + 1;
-         case 'H':
-             // two bytes per array element plus one trailing byte
-             return ((byte[]) attributeValue).length * 2 + 1;
-         case 'B': {
-             final int elementCount = Array.getLength(attributeValue);
-             final int bytesPerElement;
-             if (attributeValue instanceof byte[]) {
-                 bytesPerElement = 1;
-             } else if (attributeValue instanceof short[]) {
-                 bytesPerElement = 2;
-             } else if (attributeValue instanceof int[]) {
-                 bytesPerElement = 4;
-             } else if (attributeValue instanceof float[]) {
-                 bytesPerElement = 4;
-             } else {
-                 throw new IllegalArgumentException("Unsupported array type: " + attributeValue.getClass());
-             }
-             return elementCount * bytesPerElement + FIXED_BINARY_ARRAY_TAG_SIZE;
-         }
-         default:
-             throw new IllegalArgumentException("When writing BAM, unrecognized tag type " +
-                     attributeValue.getClass().getName());
-     }
- }
-
- /**
- * Computes the full on-disk size of one tag: FIXED_TAG_SIZE bytes plus the size of the encoded value.
- * @param value In-memory representation of a tag value.
- * @return Size in bytes to store the tag name, tag type and tag value on disk.
- * @throws IllegalArgumentException if the value's type or magnitude is not supported (raised by the helpers this delegates to).
- */
- static int getTagSize(final Object value) {
- return FIXED_TAG_SIZE + getBinaryValueSize(value);
- }
-
- /**
- * @param value In-memory representation of a tag value.
- * @return One-character disk representation of tag type.
- */
- static char getTagValueType(final Object value) {
- if (value instanceof String) {
- return 'Z';
- } else if (value instanceof Character) {
- return 'A';
- } else if (value instanceof Float) {
- return 'f';
- } else if (value instanceof Number) {
- if (!(value instanceof Byte || value instanceof Short || value instanceof Integer || value instanceof Long)) {
- throw new IllegalArgumentException("Unrecognized tag type " + value.getClass().getName());
- }
- return getIntegerType(((Number)value).longValue());
- } /*
- Note that H tag type is never written anymore, because B style is more compact.
- else if (value instanceof byte[]) {
- return 'H';
- }
- */
- else if (value instanceof byte[] || value instanceof short[] || value instanceof int[] || value instanceof float[]) {
- return 'B';
- } else {
- throw new IllegalArgumentException("When writing BAM, unrecognized tag type " +
- value.getClass().getName());
- }
- }
-
- /**
-  * Picks the narrowest integer type code able to hold the given value.
-  * For each width the signed code (lower case) is tried before the
-  * unsigned code (upper case).
-  *
-  * @param val Integer tag value.
-  * @return Tag type corresponding to the smallest integer type that will hold the given value.
-  * @throws IllegalArgumentException if val is above MAX_UINT or below Integer.MIN_VALUE.
-  */
- private static char getIntegerType(final long val) {
-     if (val > MAX_UINT) {
-         throw new IllegalArgumentException("Integer attribute value too large to be encoded in BAM");
-     }
-     if (val < Integer.MIN_VALUE) {
-         throw new IllegalArgumentException("Integer attribute value too negative to be encoded in BAM");
-     }
-
-     if (val >= Byte.MIN_VALUE) {
-         // From the signed-byte minimum upwards: walk the widths in increasing order.
-         if (val <= MAX_BYTE) {
-             return 'c';
-         }
-         if (val <= MAX_UBYTE) {
-             return 'C';
-         }
-         if (val <= MAX_SHORT) {
-             return 's';
-         }
-         if (val <= MAX_USHORT) {
-             return 'S';
-         }
-         if (val <= MAX_INT) {
-             return 'i';
-         }
-         return 'I';
-     }
-
-     // Below the signed-byte minimum only the signed 16- and 32-bit codes remain.
-     return (val >= Short.MIN_VALUE) ? 's' : 'i';
- }
-
- /**
-  * Write the given tag name and value to disk: the tag as a short, then the
-  * one-byte type code chosen by getTagValueType(), then the encoded value.
-  *
-  * @param tag the tag, written as a two-byte short
-  * @param value in-memory tag value; a String, Character, Float, integral
-  *        Number (Byte, Short, Integer or Long) or a byte/short/int/float array
-  * @param isUnsignedArray forwarded to writeArray(); only relevant for array values
-  * @throws IllegalArgumentException if the value has no supported tag type
-  */
- public void writeTag(final short tag, final Object value, final boolean isUnsignedArray) {
-     binaryCodec.writeShort(tag);
-     final char tagValueType = getTagValueType(value);
-     binaryCodec.writeByte(tagValueType);
-
-     switch (tagValueType) {
-         case 'Z':
-             binaryCodec.writeString((String)value, false, true);
-             break;
-         case 'A':
-             binaryCodec.writeByte(((Character)value));
-             break;
-         case 'I':
-             // Go through Number, like the other integer cases, instead of assuming a Long.
-             binaryCodec.writeUInt(((Number)value).longValue());
-             break;
-         case 'i':
-             binaryCodec.writeInt(((Number)value).intValue());
-             break;
-         case 's':
-             binaryCodec.writeShort(((Number)value).shortValue());
-             break;
-         case 'S':
-             binaryCodec.writeUShort(((Number)value).intValue());
-             break;
-         case 'c':
-             binaryCodec.writeByte(((Number)value).byteValue());
-             break;
-         case 'C':
-             // A Short or Long in 128..255 also maps to 'C'; casting to Integer
-             // would throw ClassCastException for those, so use Number.
-             binaryCodec.writeUByte(((Number)value).shortValue());
-             break;
-         case 'f':
-             binaryCodec.writeFloat((Float)value);
-             break;
-         // Writing 'H' is no longer supported: getTagValueType() never returns it,
-         // byte arrays are written as 'B' instead.
-         case 'B':
-             writeArray(value, isUnsignedArray);
-             break;
-         default:
-             throw new IllegalArgumentException("When writing BAM, unrecognized tag type " +
-                     value.getClass().getName());
-     }
- }
-
- /**
-  * Writes an array-valued tag body: a one-byte element type code, the
-  * element count as an int, then every element in order.
-  *
-  * @param value a byte[], short[], int[] or float[]
-  * @param isUnsignedArray selects the upper-case type code for the integer
-  *        array types; it has no effect on float arrays
-  * @throws SAMException if value is none of the supported array types
-  */
- private void writeArray(final Object value, final boolean isUnsignedArray) {
-     if (value instanceof byte[]) {
-         final byte[] bytes = (byte[]) value;
-         binaryCodec.writeByte(isUnsignedArray ? 'C' : 'c');
-         binaryCodec.writeInt(bytes.length);
-         for (int i = 0; i < bytes.length; i++) {
-             binaryCodec.writeByte(bytes[i]);
-         }
-         return;
-     }
-
-     if (value instanceof short[]) {
-         final short[] shorts = (short[]) value;
-         binaryCodec.writeByte(isUnsignedArray ? 'S' : 's');
-         binaryCodec.writeInt(shorts.length);
-         for (int i = 0; i < shorts.length; i++) {
-             binaryCodec.writeShort(shorts[i]);
-         }
-         return;
-     }
-
-     if (value instanceof int[]) {
-         final int[] ints = (int[]) value;
-         binaryCodec.writeByte(isUnsignedArray ? 'I' : 'i');
-         binaryCodec.writeInt(ints.length);
-         for (int i = 0; i < ints.length; i++) {
-             binaryCodec.writeInt(ints[i]);
-         }
-         return;
-     }
-
-     if (value instanceof float[]) {
-         final float[] floats = (float[]) value;
-         binaryCodec.writeByte('f');
-         binaryCodec.writeInt(floats.length);
-         for (int i = 0; i < floats.length; i++) {
-             binaryCodec.writeFloat(floats[i]);
-         }
-         return;
-     }
-
-     throw new SAMException("Unrecognized array value type: " + value.getClass());
- }
-
- /**
-  * Decodes a run of tags from their little-endian disk form into a chain of
-  * SAMBinaryTagAndValue objects.
-  *
-  * @param binaryRep Byte buffer containing file representation of tags.
-  * @param offset Where in binaryRep tags start.
-  * @param length How many bytes in binaryRep are tag storage.
-  * @param validationStringency passed on to the value readers.
-  * @return the head of the resulting chain, or null if the region holds no tags.
-  */
- public static SAMBinaryTagAndValue readTags(final byte[] binaryRep, final int offset,
-         final int length, final SAMFileReader.ValidationStringency validationStringency) {
-     final ByteBuffer buffer = ByteBuffer.wrap(binaryRep, offset, length);
-     buffer.order(ByteOrder.LITTLE_ENDIAN);
-
-     SAMBinaryTagAndValue first = null;
-     SAMBinaryTagAndValue last = null;
-
-     while (buffer.hasRemaining()) {
-         final short tag = buffer.getShort();
-         final byte tagType = buffer.get();
-
-         final SAMBinaryTagAndValue node;
-         if (tagType == 'B') {
-             final TagValueAndUnsignedArrayFlag arrayResult = readArray(buffer, validationStringency);
-             if (arrayResult.isUnsignedArray) {
-                 node = new SAMBinaryTagAndUnsignedArrayValue(tag, arrayResult.value);
-             } else {
-                 node = new SAMBinaryTagAndValue(tag, arrayResult.value);
-             }
-         } else {
-             node = new SAMBinaryTagAndValue(tag, readSingleValue(tagType, buffer, validationStringency));
-         }
-
-         if (first == null) {
-             // First tag read: it is both ends of the chain.
-             first = node;
-             last = node;
-             continue;
-         }
-
-         // If samjdk wrote the BAM then the attributes will be in lowest->highest tag
-         // order, so inserting at the head each time would be very inefficient.
-         // A tag that sorts after the current tail is attached there directly;
-         // anything else is inserted through the head.
-         if (node.tag > last.tag) {
-             last.insert(node);
-             last = node;
-         } else {
-             first = first.insert(node);
-         }
-     }
-
-     return first;
- }
-
- /**
-  * Decodes one non-array tag value from the buffer.
-  *
-  * @param tagType What type to read.
-  * @param byteBuffer Little-endian byte buffer to read value from.
-  * @param validationStringency used when reporting an unsigned 32-bit value
-  *        that does not fit in a signed int.
-  * @return Value in in-memory Object form.
-  * @throws SAMFormatException if tagType is not a recognized type code.
-  */
- private static Object readSingleValue(final byte tagType, final ByteBuffer byteBuffer,
-         final SAMFileReader.ValidationStringency validationStringency) {
-     switch (tagType) {
-         case 'Z':
-             return readNullTerminatedString(byteBuffer);
-         case 'A':
-             return (char) byteBuffer.get();
-         case 'I': {
-             // Read as unsigned 32-bit, held in a long.
-             final long unsignedValue = byteBuffer.getInt() & 0xffffffffL;
-             if (unsignedValue > Integer.MAX_VALUE) {
-                 SAMUtils.processValidationError(new SAMValidationError(SAMValidationError.Type.TAG_VALUE_TOO_LARGE,
-                         "Tag value " + unsignedValue + " too large to store as signed integer.", null), validationStringency);
-                 // Does not fit in an int: hand it back as a long.
-                 return unsignedValue;
-             }
-             return (int) unsignedValue;
-         }
-         case 'i':
-             return byteBuffer.getInt();
-         case 's':
-             return (int) byteBuffer.getShort();
-         case 'S':
-             // Unsigned short, widened into an int.
-             return byteBuffer.getShort() & 0xffff;
-         case 'c':
-             return (int) byteBuffer.get();
-         case 'C':
-             // Unsigned byte, widened into an int.
-             return byteBuffer.get() & 0xff;
-         case 'f':
-             return byteBuffer.getFloat();
-         case 'H': {
-             final String hexDigits = readNullTerminatedString(byteBuffer);
-             return StringUtil.hexStringToBytes(hexDigits);
-         }
-         default:
-             throw new SAMFormatException("Unrecognized tag type: " + (char) tagType);
-     }
- }