In [28]:
from django.core.management import setup_environ
import settings
setup_environ(settings)

'/home/wahern/projects/millstone/genome_designer'

In [30]:
# Test extract_contig_reads
contig = Contig.objects.get(label='ins_1kb_ins_1kb_sample_NODE_1_length_1966_cov_23.051373')
contig_number = 1
genome_finishing_directory_number = 1

extract_contig_reads(contig, contig_number, genome_finishing_directory_number, 'clipped')

number of clipped reads in contig: 56


In [31]:
import os
import re
import pysam
from collections import defaultdict
from main.models import Contig
from main.models import ExperimentSampleToAlignment

def extract_contig_reads(contig, contig_number, genome_finishing_directory_number, read_category='all'):
    # INPUTS:
    # contig = Contig.objects.get(label='ins_1kb_ins_1kb_sample_NODE_1_length_1966_cov_23.051373')
    # contig_number = 1
    # genome_finishing_directory_number = 1
    
    READ_CATEGORY_TO_FILENAME = {
        'all': 'bwa_align.SV_indicants_with_pairs.bam',
        'without_mates': 'bwa_align.SV_indicants_no_dups.bam',
        'clipped': 'bwa_align.clipped.bam',
        'split': 'bwa_align.split.bam',
        'unmapped': 'bwa_align.unmapped.bam'
    }
    assert read_category in READ_CATEGORY_TO_FILENAME
    
    extract_contig_reads_executable = os.path.join(settings.TOOLS_DIR, 'velvet/contrib/extractContigReads/extractContigReads.pl')

    sample_alignment = contig.experiment_sample_to_alignment
    genome_finish_dir = os.path.join(sample_alignment.get_model_data_dir(), 'genome_finishing')
    assembly_dir = os.path.join(genome_finish_dir, str(genome_finishing_directory_number), 'velvet_k21')

    read_unpacking_dir = os.path.join(genome_finish_dir, str(genome_finishing_directory_number), 'read_unpacking')
    if not os.path.exists(read_unpacking_dir):
        os.mkdir(read_unpacking_dir)

    contig_number = 1
    cmd = [extract_contig_reads_executable, str(contig_number), assembly_dir]
    cmd = ' '.join(cmd)

    contig_reads_fasta = os.path.join(read_unpacking_dir, 'contig_' + str(contig_number) + '_reads.fa')
    if not os.path.exists(contig_reads_fasta):
        with open(contig_reads_fasta, 'w') as fh:
            subprocess.call(cmd, shell=True, stdout=fh)

    p1 = re.compile('>(\S+)/(\d)')
    contig_reads = defaultdict(list)
    with open(contig_reads_fasta) as fh:
        for line in fh:
            m1 = p1.match(line)
            if m1:
                read_id = m1.group(1)
                read_number = int(m1.group(2))
                contig_reads[read_id].append(read_number)

    sv_indicant_reads_path = os.path.join(genome_finish_dir, str(genome_finishing_directory_number), READ_CATEGORY_TO_FILENAME[read_category])
    sam_file = pysam.AlignmentFile(sv_indicant_reads_path)
    sv_indicant_reads_in_contig = []
    for read in sam_file:
        if read.is_read1:
            read_number = 1
        elif read.is_read2:
            read_number = 2
        else:
            raise Exception('Read is neither read1 nor read2')

        contig_read_numbers = contig_reads.get(read.query_name, [])
        if read_number in contig_read_numbers:
            sv_indicant_reads_in_contig.append(read)

    return sv_indicant_reads_in_contig

In [None]:
# Test extract_contig_reads
contig = Contig.objects.get(label='ins_1kb_ins_1kb_sample_NODE_1_length_1966_cov_23.051373')
contig_number = 1
genome_finishing_directory_number = 1

contig_reads = extract_contig_reads(contig, contig_number, genome_finishing_directory_number, 'clipped')
print 'number of extracted reads:', len(contig_reads)

extracted_clipped_read_dicts = extract_left_and_right_clipped_read_dicts(contig_reads)
left_clipped = extracted_clipped_read_dicts['left_clipped']
right_clipped = extract_clipped_read_dicts['right_clipped']

ref_insertion_endpoints = find_ref_insertion_endpoints(left_clipped, right_clipped)

print 'ref_ins_left_end:', ref_ins_left_end
print 'ref_ins_right_end:', ref_ins_right_end
# TODO: Handle case of no endpoints found
assert ref_ins_left_end is not None and ref_ins_right_end is not None




In [32]:
def extract_left_and_right_clipped_read_dicts(sv_indicant_reads_in_contig):
    SOFT_CLIP = 4
    HARD_CLIP = 5
    CLIP = [SOFT_CLIP, HARD_CLIP]

    # Separate left and right clipped reads
    left_clipped = defaultdict(list)
    right_clipped = defaultdict(list)
    for read in sv_indicant_reads_in_contig:
        left_clipping = read.cigartuples[0][1] if read.cigartuples[0][0] in CLIP else 0
        right_clipping = read.cigartuples[-1][1] if read.cigartuples[-1][0] in CLIP else 0
        is_left_clipped = left_clipping > right_clipping
        is_right_clipped = right_clipping > left_clipping
        if is_left_clipped:
            left_clipped[read.reference_start].append(read)
        elif is_right_clipped:
            right_clipped[read.reference_end].append(read)
    
    return {
        'left_clipped': left_clipped,
        'right_clipped': right_clipped
    }

def find_ref_insertion_endpoints(left_clipped, right_clipped):
    """ left_clipped and right_clipped are dictionaries with lists of
    reads as values and the reference start and end of the clipped alignment
    as keys respectively
    """
    # Find positions in reference of most left clipping points
    left_clipped_list_sorted = sorted(left_clipped.items(), key=lambda x:len(x[1]), reverse=True)
    highest_clip_consensus = len(left_clipped_list_sorted[0][1])
    second_highest_clip_consensus = len(left_clipped_list_sorted[1][1]) if len(left_clipped_list_sorted) > 1 else 0
    if highest_clip_consensus - second_highest_clip_consensus > 2:
        ref_ins_right_end = left_clipped_list_sorted[0][0]
    else:
        ref_ins_right_end = None

    # Same for right clipping
    right_clipped_list_sorted = sorted(right_clipped.items(), key=lambda x:len(x[1]), reverse=True)
    highest_clip_consensus = len(right_clipped_list_sorted[0][1])
    second_highest_clip_consensus = len(right_clipped_list_sorted[1][1]) if len(right_clipped_list_sorted) > 1 else 0
    if highest_clip_consensus - second_highest_clip_consensus > 2:
        ref_ins_left_end = right_clipped_list_sorted[0][0]
    else:
        ref_ins_left_end = None


    return {
        'ref_ins_left_end': ref_ins_left_end,
        'ref_ins_right_end': ref_ins_right_end
    }

# Grab query_alignment_sequences for alignment to contig
def write_reads_to_fastq(reads, fastq_path):
    """
    T
    H
    I
    S
    
    IS WHAT I'M WORKING ON
    """
    
    
    right_clipped_query_alignment_seqrecords = []
    for read in right_clipped_same_end:
        right_clipped_query_alignment_seqrecords.append(SeqRecord(
                Seq(read.query_alignment_sequence, IUPAC.ambiguous_dna),
                letter_annotations={'phred_quality':read.query_alignment_qualities},
                id=read.query_name,
                description=''))

    right_clipped_query_alignment_fastq = os.path.join(read_unpacking_dir, 'right_clipped_query_alignment_seqs.fq')
    with open(right_clipped_query_alignment_fastq, 'w') as fastq_handle:
        SeqIO.write(right_clipped_query_alignment_seqrecords, fastq_handle, 'fastq')

    
def find_contig_insertion_endpoints(left_clipped_same_end, right_clipped_same_end)
""" left_clipped_same_end/right_clipped_same_end are lists of
left and right clipped reads all with the same left/right
alignment endpoint, corresponding to the reference insertion
right/left endpoint
"""
# TODO: Handle case of same query_name reads being added to same fastq
right_clipped_query_names = [read.query_name for read in right_clipped_same_end]
assert len(right_clipped_query_names) == len(set(right_clipped_query_names))

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio import SeqIO
right_clipped_query_alignment_seqrecords = []
for read in right_clipped_same_end:
    right_clipped_query_alignment_seqrecords.append(SeqRecord(
            Seq(read.query_alignment_sequence, IUPAC.ambiguous_dna),
            letter_annotations={'phred_quality':read.query_alignment_qualities},
            id=read.query_name,
            description=''))

right_clipped_query_alignment_fastq = os.path.join(read_unpacking_dir, 'right_clipped_query_alignment_seqs.fq')
with open(right_clipped_query_alignment_fastq, 'w') as fastq_handle:
    SeqIO.write(right_clipped_query_alignment_seqrecords, fastq_handle, 'fastq')


input_reads_fq = right_clipped_query_alignment_fastq
from main.models import Contig
from main.models import Dataset
contig = Contig.objects.get(label='ins_1kb_ins_1kb_sample_NODE_1_length_1966_cov_23.051373')
contig_fasta = contig.dataset_set.get(type=Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

# Align clipped query alignment fastq to contig
align_input_args = ' '.join([
        '%s/bwa/bwa' % settings.TOOLS_DIR,
        'mem',
        '-t', '1', # threads
        contig_fasta,
        input_reads_fq,
    ])

# To skip saving the SAM file to disk directly, pipe output directly to
# make a BAM file.
align_input_args += ' | ' + settings.SAMTOOLS_BINARY + ' view -bS -'

# Get a BAM filename
output_bam = os.path.join(read_unpacking_dir,
        'right_clipped_to_contig.bwa_align.bam')

# Ensure contig fasta is indexed
subprocess.call(' '.join([
        '%s/bwa/bwa' % settings.TOOLS_DIR,
        'index',
        contig_fasta
    ]),
shell=True, executable=settings.BASH_PATH)

# Run alignment
import subprocess
with open(output_bam, 'w') as fh:
    subprocess.check_call(
            align_input_args, stdout=fh,
            shell=True, executable=settings.BASH_PATH)

# # Check if reverse complement contig
# right_clipped_qname_to_read = {}
# for read in right_clipped[ref_ins_left_end]:
#     right_clipped_qname_to_read[read.query_name] = read

total_mapped_count = 0
reversed_complementarity_count = 0
sam_file = pysam.AlignmentFile(output_bam)
for read in sam_file:
    if not read.is_unmapped:
        total_mapped_count += 1
        if read.is_reverse:
            reversed_complementarity_count += 1

REVERSED_COMPLEMENTARITY_FRACTION_CUTOFF = 0.75
if reversed_complementarity_count / total_mapped_count > REVERSED_COMPLEMENTARITY_FRACTION_CUTOFF:
    print 'Contig is reverse complement'
    contig.metadata['is_reverse'] = True

# Write reverse complement of contig to file if is reverse
if contig.metadata.get('is_reverse', False):
    rc_contig_fasta = os.path.splitext(contig_fasta)[0] + '.reverse_complement.fa'
    print 'reverse complement contig fasta:', rc_contig_fasta
    contig_seqrecord = SeqIO.parse(contig_fasta, 'fasta').next()
    contig_seqrecord.seq = contig_seqrecord.seq.reverse_complement()
    SeqIO.write(contig_seqrecord, rc_contig_fasta, 'fasta')
    
    # Align clipped query alignment fastq to contig
    align_input_args = ' '.join([
            '%s/bwa/bwa' % settings.TOOLS_DIR,
            'mem',
            '-t', '1', # threads
            rc_contig_fasta,
            input_reads_fq,
        ])

    # To skip saving the SAM file to disk directly, pipe output directly to
    # make a BAM file.
    align_input_args += ' | ' + settings.SAMTOOLS_BINARY + ' view -bS -'

    # Get a BAM filename
    output_bam = os.path.join(read_unpacking_dir,
            'right_clipped_to_contig.bwa_align.bam')

    # Ensure contig fasta is indexed
    subprocess.call(' '.join([
            '%s/bwa/bwa' % settings.TOOLS_DIR,
            'index',
            rc_contig_fasta
        ]),
    shell=True, executable=settings.BASH_PATH)

    # Run alignment
    import subprocess
    with open(output_bam, 'w') as fh:
        subprocess.check_call(
                align_input_args, stdout=fh,
                shell=True, executable=settings.BASH_PATH)


alignment_ref_end_positions = defaultdict(list)
sam_file = pysam.AlignmentFile(output_bam)
for read in sam_file:
    # Change to reference_start for left_clipped
    alignment_ref_end_positions[read.reference_end].append(read)
#     print 'read.reference_start:', read.reference_start
#     print 'read.reference_end:', read.reference_end

alignment_ref_end_positions_sorted = sorted(alignment_ref_end_positions.items(), key=lambda x:len(x[1]), reverse=True)
highest_end_consensus = len(alignment_ref_end_positions_sorted[0][1])
second_highest_end_consensus = len(alignment_ref_end_positions_sorted[1][1]) if len(alignment_ref_end_positions_sorted) > 1 else 0
if highest_end_consensus - second_highest_end_consensus > 2:
    contig_ins_left_end = alignment_ref_end_positions_sorted[0][0]
else:
    contig_ins_left_end = None

print 'contig_ins_left_end:', contig_ins_left_end


SyntaxError: invalid syntax (<ipython-input-32-c355b5e700f0>, line 69)