In [None]:
!pip install pyfasta
import os, sys, subprocess

## CONTRIB
import pandas as pd
import seaborn as sns
import numpy as np
import itertools
#import gseapy as gp
import time 
## LOCAL
import isistools
from ngstools import HSModel, MDA
#from ngstools.Drona import *
%pylab inline
import cPickle as pickle
import inspect
from IPython.display import HTML
import scipy
# import ngstools
# print inspect.getsource(ngstools.Drona)

import plotly.offline as po 
from plotly.graph_objs import *
po.init_notebook_mode(connected=True) 
from plotly.tools import FigureFactory as FF

import warnings
warnings.filterwarnings("ignore")
import logging
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

import contextlib
import sys
import csv

@contextlib.contextmanager
def stdout_redirect(where):
    """Temporarily route everything written to ``sys.stdout`` into ``where``.

    Parameters
    ----------
    where : file-like object
        Object with a ``write`` method. It is installed as ``sys.stdout`` for
        the duration of the ``with`` block and is also the value bound by
        ``as``.

    The stream that was active when the block was entered is put back on
    exit, whether the block finished normally or raised.
    """
    # Remember the stream that is active *now* instead of falling back to
    # sys.__stdout__: a Jupyter kernel (or an enclosing redirect) installs its
    # own sys.stdout, and resetting to sys.__stdout__ would detach all later
    # output from it.
    previous = sys.stdout
    sys.stdout = where
    try:
        yield where
    finally:
        sys.stdout = previous
        
import matplotlib.pyplot as plt
from Bio import SeqIO
from decimal import Decimal
import regex
from multiprocessing import Pool

In [None]:
def Samtools(directory,      #output directory,
             file,           #shortened path of SAM file (output of Minimap2)
            chr_region):     #list of chromosomes as a single string (see bottom of cell)    
    filename=file.split('.')[0]+'.bam'
    !samtools view -bS {directory+'/'+file} > {directory+'/'+filename}
    sort_aln=directory+'/sorted_aln.bam'
    !samtools sort -o {sort_aln} {directory+'/'+filename}
    !samtools index {sort_aln}
    !samtools flagstat {sort_aln}
    
        #Eliminate unmapped reads
    mapped=directory+'/merged_mapped.bam'
    !samtools view -b -F 4 {sort_aln} > {mapped}

        #Reference sequences and mapped reads
    print('')
    print('Ref seq name|Seq lgth|#mapped reads|#unmapped reads')
    !samtools index {mapped}
    !samtools idxstats {mapped} > {directory}/aln_index.csv
    !head -3 {directory}/aln_index.csv

    print ''
    print 'Extracting reads mapped to main chromosomes...'
    newfile=directory+'/chr_only'
    !samtools view -b {mapped} {chr_region} > {newfile}.bam
    print('Whole bam file:')
    !wc -l {mapped}
    print('chr_only'+' bam file:')
    !wc -l {newfile}.bam
    !samtools flagstat {newfile}.bam
    !samtools view -h {newfile}.bam > {newfile}.sam
    !samtools index {newfile}.bam

# Chromosome labels ('NC_' format) as single space-separated strings; these
# are handed to Samtools() as its chr_region argument.
hum_chrs = (
    'NC_000001.11 NC_000002.12 NC_000015.10 NC_000003.12 '
    'NC_000016.10 NC_000004.12 NC_000017.11 NC_000005.10 '
    'NC_000018.10 NC_000006.12  NC_000019.10 NC_000007.14 '
    'NC_000020.11 NC_000008.11 NC_000021.9 NC_000009.12 '
    'NC_000022.11 NC_000010.11 NC_000023.11 NC_000011.10 '
    'NC_000024.10 NC_000012.12 NC_000013.11 NC_000014.9'
)
mouse_chrs = (
    'NC_000067.6 NC_000068.7 NC_000069.6 NC_000070.6 '
    'NC_000071.6 NC_000072.6 NC_000073.6 NC_000074.6 '
    'NC_000075.6 NC_000076.6 NC_000077.6 NC_000078.6 '
    'NC_000079.6 NC_000080.6 NC_000081.6 NC_000082.6 '
    'NC_000083.6 NC_000084.6 NC_000085.6 NC_000086.7 '
    'NC_000087.7'
)

In [None]:
# Run the samtools pipeline on liver.sam, keeping only the labels in hum_chrs.
Samtools('2019-7-26_cdna/liver', 'liver.sam', hum_chrs)

In [None]:
directory='2019-8-6_cdna/a431.b3' #This directory needs to be defined every time TranscriptClean() is run

def TranscriptClean(chrs):   
    for my_chr in chrs:
        !samtools view -h {directory}/chr_only.bam {my_chr} > {directory}/{my_chr}.sam
        
        script='software/transcriptclean/TranscriptClean-master/TranscriptClean.py'
        hum_variants='software/transcriptclean/TranscriptClean-master/common_variant_human'
        genome='software/homo_GRCh38_trimmed_ref.fa'
        !python {script} --sam {directory}/{my_chr}.sam --variants {hum_variants} --genome {genome} --outprefix {directory}/trcl_VA/{my_chr}
                    
def multiRunWrapper(args):
    """Adapter used as the Pool.map target: forward one work item to TranscriptClean().

    ``args`` is handed over unchanged and the result of TranscriptClean()
    is returned.
    """
    outcome = TranscriptClean(args)
    return outcome
def RunMany(inputList, processes=32):
    """Run multiRunWrapper() over ``inputList`` in a multiprocessing Pool.

    Parameters
    ----------
    inputList : list
        Work items; each element is passed as the single argument of
        multiRunWrapper().
    processes : int, optional
        Number of worker processes (default 32, the previously hard-coded
        value).

    Returns
    -------
    list
        One result per work item, in input order (as produced by Pool.map).
    """
    pool = Pool(processes=processes)
    try:
        results = pool.map(multiRunWrapper, inputList)
    finally:
        # Shut the workers down even when a task raised, so a failed run does
        # not leave idle processes behind.
        pool.close()
        pool.join()
    # Previously placed after the return statement and therefore never printed.
    print('done')
    return results
    
def SplitSamTrCl(directory,
                chrs,       #list of chromsomes
                pool_chrs): #list of lists (each nested list contains a chromosome label as its only item)
    !mkdir {directory}/trcl_VA
    RunMany(pool_chrs)

    new_d=directory+'/trcl_VA'
    !cat {new_d}/*_clean.fa > {new_d}/_clean.fa
    !cat {new_d}/*_clean.log > {new_d}/_clean.log
    !cat {new_d}/*_clean.TE.log > {new_d}/_clean.TE.log
    #Merge corrected SAM into one BAM
    !mkdir {new_d}/temp_bam
    !mkdir {directory}/chr_cvg
    avg_cvgs=[]
    chr_ids=[]
    seq_lengths=[]

    for achr in chrs:
        !samtools view -S -b {new_d}/{achr}_clean.sam > {new_d}/temp_bam/{achr}_clean.bam
        !samtools view -S -b {directory}/{achr}.sam > {new_d}/temp_bam/{achr}_clean.bam
        
#     Create final bam file for conversion to BED for flair
    !samtools merge {new_d}/_clean.bam {new_d}/temp_bam/*_clean.bam
    !samtools sort -o {new_d}/_cleansort.bam {new_d}/_clean.bam
    !samtools index {new_d}/_cleansort.bam
    !samtools view -h {new_d}/_cleansort.bam > {new_d}/_cleansort.sam
    !rm -R {new_d}/temp_bam
    
    
#Calculate error rate from TranscriptClean log file and cleaned fasta file
def Error(new_dir,fasta_path):
    """Print an error rate from a TranscriptClean TE log and a FASTA file.

    Parameters
    ----------
    new_dir : str
        Directory containing ``_clean.TE.log`` (tab-separated).
    fasta_path : str
        FASTA file whose total sequence length is the denominator.

    The fourth field of every log row is summed as the number of error bases.
    The first row is skipped, as is any row whose fourth field contains 'Size'
    (presumably header lines repeated by concatenating per-chromosome logs;
    confirm against the log format). Prints the error total, the base total
    and the percentage; returns None.
    """
    errors=0.0  # float from the start so the division below is never integer division
    with open(new_dir+'/_clean.TE.log') as csvfile:
        data=csv.reader(csvfile,delimiter='\t')
        next(data, None)  # skip the header; tolerate an empty log
        for row in data:
            # Blank or truncated lines have no fourth field to read.
            if len(row) < 4 or 'Size' in row[3]:
                continue
            errors+=float(row[3])
    print('Total number of error bases, detected by TranscriptClean: '+str(errors))
    with open(fasta_path) as fasta_file:
        cum_bases=sum(len(read) for read in SeqIO.parse(fasta_file,'fasta'))
    print('Total number of bases sequenced from  '+new_dir+': '+str(cum_bases))
    if cum_bases == 0:
        # An empty FASTA would otherwise raise ZeroDivisionError.
        print('Sequencing error rate: undefined (no bases found in '+fasta_path+')')
        return
    print('Sequencing error rate: '+str(errors/cum_bases*100)+'%')

In [None]:
# Chromosome labels, in the order they are processed.
chrs = [
    'NC_000001.11', 'NC_000002.12', 'NC_000015.10', 'NC_000003.12',
    'NC_000016.10', 'NC_000004.12', 'NC_000017.11', 'NC_000005.10',
    'NC_000018.10', 'NC_000006.12', 'NC_000019.10', 'NC_000007.14',
    'NC_000020.11', 'NC_000008.11', 'NC_000021.9', 'NC_000009.12',
    'NC_000022.11', 'NC_000010.11', 'NC_000023.11', 'NC_000011.10',
    'NC_000024.10', 'NC_000012.12', 'NC_000013.11', 'NC_000014.9',
]
# One single-item list per chromosome, i.e. one Pool work item each.
pool_list = [[achr] for achr in chrs]
print(pool_list)

In [None]:
# Correct and merge the a431utc run: chrs lists the chromosomes to merge,
# pool_list holds the per-chromosome work items for the worker pool.
SplitSamTrCl('2019-8-6_cdna/a431utc', chrs, pool_list)

In [None]:
# Report the error rate of each run from its trcl_VA output directory.
for trcl_dir in ("2019-7-22_dirrna/trcl_VA", "2019-7-26_cdna/a431utc/trcl_VA"):
    Error(trcl_dir, trcl_dir + "/_clean.fa")

In [None]:
#Created new GENCODE mouse annotation (vM22) with consistent chromosome labels ('NC_' format)
# This cell builds the sed substitution expressions ('s/chrN/NC_.../g') from
# the header lines listed in software/mouse_chr.txt.

# The with-block closes the handle (it was previously left open), and the
# handle no longer rebinds the builtin name `file`.
with open('software/mouse_chr.txt') as mouse_chr_file:
    reader=mouse_chr_file.readlines()

# Positions 20 and 21 are named chrX / chrY; every other position is
# named after its number.
sex_names={20: 'chrX', 21: 'chrY'}

ids=[]
names=[]
count=1
for row in reader:
    if count==22: break   # only the first 21 lines are used
    # Label = text between '>' and ' Mus' on the line.
    ids.append(row.split('>')[1].split(' Mus')[0])
    names.append(sex_names.get(count, 'chr'+str(count)))
    count+=1

# ids and names always have equal length, so zip pairs them exactly like the
# Python-2-only map(None, ids, names) did.
# NOTE(review): 's/chr1/.../g' also matches the prefix of 'chr10'..'chr19';
# confirm that whatever applies these expressions anchors or orders them.
subs=['s/'+n+'/'+i+'/g' for i,n in zip(ids,names)]
print(subs)