In [1]:
from pyrpipe import sra,mapping,assembly,qc,tools
#First get the SRR accessions of the runs. For this, one can use the Python package pysradb or the R package SRAdb.
#runs=['SRR3098746','SRR3098745','SRR3098744'] #full set, from the study SRP068369
runs=['SRR765545'] #small test
#set up directories
#workingDir receives the downloaded runs (one sub-directory per accession).
workingDir="/home/usingh/work/urmi/hoap/test/maize"
#maize_data holds the reference files; derived from workingDir so the two cannot drift apart.
maize_data=workingDir+"/maize_data"



[93mLogs will be saved to 2020-01-16-16_48_04_pyrpipe.log[0m


## Download data, pre-process

In [2]:
#Download every accession; only runs whose download succeeded are kept in sraObjects.
sraObjects=[]
for srr_id in runs:
    sra_run=sra.SRA(srr_id,workingDir)
    if not sra_run.download_sra():
        print("Download failed:"+srr_id)
        continue
    sraObjects.append(sra_run)

#A single Trimgalore object is configured once and shared by all runs.
tg=qc.Trimgalore(**{"--cores": "10"})

#Options forwarded to fasterq-dump ("-e 20" = use 20 threads).
fasterq_opts={"-e":"20","-f":"","-t":workingDir}

for sra_run in sraObjects:
    #convert the .sra file to fastq
    #NOTE(review): delete_sra=True presumably removes the .sra afterwards - confirm against pyrpipe docs
    sra_run.run_fasterqdump(delete_sra=True,**fasterq_opts)
    #perform qc using trim galore
    sra_run.perform_qc(tg)
    

        

[95mDownloading SRR765545 ...[0m
[94m$ prefetch -O /home/usingh/work/urmi/hoap/test/maize/SRR765545 SRR765545[0m
[92mTime taken:0:00:14[0m
Downloaded file: /home/usingh/work/urmi/hoap/test/maize/SRR765545/SRR765545.sra 500.4 MB 
[94m$ fasterq-dump -e 20 -f -t /home/usingh/work/urmi/hoap/test/maize -O /home/usingh/work/urmi/hoap/test/maize/SRR765545 -o SRR765545.fastq /home/usingh/work/urmi/hoap/test/maize/SRR765545/SRR765545.sra[0m
[92mTime taken:0:00:27[0m
Performing QC using trim_galore
[94m$ trim_galore --cores 10 --paired -o /home/usingh/work/urmi/hoap/test/maize/SRR765545 /home/usingh/work/urmi/hoap/test/maize/SRR765545/SRR765545_1.fastq /home/usingh/work/urmi/hoap/test/maize/SRR765545/SRR765545_2.fastq[0m
[92mTime taken:0:01:09[0m


## Map using STAR

In [None]:
#Input genome FASTA and the directory in which the STAR index will be created.
genome_fasta=maize_data+"/Zea_mays.B73_RefGen_v4.dna.toplevel.1_10.fa"
star_index_dir=maize_data+"/starindex"

#Command-line options handed to the Star object.
star_params={
    "--outFilterType":"BySJout",
    "--runThreadN":"8",
    "--outSAMtype": "BAM SortedByCoordinate",
}

#No index exists yet, so an empty (invalid) star_index is passed and the index is built right after.
star=mapping.Star(star_index="",**star_params)

#create star index
star.build_index(star_index_dir,genome_fasta)



No STAR index provided. Please build index now to generate an index using build_index()....
[94m$ STAR --runMode genomeGenerate --genomeDir /home/usingh/work/urmi/hoap/test/maize/maize_data/starindex --genomeFastaFiles /home/usingh/work/urmi/hoap/test/maize/maize_data/Zea_mays.B73_RefGen_v4.dna.toplevel.1_10.fa[0m


## Transcript assembly using StringTie

In [None]:
#Create one Stringtie object; it is reused for all the bam files.
#The reference annotation is located through maize_data (like the genome FASTA used for the
#STAR index) so that the cell does not depend on the notebook's current working directory.
#NOTE(review): assumes the GTF lives in the same maize_data directory as the FASTA - confirm.
st=assembly.Stringtie(reference_gtf=maize_data+"/Zea_mays.B73_RefGen_v4.46.gtf")

#gtfList is filled in the same order as sraObjects; later cells rely on that pairing.
gtfList=[]
for x in sraObjects:
    #align the run with STAR, then assemble transcripts from the sorted BAM it wrote
    star_out_dir=star.perform_alignment(x,objectid=x.srr_accession)
    bam=star_out_dir+"/Aligned.sortedByCoord.out.bam"
    gtfList.append(st.perform_assembly(bam,objectid=x.srr_accession,**{"-p":"25"}))

print(gtfList)


## lncRNA prediction using PLncPRO
We will use [PLncPRO](https://github.com/urmi-21/PLncPRO) to predict lncRNAs. PLncPRO is not currently integrated into `pyrpipe`, so we will run its commands directly through the `execute_command()` function of the `pyrpipe_engine` module.

In [None]:
#import pyrpipe modules
from pyrpipe import pyrpipe_engine as pe
#install plncpro
pe.execute_command("pip install plncpro".split(),verbose=True,quiet=False,logs=False)
#OR
#!pip install plncpro


#Genome FASTA: the same file the STAR index was built from, located through maize_data.
genome=maize_data+"/Zea_mays.B73_RefGen_v4.dna.toplevel.1_10.fa"
#PLncPRO model and BLAST database; these paths are relative to the current working directory.
model="monocot_model/monocot.model"
blastdb="uniprot/uniprotdb"

#gtfList was filled in the same order as sraObjects, so the two lists are walked in parallel.
for thisOb,gtf in zip(sraObjects,gtfList):
    #NOTE(review): the value perform_assembly returns on failure is not visible here;
    #any falsy value is treated as "no GTF for this run" and the run is skipped.
    if not gtf:
        print("No assembled GTF for "+thisOb.srr_accession+"; skipping")
        continue

    #first extract transcripts using gffread
    #Commands are built as argument lists so that paths containing spaces stay intact.
    tx_file=thisOb.location+"/transcripts.fa"
    gffread_cmd=["gffread","-w",tx_file,"-g",genome,gtf]
    pe.execute_command(gffread_cmd,verbose=False,quiet=False,logs=True,objectid=thisOb.srr_accession,command_name="gffread")

    #Optional step: use biopython to filter transcripts by length
    #(needs "from Bio import SeqIO"; close output_handle when done)
    #out_file=thisOb.location+"/transcripts_filter.fa"
    #output_handle = open(out_file, "w")
    #for record in SeqIO.parse(tx_file, "fasta"):
        # keep tx whose length is between 500 and 1000
    #    if len(record)>=500 and len(record)<=1000:
    #        #write to temp file
    #        SeqIO.write(record, output_handle, "fasta")


    #run plncpro
    outdir=thisOb.location+"/plncpro_out"
    outfile="plncpro_predictions"
    plncpro_cmd=["plncpro","predict",
                 "-i",tx_file,
                 "-o",outdir,
                 "-p",outfile,
                 "-t","25",
                 "--min_len","200",
                 "-d",blastdb,
                 "-m",model,
                 "-v","-r"]
    pe.execute_command(plncpro_cmd,verbose=False,quiet=False,logs=True,objectid=thisOb.srr_accession,command_name="plncpro predict")
        


## Generate reports

In [1]:
!pyrpipe_diagnostic.py report pyrpipe_logs/2020-01-22-18_14_47_pyrpipe.log
!pyrpipe_diagnostic.py benchmark pyrpipe_logs/2020-01-22-18_14_47_pyrpipe.log
!pyrpipe_diagnostic.py shell pyrpipe_logs/2020-01-22-18_14_47_pyrpipe.log


  'There are known rendering problems and missing features with '
Report written to 2020-01-22-18_14_47_pyrpipe.pdf
  'There are known rendering problems and missing features with '
Generating benchmarks
[94mparsing log...[0m
[94mdone.[0m
      data             name
0   6448.0             STAR
1   6817.0             STAR
2     75.0     fasterq-dump
3     89.0     fasterq-dump
4      9.0          gffread
5      7.0          gffread
6    910.0  plncpro predict
7    815.0  plncpro predict
8    491.0         prefetch
9    406.0         prefetch
10  1209.0        stringtie
11  1198.0        stringtie
12   782.0      trim_galore
13   836.0      trim_galore
[92mBenchmark report saved to:/home/usingh/work/urmi/hoap/pyrpipe/examples/Maize_lncRNA_prediction/tmp/benchmark_reports[0m
  'There are known rendering problems and missing features with '
Generating bash script
shell commands written to 2020-01-22-18_14_47_pyrpipe.sh
