In [1]:
from pyrpipe import sra,mapping,assembly,qc,tools
from pyrpipe import pyrpipe_utils as pu
from pyrpipe import pyrpipe_engine as pe
# SRR accessions of the runs to process. These can be looked up with the
# python package pysradb or the R package sradb.
# Full set from the study SRP068369:
#runs=['SRR3098746','SRR3098745','SRR3098744']
runs = ['SRR765545']  # small test

# Every download and result in this notebook is written below this directory.
workingDir = "maize_out"

# Create the working directory only when it is not already present.
working_dir_missing = not pu.check_paths_exist(workingDir)
if working_dir_missing:
    pu.mkdir(workingDir)



[93mCreating script backup: .pyrpipe/_pyrpipe_b9d1b41b9e44376d98653a33686db164_ipykernel_launcher.py[0m
[93mLogs will be saved to /home/usingh/work/urmi/hoap/pyrpipe/case_studies/Maize_lncRNA_prediction/pyrpipe_logs/2021-01-01-16_47_00.489627_52103_pyrpipe.log[0m


## Download Genome and GTF

In [2]:
# Ensembl Plants release 46 files for Zea mays B73 RefGen_v4.
GENOME_URL = "ftp://ftp.ensemblgenomes.org/pub/release-46/plants/fasta/zea_mays/dna/Zea_mays.B73_RefGen_v4.dna.toplevel.fa.gz"
GTF_URL = "ftp://ftp.ensemblgenomes.org/pub/release-46/plants/gtf/zea_mays/Zea_mays.B73_RefGen_v4.46.gtf.gz"

# Local (uncompressed) destinations inside the working directory.
GENOME = workingDir + "/Zea_mays.B73_RefGen_v4.dna.toplevel.fa"
GTF = workingDir + "/Zea_mays.B73_RefGen_v4.46.gtf"


def _download_and_gunzip(url, dest):
    """Download the gzipped file at ``url`` to ``dest + ".gz"`` and gunzip it.

    Args:
        url: Address of the ``.gz`` file, passed to ``wget``.
        dest: Path of the uncompressed file; ``gunzip`` is run on
            ``dest + ".gz"``.
    """
    archive = dest + ".gz"
    # Commands are built as argument lists (not "…".split()) so a destination
    # path containing whitespace is still passed as a single argument.
    pe.execute_command(["wget", url, "-q", "-O", archive], verbose=False, logs=False)
    pe.execute_command(["gunzip", archive], verbose=False, logs=False)


# Skip the downloads when the files are already present, so re-running the
# cell does not fetch them again.
if not pu.check_files_exist(GENOME):
    print("Downloading genome fasta file")
    _download_and_gunzip(GENOME_URL, GENOME)

if not pu.check_files_exist(GTF):
    print("Downloading GTF file")
    _download_and_gunzip(GTF_URL, GTF)


Downloading genome fasta file


[93mStart:21-01-01 16:48:32[0m
[96m$ wget ftp://ftp.ensemblgenomes.org/pub/release-46/plants/fasta/zea_mays/dna/Zea_mays.B73_RefGen_v4.dna.toplevel.fa.gz -q -O maize_out/Zea_mays.B73_RefGen_v4.dna.toplevel.fa.gz[0m
[93mEnd:21-01-01 16:48:56[0m
[92mTime taken:0:00:24[0m
[93mStart:21-01-01 16:48:56[0m
[96m$ gunzip maize_out/Zea_mays.B73_RefGen_v4.dna.toplevel.fa.gz[0m
[93mEnd:21-01-01 16:49:11[0m
[92mTime taken:0:00:15[0m
[93mStart:21-01-01 16:49:11[0m
[96m$ wget ftp://ftp.ensemblgenomes.org/pub/release-46/plants/gtf/zea_mays/Zea_mays.B73_RefGen_v4.46.gtf.gz -q -O maize_out/Zea_mays.B73_RefGen_v4.46.gtf.gz[0m


Downloading GTF file


[93mEnd:21-01-01 16:49:18[0m
[92mTime taken:0:00:07[0m
[93mStart:21-01-01 16:49:18[0m
[96m$ gunzip maize_out/Zea_mays.B73_RefGen_v4.46.gtf.gz[0m
[93mEnd:21-01-01 16:49:20[0m
[92mTime taken:0:00:03[0m


## Download data, pre-process

Similar to the *A. thaliana* example, we will create SRA objects to download the fastq files. Then, we will use `trim_galore` to perform trimming by creating a `Trimgalore` object.

In [4]:
# One Trimgalore object is shared by every run.
tg = qc.Trimgalore()

# Build an SRA object per accession and trim its reads; the objects are kept
# in sraObjects for the alignment/assembly steps that follow.
sraObjects = []
for srr_accession in runs:
    sra_run = sra.SRA(srr_accession, workingDir)
    sra_run.trim(tg)
    sraObjects.append(sra_run)

[93mStart:21-01-01 16:53:21[0m
[96m$ prefetch -O maize_out/SRR765545 SRR765545[0m
[93mEnd:21-01-01 16:53:28[0m
[92mTime taken:0:00:08[0m
[93mStart:21-01-01 16:53:29[0m
[96m$ fasterq-dump -O maize_out/SRR765545 -o SRR765545.fastq -e 6 -f maize_out/SRR765545/SRR765545.sra[0m
[93mEnd:21-01-01 16:53:57[0m
[92mTime taken:0:00:28[0m
[93mStart:21-01-01 16:53:57[0m
[96m$ trim_galore --cores 6 --paired -o maize_out/SRR765545 maize_out/SRR765545/SRR765545_1.fastq maize_out/SRR765545/SRR765545_2.fastq[0m
[93mEnd:21-01-01 16:54:43[0m
[92mTime taken:0:00:46[0m


##  STAR Alignment and transcript assembly using StringTie

Now we will align the trimmed fastq files using STAR. pyrpipe provides the `Star` class via the `mapping` module to use STAR in python. We will create a `Star` object and pass it to the `align` function.

We are providing the index as `workingDir+"/starindex"`. If this index doesn't exist pyrpipe will create one using the genome. Additional STAR parameters specified in the `./params/star.yaml` file will be loaded automatically.

`./params/star.yaml` file contains:

```
--outFilterType : BySJout
--runThreadN: 6
--outSAMtype: BAM SortedByCoordinate

```

**Note: It is recommended that users generate their index using appropriate parameters. Parameters to be used while building an index could be stored in star_index.yaml files and pyrpipe will automatically load them if building a new index.**

To reduce RAM consumption while generating the STAR index, the `--genomeChrBinNbits 5` option is added to `star_index.yaml`.

To perform transcript assembly using stringtie, we create a `Stringtie` object.
The `align()` and `assemble()` functions can be chained, so we can write a one-liner to perform alignment and assembly.

The `align()` method performs alignment and the resulting BAM file is stored in the `SRA` object as the `bam_path` attribute. The `assemble()` function requires the `bam_path` attribute and uses it to perform transcript assembly using the provided `Assembler` object (stringtie in this example).

In [None]:
# STAR aligner; further parameters can go into ./params/star.yaml
# NOTE(review): threads=3 is passed here while the star.yaml shown above sets
# --runThreadN 6 -- confirm which value takes effect.
star = mapping.Star(index=workingDir + "/starindex", genome=GENOME, threads=3)
# StringTie assembler
st = assembly.Stringtie()

# Align each run and assemble its transcripts in one chained call, then
# collect the path stored in the object's gtf attribute.
gtfList = []
for sra_run in sraObjects:
    sra_run.align(star).assemble(st)
    gtfList.append(sra_run.gtf)

print(gtfList)


The above output shows that a STAR index was generated first. Then, the options present in the `star.yaml` file were loaded and passed to the STAR command.

## lncRNA prediction using PLncPRO
We will use [PLncPRO](https://github.com/urmi-21/PLncPRO) for prediction of lncRNAs. Currently, PLncPRO is not integrated into `pyrpipe` so we will use the `pyrpipe_engine` module directly to execute.

In [None]:
# PLncPRO is not wrapped by pyrpipe, so it is installed and run through
# pyrpipe_engine, which is already imported as `pe` in the first cell.
#install plncpro
pe.execute_command(["pip", "install", "plncpro"], verbose=True, quiet=False, logs=False)
#OR
#!pip install plncpro

# Inputs for the prediction step.
# NOTE(review): this genome FASTA is a different file from GENOME downloaded
# above, and neither it nor the model / BLAST database is created by any cell
# shown here -- confirm these paths exist before running.
genome = "maize_data/Zea_mays.B73_RefGen_v4.dna.toplevel.1_10.fa"
model = "monocot_model/monocot.model"
blastdb = "uniprot/uniprotdb"
# Value passed to plncpro via -p; the same for every run.
outfile = "plncpro_predictions"

# sraObjects and gtfList are parallel lists (one GTF per SRA object).
for thisOb, gtf in zip(sraObjects, gtfList):
    #first extract transcripts using gffread
    tx_file = thisOb.location + "/transcripts.fa"
    # Use the `genome` variable rather than repeating the path literal, and
    # pass an argument list so paths are never split on whitespace.
    gffread_cmd = ["gffread", "-w", tx_file, "-g", genome, gtf]
    pe.execute_command(gffread_cmd, verbose=False, quiet=False, logs=True,
                       objectid=thisOb.srr_accession, command_name="gffread")

    #Optional step: use biopython (from Bio import SeqIO) to filter transcripts by length
    #out_file=thisOb.location+"/transcripts_filter.fa"
    #with open(out_file, "w") as output_handle:
    #    for record in SeqIO.parse(tx_file, "fasta"):
    #        # keep transcripts of length 500 to 1000
    #        if 500 <= len(record) <= 1000:
    #            SeqIO.write(record, output_handle, "fasta")

    #run plncpro
    outdir = thisOb.location + "/plncpro_out"
    plncpro_cmd = [
        "plncpro", "predict",
        "-i", tx_file,
        "-o", outdir,
        "-p", outfile,
        "-t", "25",
        "--min_len", "200",
        "-d", blastdb,
        "-m", model,
        "-v", "-r",
    ]
    pe.execute_command(plncpro_cmd, verbose=False, quiet=False, logs=True,
                       objectid=thisOb.srr_accession, command_name="plncpro predict")
        


## Generate reports

In [None]:
#NOTE: Following commands are executed in shell, hence the ! before each command
!pyrpipe_diagnostic.py report pyrpipe_logs/2020-01-22-18_14_47_pyrpipe.log
!pyrpipe_diagnostic.py benchmark pyrpipe_logs/2020-01-22-18_14_47_pyrpipe.log
!pyrpipe_diagnostic.py shell pyrpipe_logs/2020-01-22-18_14_47_pyrpipe.log
!pyrpipe_diagnostic.py multiqc -o ./multiqc_report pyrpipe_logs/2020-01-22-18_14_47_pyrpipe.log
