# Overview
     0) Download Genome Fasta and Annotation GTF
     1) Clean FASTQ with fastp
     2) STAR Generate Genome
     3) STAR Align
     4) Cufflinks

# Download Reference Genome and Annotations
https://uswest.ensembl.org/Taeniopygia_guttata/Info/Index

In [None]:
# Create the reference directory tree (idempotent) and download the Ensembl
# release-96 zebra finch genome FASTA directory into GenomeFasta/.
mkdir -p /bigstore/binfo/zebra_finch/GenomeFasta/
cd /bigstore/binfo/zebra_finch/GenomeFasta/
# rsync needs an explicit destination: given only a source it lists the
# remote files and copies nothing. "." is the GenomeFasta/ directory entered above.
rsync -av rsync://ftp.ensembl.org/pub/release-96/fasta/taeniopygia_guttata/dna/ .

In [None]:
# Download the Ensembl release-96 zebra finch GTF annotation directory into
# the project root.
cd /bigstore/binfo/zebra_finch/
# rsync needs an explicit destination: given only a source it lists the
# remote files and copies nothing.
rsync -av rsync://ftp.ensembl.org/pub/release-96/gtf/taeniopygia_guttata/ .
# NOTE(review): the STAR genomeGenerate step reads an uncompressed
# Taeniopygia_guttata.taeGut3.2.4.96.gtf from this directory; if the download
# arrives gzip-compressed it must be gunzipped first -- confirm.

In [None]:
# Decompress each downloaded per-chromosome FASTA. The glob is limited to
# *.gz so a re-run skips files that are already decompressed, and "$f" is
# quoted so the path is passed to gunzip as a single argument.
!for f in /bigstore/binfo/zebra_finch/GenomeFasta/Taeniopygia_guttata.taeGut3.2.4.dna.chromosome*.gz ; do gunzip "$f" ; done

In [5]:
# Concatenate the per-chromosome FASTA files into a single primary-assembly
# FASTA. The result is written to the project root because that is the path
# the STAR genomeGenerate cells pass to --genomeFastaFiles.
!cat /bigstore/binfo/zebra_finch/GenomeFasta/Taeniopygia_guttata.taeGut3.2.4.dna.chromosome* > '/bigstore/binfo/zebra_finch/Taeniopygia_guttata.taeGut3.2.4.dna.primary_assembly.fa'

# STAR Generate Genome
    https://github.com/alexdobin/STAR
    conda install -c bioconda star

In [None]:
# STAR Generate Genome
# Readable multi-line form of the genomeGenerate command. Every option line
# ends in a backslash so the shell reads the whole thing as ONE STAR
# invocation; without the continuations the shell would run a bare "STAR" and
# then try to execute each "--option" line as its own command.
STAR \
  --runThreadN 32 \
  --runMode genomeGenerate \
  --genomeDir /bigstore/binfo/zebra_finch/STARgenomeDir \
  --genomeFastaFiles /bigstore/binfo/zebra_finch/Taeniopygia_guttata.taeGut3.2.4.dna.primary_assembly.fa \
  --sjdbGTFfile /bigstore/binfo/zebra_finch/Taeniopygia_guttata.taeGut3.2.4.96.gtf

In [None]:
# Build the STAR genome index into STARgenomeDir from the merged primary-assembly
# FASTA and the GTF annotation (--runThreadN 32). Kept on one line because the
# IPython "!" shell escape applies to a single line.
!STAR --runThreadN 32 --runMode genomeGenerate --genomeDir /bigstore/binfo/zebra_finch/STARgenomeDir --genomeFastaFiles /bigstore/binfo/zebra_finch/Taeniopygia_guttata.taeGut3.2.4.dna.primary_assembly.fa --sjdbGTFfile /bigstore/binfo/zebra_finch/Taeniopygia_guttata.taeGut3.2.4.96.gtf

Apr 28 14:38:15 ..... started STAR run
Apr 28 14:38:15 ... starting to generate Genome files
Apr 28 14:38:50 ... starting to sort Suffix Array. This may take a long time...
Apr 28 14:39:01 ... sorting Suffix Array chunks and saving them to disk...


# Retrieved Files from Caitlin


In [None]:
# Create the directory that holds the raw reads. -p creates
# RegionXExpression/ and Raw/ in one call and does not fail when the cell is
# re-run and the directories already exist.
!mkdir -p /bigstore/binfo/zebra_finch/RegionXExpression/Raw/

In [7]:
# Decompress every raw read file (*.fastq.gz) in Raw/. "$f" is quoted so each
# path reaches gunzip as a single argument.
!for f in /bigstore/binfo/zebra_finch/RegionXExpression/Raw/*.fastq.gz ; do gunzip "$f" ; done

# Clean FASTQ Files: fastp
    https://github.com/OpenGene/fastp
    conda install -c bioconda fastp

In [None]:
# Clean the raw reads with fastp and write them to Fastp/ under the same file
# names (<sample>_R1_001.fastq / <sample>_R2_001.fastq), which is what the
# STAR align step reads.
#  - The output name is built from the sample prefix, not from "$f": "$f" is
#    the full input path, so Fastp/$f pointed at a non-existent nested path.
#  - Both mates go through ONE fastp call (-i/-I in, -o/-O out) so that a read
#    dropped from R1 is also dropped from R2 and the pair files stay in sync.
!mkdir -p /bigstore/binfo/zebra_finch/RegionXExpression/Fastp/
!for r1 in /bigstore/binfo/zebra_finch/RegionXExpression/Raw/*_R1_001.fastq ; do sample=$(basename "$r1" _R1_001.fastq) ; fastp -i "$r1" -I /bigstore/binfo/zebra_finch/RegionXExpression/Raw/"$sample"_R2_001.fastq -o /bigstore/binfo/zebra_finch/RegionXExpression/Fastp/"$sample"_R1_001.fastq -O /bigstore/binfo/zebra_finch/RegionXExpression/Fastp/"$sample"_R2_001.fastq ; done

# STAR Align

In [9]:
# Generate samples list for later use.
# Every raw read file is named <sample>_R1_001.fastq / <sample>_R2_001.fastq;
# the text before '_R' is the sample prefix shared by both mates, so each
# prefix is recorded once.
import os
import pandas as pd

pth = '/bigstore/binfo/zebra_finch/RegionXExpression/Raw'
files = []
# os.listdir returns entries in arbitrary order; sort for a reproducible list.
for f in sorted(os.listdir(pth)):
    if '.fastq' in f:
        fn = f.split('_R')[0]
        if fn not in files:
            files.append(fn)
            print(fn)
files = pd.Series(data=files)
# The STAR align loops read this file one sample prefix per line, from
# RegionXExpression/Samples.csv. index must be the boolean False (the string
# 'False' is truthy and would still write the index column), and the header
# row is suppressed so that every line is exactly one sample prefix.
files.to_csv('/bigstore/binfo/zebra_finch/RegionXExpression/Samples.csv', index=False, header=False)

['Blue623X_S6_L001',
 'Blue623X_S6_L002',
 'Brown443X_S79_L007',
 'Brown443X_S79_L008',
 'LtPink78X_S40_L003',
 'LtPink78X_S40_L004',
 'LtPink82X_S67_L007',
 'LtPink82X_S67_L008',
 'Purple405X_S68_L007',
 'Purple405X_S68_L008',
 'Purple418X_S42_L003',
 'Purple418X_S42_L004']

In [None]:
# STAR Align
# Readable multi-line form of the alignment loop: run STAR once per sample
# prefix listed (one per line) in Samples.csv.
#  - Option lines end in a backslash so each iteration is ONE STAR command.
#  - Reads are the uncompressed <sample>_R{1,2}_001.fastq files in Fastp/, so
#    no --readFilesCommand is needed.
#  - --genomeLoad LoadAndKeep is left out: STAR rejects shared-memory genome
#    loading together with coordinate-sorted BAM output unless
#    --limitBAMsortRAM is also given.
#  - read -r keeps backslashes literal; the list is redirected into the loop
#    instead of being piped through cat.
# The output prefix lives under Aligned/, so make sure that directory exists.
mkdir -p /bigstore/binfo/zebra_finch/RegionXExpression/Aligned/
while IFS= read -r line
do
  STAR \
    --runThreadN 32 \
    --runMode alignReads \
    --genomeDir /bigstore/binfo/zebra_finch/STARgenomeDir \
    --readFilesIn /bigstore/binfo/zebra_finch/RegionXExpression/Fastp/"$line"_R1_001.fastq /bigstore/binfo/zebra_finch/RegionXExpression/Fastp/"$line"_R2_001.fastq \
    --outFileNamePrefix /bigstore/binfo/zebra_finch/RegionXExpression/Aligned/"$line" \
    --outSAMtype BAM SortedByCoordinate \
    --quantMode TranscriptomeSAM GeneCounts
done < /bigstore/binfo/zebra_finch/RegionXExpression/Samples.csv

In [None]:
!cat /bigstore/binfo/zebra_finch/RegionXExpression/Samples.csv | while read line
do
STAR --runThreadN 32 --runMode alignReads --genomeDir /bigstore/binfo/zebra_finch/STARgenomeDir/ --readFilesIn /bigstore/binfo/zebra_finch/RegionXExpression/Fastp/"$line"_R1_001.fastq /bigstore/binfo/zebra_finch/RegionXExpression/Fastp/"$line"_R2_001.fastq --outFileNamePrefix /bigstore/binfo/zebra_finch/RegionXExpression/Aligned/$line --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM GeneCounts
done

In [None]:
# One-off alignment of a single sample (Blue623X_S6_L001): same options as the
# per-sample loop, written one option per line with backslash continuations.
STAR \
  --runThreadN 32 \
  --genomeDir /bigstore/binfo/zebra_finch/STARgenomeDir/ \
  --readFilesIn /bigstore/binfo/zebra_finch/RegionXExpression/Fastp/Blue623X_S6_L001_R1_001.fastq /bigstore/binfo/zebra_finch/RegionXExpression/Fastp/Blue623X_S6_L001_R2_001.fastq \
  --outFileNamePrefix /bigstore/binfo/zebra_finch/RegionXExpression/Aligned/Blue623X_S6_L001 \
  --outSAMtype BAM SortedByCoordinate \
  --quantMode TranscriptomeSAM GeneCounts

# Cufflinks
    https://github.com/cole-trapnell-lab/cufflinks
    conda install -c bioconda cufflinks