In [1]:
## Pin the R session's working directory.
## `mydir` is kept as a session variable holding the directory the kernel
## started in, and the working directory is (re)set to that same location.
mydir <- getwd()
setwd(dir = mydir)

In [None]:
##setting work directory for bash
%use bash
# Record the directory the bash kernel started in and make it the working
# directory. $(...) replaces the legacy backtick form, and the expansion is
# quoted so a path containing spaces or glob characters is not word-split.
mydir="$(pwd)"
cd "$mydir"

In [None]:
##loading R libraries
.libPaths(c("/anvil/projects/x-tra220018/2022/Rlibs", .libPaths()))
library(BiocParallel)
library(dplyr)
library(EnsDb.Hsapiens.v86)
library(GenomicAlignments)
library(GenomicRanges) 
library(Gviz)
library(parallel)
library(Rsamtools)
library(ShortRead)

In [None]:
#checking quality of single-end data
#%use bash
# Step 1: run FastQC (20 threads) on every raw FASTQ file.
module load fastqc/0.11.9
data_path=/home/x-tsuzuki/bigcare/myproject/raw
# FastQC does not create its output directory and aborts when it is missing,
# so make sure it exists before the run (no-op if it is already there).
mkdir -p ./data/fastqc/raw/
fastqc -t 20 "$data_path"/*.fastq.gz -o ./data/fastqc/raw/

# Step 2: aggregate the individual FastQC reports into one MultiQC report,
# written next to the FastQC output.
# NOTE(review): this absolute path only matches ./data/fastqc/raw/ above when
# the notebook is run from /home/x-tsuzuki/bigcare/myproject -- confirm.
module load multiqc
data_path=/home/x-tsuzuki/bigcare/myproject/data/fastqc/raw
multiqc "$data_path" -o "$data_path"

In [None]:
# trimming adapters from reads
# Runs Trimmomatic in single-end mode on each raw library and writes the
# trimmed reads, under the same file name, to $out_path.
#
# Trimming steps (applied in the order given on the command line):
#   ILLUMINACLIP:<adapters>:2:30:10  clip TruSeq2 single-end adapters
#   LEADING:10 / TRAILING:10         drop low-quality bases at the read ends
#   SLIDINGWINDOW:4:20               cut once the 4-base window mean drops below 20
#   MINLEN:20                        discard reads shorter than 20 bases
module load trimmomatic/0.39
data_path=/home/x-tsuzuki/bigcare/myproject/raw
out_path=./data/trim
adapters=/home/x-tsuzuki/bigcare/ref_files/TruSeq2-SE.fa

# Make sure the output directory exists so a fresh run does not fail on the
# first write.
mkdir -p "$out_path"

# One loop instead of six copy-pasted commands: every sample is guaranteed to
# get identical settings. (The hand-written PDX4_CR3 command had lost its
# "-threads 15" option.)
for sample in \
    PDX4_CR1_S7_R1_001 \
    PDX4_CR2_S8_R1_001 \
    PDX4_CR3_S9_R1_001 \
    PDX4_SE1_S10_R1_001 \
    PDX4_SE2_S11_R1_001 \
    PDX4_SE3_S12_R1_001
do
    trimmomatic SE -phred33 -threads 15 \
        "$data_path/${sample}.fastq.gz" \
        "$out_path/${sample}.fastq.gz" \
        "ILLUMINACLIP:${adapters}:2:30:10" \
        LEADING:10 TRAILING:10 SLIDINGWINDOW:4:20 MINLEN:20
done

In [None]:
#quality control after trimming single-end
# Re-run FastQC (20 threads) on the trimmed reads to confirm adapter and
# quality trimming had the intended effect.
module load fastqc/0.11.9
data_path=/home/x-tsuzuki/bigcare/myproject/data
# FastQC does not create its output directory and aborts when it is missing,
# so make sure it exists before the run (no-op if it is already there).
mkdir -p ./data/fastqc/trim/
fastqc -t 20 "$data_path"/trim/*.fastq.gz -o ./data/fastqc/trim/

In [None]:
# Build the STAR genome index from the Ensembl GRCh38 primary assembly and
# the Ensembl release-109 gene annotation.
# (This and the following steps were based on GeneLab's pipeline.)
# NOTE(review): --sjdbOverhang 75 presumably corresponds to read length - 1;
# confirm against the actual read length of the libraries.
conda activate apps
module load star/2.7.10a
STAR \
    --runMode genomeGenerate \
    --runThreadN 20 \
    --genomeDir /home/x-tsuzuki/bigcare/ref_files/genomeassembly \
    --genomeFastaFiles /home/x-tsuzuki/Homo_sapiens.GRCh38.dna.primary_assembly.fa \
    --sjdbGTFfile /home/x-tsuzuki/Homo_sapiens.GRCh38.109.gtf \
    --sjdbOverhang 75

In [None]:
##aligning single-end data to genome
# Aligns every trimmed single-end library to the indexed genome with STAR in
# two-pass mode. For each library STAR is asked for a coordinate-sorted BAM
# (unmapped reads kept inside it) plus transcriptome alignments and per-gene
# read counts (--quantMode TranscriptomeSAM GeneCounts).
#
# The six libraries are processed by one loop instead of six copy-pasted
# command blocks, so every sample is guaranteed to use identical parameters.
module load star/2.7.10a
data_path=./data/trim
out_path=./data/STAR/single
genome_dir=/home/x-tsuzuki/bigcare/ref_files/genomeassembly

# Make sure the output directory exists so a fresh run can write its results.
mkdir -p "$out_path"

for run in \
    PDX4_CR1_S7_R1_001 \
    PDX4_CR2_S8_R1_001 \
    PDX4_CR3_S9_R1_001 \
    PDX4_SE1_S10_R1_001 \
    PDX4_SE2_S11_R1_001 \
    PDX4_SE3_S12_R1_001
do
    # Output prefix = run name with the "_S<index>_R1_001" tail removed,
    # e.g. PDX4_SE1_S10_R1_001 -> PDX4_SE1 (files become PDX4_SE1_Aligned...).
    sample=${run%%_S[0-9]*}

    STAR --twopassMode Basic \
        --genomeDir "$genome_dir" \
        --outSAMunmapped Within \
        --outFilterType BySJout \
        --outSAMattributes NH HI AS NM MD MC \
        --outFilterMultimapNmax 20 \
        --outFilterMismatchNmax 999 \
        --outFilterMismatchNoverReadLmax 0.04 \
        --alignIntronMin 20 \
        --alignIntronMax 1000000 \
        --alignSJoverhangMin 8 \
        --alignSJDBoverhangMin 1 \
        --sjdbScore 1 \
        --readFilesCommand zcat \
        --runThreadN 20 \
        --outSAMtype BAM SortedByCoordinate \
        --quantMode TranscriptomeSAM GeneCounts \
        --outSAMheaderHD @HD VN:1.4 SO:coordinate \
        --outFileNamePrefix "$out_path/${sample}_" \
        --readFilesIn "$data_path/${run}.fastq.gz"
done

In [None]:
## sort aligned reads
# Coordinate-sorts each STAR alignment file (3G of memory per thread, 20
# threads) and leaves the result under the original file name.
#
# The sort is written to a temporary BAM first and only moved over the
# original after samtools exits successfully. Sorting with the same path as
# both input and -o output means a failed or interrupted run destroys the
# only copy of the alignments.
# NOTE(review): STAR was already run with "--outSAMtype BAM SortedByCoordinate",
# so this re-sort looks redundant -- confirm whether it is still needed.
conda activate apps
data_path=/home/x-tsuzuki/bigcare/myproject/data/STAR/single

for sample in PDX4_CR1 PDX4_CR2 PDX4_CR3 PDX4_SE1 PDX4_SE2 PDX4_SE3; do
    bam="$data_path/${sample}_Aligned.sortedByCoord.out.bam"
    # Temporary output keeps the .bam extension so the output format is unchanged.
    tmp_bam="$data_path/${sample}_Aligned.resorting.tmp.bam"

    if samtools sort -m 3G --threads 20 -o "$tmp_bam" "$bam"; then
        mv -f "$tmp_bam" "$bam"
    else
        echo "samtools sort failed for $bam; original file left untouched" >&2
        rm -f "$tmp_bam"
    fi
done

In [None]:
## indexing alignments
# Build an index (20 threads) for each coordinate-sorted BAM so that
# downstream tools can access the alignments by genomic region.
module load samtools/1.12
data_path=/home/x-tsuzuki/bigcare/myproject/data/STAR/single
for sample in PDX4_CR1 PDX4_CR2 PDX4_CR3 PDX4_SE1 PDX4_SE2 PDX4_SE3; do
    samtools index -@ 20 "$data_path/${sample}_Aligned.sortedByCoord.out.bam"
done

In [None]:
#counting expression with htseq
# Counts reads per gene (exon features grouped by gene_id, union mode,
# ambiguous and multi-mapped reads not counted) for each sample and writes
# one <sample>.tsv into the current working directory.
#
# Fixes relative to the copy-pasted version:
#   * Input BAMs now point at the files the alignment, sort and index cells
#     actually produce: <data>/STAR/single/<sample>_Aligned.sortedByCoord.out.bam
#     (previously .../single/Trimmomatic/<sample>_SAligned..., which no cell
#     in this notebook creates).
#   * Counting uses the same Ensembl release-109 GTF the STAR index was built
#     with, instead of a release-105 file, so gene models agree between
#     alignment and quantification.
#   * Module load and paths are set once; samples are processed in a loop.
#
# NOTE(review): "-s yes" assumes a forward-stranded library -- confirm against
#   the library prep kit ("reverse" or "no" may be required).
# NOTE(review): "-n 20" is kept from the original; with a single BAM per call
#   it presumably gives little benefit -- verify against the htseq-count docs.
module load htseq/2.0.2
data_path=/home/x-tsuzuki/bigcare/myproject/data/STAR/single
gtf=/home/x-tsuzuki/Homo_sapiens.GRCh38.109.gtf

for sample in PDX4_CR1 PDX4_CR2 PDX4_CR3 PDX4_SE1 PDX4_SE2 PDX4_SE3; do
    python -m HTSeq.scripts.count \
           -f bam -r pos -s yes -t exon -i gene_id --mode=union --nonunique=none \
           -c "${sample}.tsv" -n 20 \
           "$data_path/${sample}_Aligned.sortedByCoord.out.bam" \
           "$gtf"
done