In [None]:
# PART 1: Project Setup
# Every later step refers to ~/RNASeq_PROJECT by absolute path, so the project
# is anchored in the home directory here rather than in the current directory.
mkdir -p ~/RNASeq_PROJECT/DATA        # Create project and raw-data directories (safe to re-run)
cd ~/RNASeq_PROJECT/DATA              # Work inside the raw-data directory
# NOTE: DATA.sh is not created by this workflow. Copy it into
# ~/RNASeq_PROJECT/DATA before running the next two commands.
chmod +x DATA.sh                      # Make the data preparation script executable
./DATA.sh                             # Download the data files


In [None]:
# PART 2: Install FastQC for Quality Check
# -y answers the confirmation prompts so the cell does not stall when it is
# executed non-interactively.
sudo apt update                       # Update package lists
sudo apt upgrade -y                   # Upgrade installed packages
sudo apt install -y fastqc            # Install FastQC
# Absolute path: this must be the same directory that pscript.sh uses as
# OUTPUT_DIR, independent of the directory the shell is currently in.
mkdir -p ~/RNASeq_PROJECT/fastqc-results


In [None]:
# PART 3: Run FastQC on Raw Data
# The script is written with a here-document so the step is reproducible from
# this cell alone (no manual editing in an external editor is required).
# The quoted 'EOF' delimiter keeps $variables and ~ literal inside the file.
cat > pscript.sh <<'EOF'
#!/bin/bash
# Run FastQC on every gzipped FASTQ file found in DATA_DIR.
set -euo pipefail                     # Stop on the first failing command

DATA_DIR=~/RNASeq_PROJECT/DATA
OUTPUT_DIR=~/RNASeq_PROJECT/fastqc-results

mkdir -p "$OUTPUT_DIR"                # FastQC does not create its output directory
shopt -s nullglob                     # No matches -> loop runs zero times instead of
                                      # passing the literal '*.fastq.gz' to FastQC

for file in "$DATA_DIR"/*.fastq.gz
do
    echo "Running FastQC on $file"
    fastqc "$file" -o "$OUTPUT_DIR"
done
echo "FastQC analysis complete. Results are in the $OUTPUT_DIR directory."
EOF
chmod +x pscript.sh                   # Make the script executable
./pscript.sh                          # Run the script

In [None]:
# PART 4: Summarize FastQC Results
# Absolute path: this is the OUTPUT_DIR that pscript.sh writes its reports to.
cd ~/RNASeq_PROJECT/fastqc-results    # Navigate to FastQC results directory
pip install multiqc                   # Install MultiQC
multiqc .                             # Combine and summarize FastQC results


In [None]:
# PART 5: Mapping Reads to Reference Genome
# One absolute directory is used for download and extraction so the archive is
# saved exactly where tar reads it from, and where mapping.sh expects the
# extracted grch38_tran/ files.
mkdir -p ~/RNASeq_PROJECT/pmapping    # Create a directory for mapping
cd ~/RNASeq_PROJECT/pmapping          # Enter the mapping directory
# Download the reference archive; -c resumes a partial download on re-run.
wget -c -P ~/RNASeq_PROJECT/pmapping https://genome-idx.s3.amazonaws.com/hisat/grch38_tran.tar.gz
# Extract reference genome files into the mapping directory.
tar -xvf ~/RNASeq_PROJECT/pmapping/grch38_tran.tar.gz -C ~/RNASeq_PROJECT/pmapping
# Create mapping.sh with a here-document (reproducible, no manual editing),
# then make it executable and run it. The quoted 'EOF' delimiter keeps
# $variables and ~ literal inside the file.
cat > mapping.sh <<'EOF'
#!/bin/bash
# Align each paired-end sample to the reference with HISAT2 and keep the
# alignments as BAM. One loop replaces the per-sample copy-pasted stanzas.
set -euo pipefail                     # Stop on the first failing command

DATA_DIR=~/RNASeq_PROJECT/DATA
MAP_DIR=~/RNASeq_PROJECT/pmapping
INDEX=$MAP_DIR/grch38_tran/genome_tran
SAMPLES=(SRR30861166 SRR30861167 SRR30861168 SRR30861169 SRR30861170 SRR30861171)

mkdir -p "$MAP_DIR"

for sample in "${SAMPLES[@]}"
do
    echo "Mapping Reads: $sample"
    # --rna-strandness RF must stay in sync with '-s 2' in fcount.sh.
    # --new-summary/--summary-file write the alignment summary to a fixed
    # location in MAP_DIR in the format MultiQC recognises as HISAT2.
    hisat2 \
        -x "$INDEX" \
        -1 "$DATA_DIR/${sample}_1.fastq.gz" \
        -2 "$DATA_DIR/${sample}_2.fastq.gz" \
        --rna-strandness RF \
        --new-summary --summary-file "$MAP_DIR/${sample}_summary.txt" \
        -S "$MAP_DIR/$sample.sam"

    # Convert SAM to BAM; with 'set -e' the SAM file is only removed when the
    # conversion succeeded.
    samtools view -b -o "$MAP_DIR/$sample.bam" "$MAP_DIR/$sample.sam"
    rm "$MAP_DIR/$sample.sam"
done
EOF
chmod +x mapping.sh                   # Make the script executable
./mapping.sh                          # Run the mapping



In [None]:
# PART 6: Analyze Mapping Quality
# Absolute path so the step works regardless of the current directory
# (a relative 'cd pmapping' fails when the shell is already inside pmapping).
cd ~/RNASeq_PROJECT/pmapping          # Enter pmapping directory
multiqc *.txt                         # Combine and analyze mapping results

In [None]:
# PART 7: Download Annotation File
# Absolute path: fcount.sh reads the GTF from ~/RNASeq_PROJECT/counts.
mkdir -p ~/RNASeq_PROJECT/counts      # Create a directory for counts (safe to re-run)
cd ~/RNASeq_PROJECT/counts            # Navigate to the counts directory
# Download GTF annotation file; -c resumes a partial download on re-run.
wget -c https://ftp.ensembl.org/pub/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh38.113.gtf.gz

In [None]:
# PART 8: Run FeatureCounts
# Create fcount.sh with a here-document (reproducible, no manual editing),
# then make it executable and run it. The quoted 'EOF' delimiter keeps
# $variables and ~ literal inside the file.
cat > fcount.sh <<'EOF'
#!/bin/bash
# Count read pairs per gene for all six BAM files, then reduce the table to
# the gene id plus one count column per sample.
set -euo pipefail                     # Stop on the first failing command

COUNTS_DIR=~/RNASeq_PROJECT/counts
MAP_DIR=~/RNASeq_PROJECT/pmapping

mkdir -p "$COUNTS_DIR"

# -T 4: four threads; -s 2: reverse-stranded (pairs with --rna-strandness RF
# in mapping.sh); -p --countReadPairs: paired-end input, count fragments.
# The BAM order below fixes the order of the count columns in counts.txt.
featureCounts -T 4 -s 2 -p --countReadPairs \
    -a "$COUNTS_DIR/Homo_sapiens.GRCh38.113.gtf.gz" \
    -t exon -g gene_id \
    -o "$COUNTS_DIR/counts.txt" \
    "$MAP_DIR/SRR30861169.bam" \
    "$MAP_DIR/SRR30861170.bam" \
    "$MAP_DIR/SRR30861171.bam" \
    "$MAP_DIR/SRR30861166.bam" \
    "$MAP_DIR/SRR30861167.bam" \
    "$MAP_DIR/SRR30861168.bam"

# Drop the '#' comment line(s) at the top of counts.txt.
grep -v "^#" "$COUNTS_DIR/counts.txt" > "$COUNTS_DIR/counts_no_header.txt"
# Keep column 1 and columns 7-12 (one per BAM file listed above).
cut -f 1,7-12 "$COUNTS_DIR/counts_no_header.txt" > "$COUNTS_DIR/counts_filtered.txt"
EOF
chmod +x fcount.sh                    # Make the script executable
./fcount.sh                           # Run featureCounts and the post-processing