In [None]:
# PART 1: Project Setup
# The project is created in the home directory because the scripts in the later
# parts refer to it through absolute ~/RNASeq_PROJECT/... paths.
mkdir -p ~/RNASeq_PROJECT/DATA        # Create project + raw data directories (safe to re-run)
cd ~/RNASeq_PROJECT/DATA              # Enter the raw data directory
ls                                    # Check the contents of the directory
# NOTE(review): nothing visible in this notebook creates DATA.sh; presumably it
# has to be copied into ~/RNASeq_PROJECT/DATA before the next two commands run.
chmod +x DATA.sh                      # Make the data preparation script executable
./DATA.sh                             # Run the data preparation script
ls                                    # Verify files are present after running the script


In [None]:
# PART 2: Install FastQC for Quality Check
# -y answers the confirmation prompts so the commands do not block when they are
# run without an interactive terminal.
sudo apt update                           # Update package lists
sudo apt upgrade -y                       # Upgrade installed packages
sudo apt install -y fastqc                # Install FastQC
# Use the absolute path: this is the directory pscript.sh writes its reports to,
# regardless of which directory the previous cell left us in.
mkdir -p ~/RNASeq_PROJECT/fastqc-results  # Create directory for FastQC results
ls ~/RNASeq_PROJECT                       # Verify folder creation


In [None]:
# PART 3: Run FastQC on Raw Data
# The script is written with a here-document instead of being pasted into an
# editor by hand, so this cell can be re-run unattended and always produces the
# same pscript.sh. The quoted 'EOF' keeps $variables literal inside the script.
cat > pscript.sh <<'EOF'
#!/bin/bash
# Runs FastQC on every *.fastq.gz file in DATA_DIR and writes the reports to
# OUTPUT_DIR.
set -euo pipefail                     # Stop at the first failing command

DATA_DIR="$HOME/RNASeq_PROJECT/DATA"
OUTPUT_DIR="$HOME/RNASeq_PROJECT/fastqc-results"

mkdir -p "$OUTPUT_DIR"                # Make sure the report directory exists

# With nullglob an unmatched pattern expands to nothing instead of the literal
# string "*.fastq.gz", so an empty DATA_DIR is detected and reported.
shopt -s nullglob
fastq_files=("$DATA_DIR"/*.fastq.gz)
if [ "${#fastq_files[@]}" -eq 0 ]; then
    echo "No .fastq.gz files found in $DATA_DIR" >&2
    exit 1
fi

for file in "${fastq_files[@]}"
do
    echo "Running FastQC on $file"
    fastqc "$file" -o "$OUTPUT_DIR"
done
echo "FastQC analysis complete. Results are in the $OUTPUT_DIR directory."
EOF
chmod +x pscript.sh                   # Make the script executable

./pscript.sh                          # Run the script

In [None]:
# PART 4: Summarize FastQC Results
pip install multiqc                   # Install MultiQC
# Absolute path: this is the directory pscript.sh writes the FastQC reports to,
# so the summary does not depend on the directory the previous cell ended in.
cd ~/RNASeq_PROJECT/fastqc-results    # Navigate to FastQC results directory
multiqc .                             # Combine and summarize FastQC results


In [None]:
# PART 5: Mapping Reads to Reference Genome
mkdir -p ~/RNASeq_PROJECT/pmapping    # Create a directory for mapping
cd ~/RNASeq_PROJECT/pmapping          # Enter the mapping directory

# Download the reference genome index into the same directory that tar reads it
# from below. -c resumes an interrupted download instead of starting over.
wget -c -P ~/RNASeq_PROJECT/pmapping https://genome-idx.s3.amazonaws.com/hisat/grch38_tran.tar.gz

# Extract reference genome files
tar -xvf ~/RNASeq_PROJECT/pmapping/grch38_tran.tar.gz -C ~/RNASeq_PROJECT/pmapping

# The mapping script is written with a here-document so the cell can be re-run
# unattended. The quoted 'EOF' keeps $variables literal inside the script.
cat > mapping.sh <<'EOF'
#!/bin/bash
# Aligns each paired-end sample with HISAT2, converts the SAM output to BAM and
# removes the intermediate SAM file.
set -euo pipefail                     # A failed alignment must not reach samtools/rm

PROJECT_DIR="$HOME/RNASeq_PROJECT"
DATA_DIR="$PROJECT_DIR/DATA"
MAP_DIR="$PROJECT_DIR/pmapping"
INDEX="$MAP_DIR/grch38_tran/genome_tran"

samples=("SRR30861166" "SRR30861167" "SRR30861168" "SRR30861169" "SRR30861170" "SRR30861171")
for sample in "${samples[@]}"; do
    # HISAT2 prints its alignment summary on stderr; it is saved next to the BAM
    # files so the per-sample summaries can be collected from the mapping directory.
    hisat2 -x "$INDEX" \
        -1 "$DATA_DIR/${sample}_1.fastq.gz" \
        -2 "$DATA_DIR/${sample}_2.fastq.gz" \
        --rna-strandness RF \
        -S "$MAP_DIR/${sample}.sam" 2> "$MAP_DIR/${sample}_summary.txt"
    samtools view -S -b "$MAP_DIR/${sample}.sam" > "$MAP_DIR/${sample}.bam"
    rm "$MAP_DIR/${sample}.sam"       # Only reached when the BAM conversion succeeded
done
EOF
chmod +x mapping.sh                   # Make the script executable

./mapping.sh                          # Run the mapping script


In [None]:
# PART 6: Analyze Mapping Quality
# Absolute paths are used so this cell works whether or not the shell is already
# inside the mapping directory.
cd ~/RNASeq_PROJECT/pmapping          # Enter pmapping directory
multiqc *.txt                         # Combine and analyze mapping results
# The counts directory sits directly under the project root, which is where
# fcount.sh looks for the annotation file and writes counts.txt.
mkdir -p ~/RNASeq_PROJECT/counts      # Create a directory for counts
cd ~/RNASeq_PROJECT/counts            # Navigate to the counts directory

In [None]:
# PART 7: Download Annotation File
# The GTF is saved into ~/RNASeq_PROJECT/counts, the location fcount.sh reads it
# from, independent of the current directory. -c resumes a partial download.
mkdir -p ~/RNASeq_PROJECT/counts
wget -c -P ~/RNASeq_PROJECT/counts https://ftp.ensembl.org/pub/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh38.113.gtf.gz

In [None]:
# PART 8: Run FeatureCounts
# The script is written with a here-document so the cell can be re-run
# unattended. The quoted 'EOF' keeps $variables literal inside the script.
cat > fcount.sh <<'EOF'
#!/bin/bash
# Counts read pairs per gene for every BAM file in BAM_DIR with featureCounts,
# then derives a header-free table and a table holding only the gene id and the
# per-sample count columns.
set -euo pipefail                     # Stop at the first failing command

COUNTS_DIR="$HOME/RNASeq_PROJECT/counts"
GTF_FILE="$COUNTS_DIR/Homo_sapiens.GRCh38.113.gtf.gz"
BAM_DIR="$HOME/RNASeq_PROJECT/pmapping"
OUTPUT_FILE="$COUNTS_DIR/counts.txt"

# With nullglob an unmatched pattern expands to nothing instead of the literal
# string "*.bam", so a missing mapping step is reported rather than passed on
# to featureCounts as a bogus file name.
shopt -s nullglob
bam_files=("$BAM_DIR"/*.bam)
if [ "${#bam_files[@]}" -eq 0 ]; then
    echo "No .bam files found in $BAM_DIR" >&2
    exit 1
fi

featureCounts -T 4 -s 2 -p --countReadPairs \
    -a "$GTF_FILE" -t exon -g gene_id \
    -o "$OUTPUT_FILE" "${bam_files[@]}"

# Drop the leading "#" comment line that records the featureCounts command.
grep -v "^#" "$OUTPUT_FILE" > "$COUNTS_DIR/counts_no_header.txt"
# Column 1 is the gene id and the per-sample counts start at column 7; the
# open-ended range keeps every sample column however many BAM files were counted.
cut -f 1,7- "$COUNTS_DIR/counts_no_header.txt" > "$COUNTS_DIR/counts_filtered.txt"
EOF
chmod +x fcount.sh                    # Make the script executable

./fcount.sh                           # Run the FeatureCounts script