## Unpacking Cellular Clarity Dataset to Preprocess and Prepare the Dataset

#### Teague McCracken 2/24/25

Note: create a new environment if sequence alignment tools are needed

In [1]:
import pandas as pd
import os
import socket

# Define paths
# System Switch - @Mohsen, add your systems if you want to...
hostname = socket.gethostname()
if hostname in ['Teagues-MacBook-Pro.local', 'Teagues-MBP.attlocal.net']:
    sys_id = 1
else:
    sys_id = 0

# Setting programs path based on which system was identified by the system switch
if sys_id == 1: 
    programs_dir = '/Users/teaguemcc/Programs'
elif sys_id == 0:
    programs_dir = '/home/temccrac/Programs'

git_path = os.path.join(programs_dir, 'git_clones/ECE759_Project')
git_data_path = os.path.join(git_path, 'cellular_clarity')
data_path = os.path.join(programs_dir, 'data/cellular_clarity')

In [None]:
epidermis_RPKM = pd.read_excel(os.path.join(git_data_path, 'GSE228011_epidermis_RPKM.xlsx'))

In [3]:
epidermis_RPKM.columns

Index(['AGI', 'Alias', 'Description', 'A1', 'A2', 'A3', 'Mean  A', 'B+1',
       'B+2', 'B+3', 'Mean B+', 'C+1', 'C+2', 'C+3', 'Mean C+', 'D+1', 'D+2',
       'D+3', 'Mean D+', 'E+1', 'E+2', 'E+3', 'Mean E+', 'F+1', 'F+2', 'F+3',
       'Mean F+', 'G+1', 'G+3', 'G+4', 'Mean G+', 'B-1', 'B-2', 'B-3',
       'Mean B-', 'C-1', 'C-2', 'C-3', 'Mean C-', 'D-1', 'D-2', 'D-3',
       'Mean D-', 'E-1', 'E-2', 'E-3', 'Mean E-', 'F-1', 'F-2', 'F-3',
       'Mean F-', 'G-1', 'G-2', 'G-4', 'Mean G-'],
      dtype='object')

In [4]:
epidermis_RPKM.head()

Unnamed: 0,AGI,Alias,Description,A1,A2,A3,Mean A,B+1,B+2,B+3,...,E-3,Mean E-,F-1,F-2,F-3,Mean F-,G-1,G-2,G-4,Mean G-
0,AT1G01010,NAC001,NAC domain containing protein 1,9.395285,10.565348,8.507675,9.489436,6.168535,6.558103,5.528676,...,6.106949,6.716575,7.056339,9.987304,8.876777,8.64014,13.742812,12.494721,8.31729,11.518274
1,AT1G01020,ARV1,,12.101814,11.064094,10.210212,11.125373,12.776892,11.617354,13.798787,...,8.753435,9.233842,8.69723,9.190536,11.132598,9.673455,12.597341,10.898251,12.83178,12.109124
2,AT1G01030,NGA3,NGATHA3,1.739964,2.353536,1.629959,1.90782,1.066512,1.792775,1.594087,...,1.773065,1.683229,2.10578,1.863083,2.29298,2.087281,2.677744,3.128885,1.729712,2.512113
3,AT1G01040,"ASU1, ATDCL1, CAF, DCL1, EMB60, EMB76, SIN1, SUS1","ABNORMAL SUSPENSOR 1, DICER-LIKE 1, CARPEL FAC...",0.913234,1.035521,1.365358,1.104704,0.893378,0.790887,0.757958,...,1.045099,1.223554,1.963766,1.631575,0.965136,1.520159,0.495219,0.65982,0.549524,0.568188
4,AT1G01046,MIR838A,microRNA838A,0.246349,0.0,0.0,0.082116,0.306719,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.293369,0.0,0.0,0.09779


In [None]:
DE_genes_epidermis_RPKM = pd.read_csv(os.path.join(git_data_path, 'GSE228011_rpkm_DE_genes_epidermis.csv.gz'), compression = 'gzip')
DE_genes_epidermis_RPKM.columns

Index(['GeneID', 't0(Fe+)', 't6 (Fe+)', 't6 (Fe-)', 't12 (Fe+)', 't12 (Fe-)',
       't18 (Fe+)', 't18 (Fe-)', 't24 (Fe+)', 't24 (Fe-)', 't30 (Fe+)',
       't30 (Fe-)', 't36 (Fe+)', 't36(Fe-)', 'GeneName'],
      dtype='object')

In [6]:
DE_genes_epidermis_RPKM.head()

Unnamed: 0,GeneID,t0(Fe+),t6 (Fe+),t6 (Fe-),t12 (Fe+),t12 (Fe-),t18 (Fe+),t18 (Fe-),t24 (Fe+),t24 (Fe-),t30 (Fe+),t30 (Fe-),t36 (Fe+),t36(Fe-),GeneName
0,AT1G03070,671.339306,346.370569,534.558117,544.918031,548.079335,552.925562,531.805714,561.520848,646.830501,546.531955,511.358494,618.642493,466.02812,unknown
1,AT1G03090,56.489909,413.232207,115.251591,256.182647,92.072337,138.823846,76.020536,211.405346,51.50888,178.78663,43.954197,89.647417,55.002692,MCCA
2,AT1G04610,555.842721,334.214428,515.020756,389.253013,504.686718,593.00306,737.095889,515.249415,602.295081,566.374468,633.293184,449.895363,435.154338,YUC3
3,AT1G04990,442.063076,305.687575,386.743703,306.571326,360.500173,332.170769,373.96519,420.078045,374.790222,349.300187,377.968463,422.188547,398.544092,unknown
4,AT1G05560,820.899447,380.128909,695.339515,406.5961,845.046569,528.074246,632.583643,654.978333,966.961101,577.220749,965.881525,917.40475,805.241328,"UGT1, UGT75B1"


In [7]:
DE_genes_epidermis_RPKM.shape

(806, 15)

In [None]:
SRA_metadata = pd.read_csv(os.path.join(git_data_path, 'SRARunTable.csv'))
SRA_metadata.columns

FileNotFoundError: [Errno 2] No such file or directory: '/home/temccrac/Programs/git_clones/ECE759_Project/cellular_clarity/SRARunTable.csv'

## Testing RPKM Properties

In [14]:
example_data_path = os.path.join(data_path, 'example_data')
X_id = pd.read_csv(os.path.join(example_data_path, 'X_id.csv'))
Y = pd.read_csv(os.path.join(example_data_path, 'Y.csv.gz'), compression='gzip', index_col = 0)

In [15]:
# Simulate RPKM 
import numpy as np
num_genes = len(Y.columns) # number of genes, columns are genes and rows are samples
gene_lengths = np.random.randint(500, 5000, size=num_genes) # get random gene lengths
total_mapped_reads = Y.sum(axis=0) # total amount of each gene detected


## Processing Raw RNA-Seq Data

The folders in arabidopsis_data should all contain fastq.gz files for adapter trimming

In [2]:
# Gather the file paths and directories 
sequence_path = os.path.join(data_path, "adrabidopsis_data")
directories = []
file_paths = []
dir_names = []

for root, dirs, files in os.walk(sequence_path):
    # Append full path of each directory
    for dir_name in dirs:
        dir_names.append(dir_name)
        directories.append(os.path.join(root, dir_name))
    
    # Append full path of each file
    for file in files:
        file_paths.append(os.path.join(root, file))

print("Directories:", directories)
print("Files:", file_paths)

Directories: ['/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles', '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane5_FastqFiles', '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane6_FastqFiles', '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane7_FastqFiles', '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/20171212_GSL_105B_Long']
Files: ['/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/GSL_100B_Long_PoolD-001.tar', '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Sample - Pool Information_1-19.xlsx', '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/GSL_100B_Long_PoolA-002.tar', '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/GSL_100B_Long_PoolB-003.tar', '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/adapters.fa', '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/GSL_100B_Long_PoolC-

In [3]:
# Store each fastq.gz file containing sequences under its respective sequencing pool directory
sequence_dictionary = dict.fromkeys(dir_names)
raw_sequence_file_paths = []
for n, dir in enumerate(directories): 
    raw_sequence_file_paths = []  # Reset for each directory
    for root, dirs, files in os.walk(dir):
        for file in files: 
            if file.endswith('fastq.gz'): 
                raw_sequence_file_paths.append(os.path.join(dir,file))
    sequence_dictionary[dir_names[n]] = raw_sequence_file_paths
sequence_dictionary

{'Lane4_FastqFiles': ['/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/Long_4_S102_L004_R1_001.fastq.gz',
  '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/trimmed_Long_3_S101_L004_R1_001.fastq.gz',
  '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/trimmed_Long_10_S108_L004_R1_001.fastq.gz',
  '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/Long_2_S100_L004_R1_001.fastq.gz',
  '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/Long_3_S101_L004_R1_001.fastq.gz',
  '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/trimmed_Long_12_S110_L004_R1_001.fastq.gz',
  '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/Long_10_S108_L004_R1_001.fastq.gz',
  '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/Long_11_S109_L004_R1_001.fastq.gz',


In [74]:
# Trim all of the files using the fastq-mcf command
import subprocess

# Adapter file path
adapter_path = "/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/adapters.fa"

# Dictionary to store trimmed file paths
trimmed_files = {}

# Process each directory separately
for pool, file_paths in sequence_dictionary.items():
    trimmed_files[pool] = []  # Store trimmed file paths for this pool

    for file_path in file_paths:
        # Get directory and filename
        directory, filename = os.path.split(file_path)

        # Generate new filename with "trimmed_" prefix
        trimmed_filename = f"trimmed_{filename}"
        trimmed_path = os.path.join(directory, trimmed_filename)

        # Construct the fastq-mcf command
        command = [
            "fastq-mcf",
            adapter_path,
            file_path,
            "-o", trimmed_path,
            "-l", "20", "-q", "15", "-x", "0.75"
        ]

        # Run the trimming process
        subprocess.run(command, check=True)
        print(f"Trimmed: {file_path} → {trimmed_path}")

        # Store the trimmed file path
        trimmed_files[pool].append(trimmed_path)

# Print summary of trimmed files
print("\nTrimmed Sequences Dictionary:")
for pool, files in trimmed_files.items():
    print(f"{pool}: {files}")


Command Line: /home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/adapters.fa /home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/Long_4_S102_L004_R1_001.fastq.gz -o /home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/trimmed_Long_4_S102_L004_R1_001.fastq.gz -l 20 -q 15 -x 0.75
Scale used: 2.2
Phred: 33
Threshold used: 751 out of 300000
Adapter Illumina_Universal_Adapter (AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC): counted 115505 at the 'end' of '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/Long_4_S102_L004_R1_001.fastq.gz', clip set to 1
Files: 1
Total reads: 15736335
Too short after clip: 619
Clipped 'end' reads: Count: 10316007, Mean: 28.73, Sd: 26.86
Trimmed 1008061 reads by an average of 2.88 bases on quality < 15
Trimmed: /home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/Long_4_S102_L004_R1_001.fastq.gz → /home/temccrac/Programs/data/cellular_clarity/a

In [15]:
# Initialize dictionary to store trimmed files by directory
trimmed_sequence_dictionary = dict.fromkeys(dir_names)

for n, dir in enumerate(directories): 
    trimmed_file_paths = []  # Reset list for each directory
    for root, dirs, files in os.walk(dir):
        for file in files: 
            if "trimmed_" in file and file.endswith('fastq.gz'): 
                trimmed_file_paths.append(os.path.join(dir, file))
    
    trimmed_sequence_dictionary[dir_names[n]] = trimmed_file_paths

# Output the dictionary
trimmed_sequence_dictionary
fastq_files = [file for files_list in trimmed_sequence_dictionary.values() for file in files_list]
fastq_files

['/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/trimmed_Long_3_S101_L004_R1_001.fastq.gz',
 '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/trimmed_Long_10_S108_L004_R1_001.fastq.gz',
 '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/trimmed_Long_12_S110_L004_R1_001.fastq.gz',
 '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/trimmed_Long_8_S106_L004_R1_001.fastq.gz',
 '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/trimmed_Long_7_S105_L004_R1_001.fastq.gz',
 '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/trimmed_Long_18_S104_L004_R1_001.fastq.gz',
 '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/trimmed_Long_2_S100_L004_R1_001.fastq.gz',
 '/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/trimmed_Long_4_S102_L004_R1_001.

In [None]:
# Parallel processing for sequence alignment
import os
import subprocess
from multiprocessing import Pool, cpu_count

# Define paths
tophat2_path = "/home/temccrac/Programs/environments/tophat-2.1.1.Linux_x86_64/tophat"
bowtie2_index = "/home/temccrac/Programs/data/genomes/TAIR10_index" # path to index prefix
gtf_file = "/home/temccrac/Programs/data/genomes/Arabidopsis_thaliana.TAIR10.54.gtf"

# Function to run TopHat2 for a single file
def run_tophat(fastq_file_path):
    dir_name_detected = None  # Initialize
    # Loop through each directory name to find a match
    for dir_name in dir_names:
        if f"/{dir_name}/" in fastq_file_path:  # Ensures exact match as directory
            dir_name_detected = dir_name
            break  # Stop searching after first match

    # Only print an error if no match was found after the loop finishes
    if dir_name_detected is None:
        print(f"Error: No matching directory found for {fastq_file_path}")
        return  # Exit function if no match is found

    sample_name = os.path.basename(fastq_file_path).replace(".fastq.gz", "")
    print(sample_name, 'now being aligned')
    output_dir_path = os.path.join(sequence_path, dir_name_detected, sample_name)
    
    # Construct TopHat2 command
    command = [
        "python2", tophat2_path,  # Ensures Python2 is used
        "-o", output_dir_path,
        "-G", gtf_file,
        "-p", "4",  # Number of threads per job
        bowtie2_index,
        fastq_file_path
    ]
    
    # Run the command
    print(f"Running TopHat2 for {sample_name}...")
    subprocess.run(command, check=True)
    print(f"Completed {sample_name}.")

# Number of parallel processes
num_cores = 8  # Use as many cores as available

# Run in parallel
with Pool(processes=num_cores) as pool:
    pool.map(run_tophat, fastq_files)

print("All alignments completed!")

trimmed_Long_7_S105_L004_R1_001trimmed_Long_11_S109_L004_R1_001trimmed_Long_1_S99_L004_R1_001trimmed_Long_12_S110_L004_R1_001trimmed_Long_2_S100_L004_R1_001trimmed_Long_28_S114_L005_R1_001trimmed_Long_13_S111_L005_R1_001       now being alignednow being alignednow being alignednow being alignednow being alignednow being aligned
now being aligned



Running TopHat2 for trimmed_Long_1_S99_L004_R1_001...
Running TopHat2 for trimmed_Long_12_S110_L004_R1_001...
Running TopHat2 for trimmed_Long_7_S105_L004_R1_001...Running TopHat2 for trimmed_Long_11_S109_L004_R1_001...Running TopHat2 for trimmed_Long_28_S114_L005_R1_001...
Running TopHat2 for trimmed_Long_2_S100_L004_R1_001...

Running TopHat2 for trimmed_Long_13_S111_L005_R1_001...



trimmed_Long_3_S101_L004_R1_001 now being aligned
Running TopHat2 for trimmed_Long_3_S101_L004_R1_001...




[2025-03-15 16:39:51] Beginning TopHat run (v2.1.1)

[2025-03-15 16:39:51] Beginning TopHat run (v2.1.1)
-----------------------------------------------

[2025-03-15 16:39:51] Beginning TopHat run (v2.1.1)
[2025-03-15 16:39:51] Checking for Bowtie
-----------------------------------------------
[2025-03-15 16:39:51] Beginning TopHat run (v2.1.1)
-----------------------------------------------
[2025-03-15 16:39:51] Checking for Bowtie
[2025-03-15 16:39:51] Checking for Bowtie

[2025-03-15 16:39:51] Beginning TopHat run (v2.1.1)
-----------------------------------------------
[2025-03-15 16:39:51] Checking for Bowtie
-----------------------------------------------
[2025-03-15 16:39:51] Checking for Bowtie

[2025-03-15 16:39:51] Beginning TopHat run (v2.1.1)
-----------------------------------------------
[2025-03-15 16:39:51] Checking for Bowtie

[2025-03-15 16:39:51] Beginning TopHat run (v2.1.1)
-----------------------------------------------
[2025-03-15 16:39:51] Checking for Bowtie

Completed trimmed_Long_3_S101_L004_R1_001.
trimmed_Long_10_S108_L004_R1_001 now being aligned
Running TopHat2 for trimmed_Long_10_S108_L004_R1_001...


-----------------------------------------------
[2025-03-15 16:57:07] A summary of the alignment counts can be found in /home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/trimmed_Long_3_S101_L004_R1_001/align_summary.txt
[2025-03-15 16:57:07] Run complete: 00:17:16 elapsed

[2025-03-15 16:57:07] Beginning TopHat run (v2.1.1)
-----------------------------------------------
[2025-03-15 16:57:07] Checking for Bowtie
		  Bowtie version:	 2.5.4.0
[2025-03-15 16:57:07] Checking for Bowtie index files (genome)..
[2025-03-15 16:57:07] Checking for reference FASTA file
[2025-03-15 16:57:07] Generating SAM header for /home/temccrac/Programs/data/genomes/TAIR10_index
[2025-03-15 16:57:07] Reading known junctions from GTF file
[2025-03-15 16:57:11] Preparing reads
	 left reads: min. length=20, max. length=125, 10465739 kept reads (1445 discarded)
[2025-03-15 16:59:11] Building transcriptome data files /home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lan

Completed trimmed_Long_11_S109_L004_R1_001.
trimmed_Long_9_S107_L004_R1_001 now being aligned
Running TopHat2 for trimmed_Long_9_S107_L004_R1_001...


-----------------------------------------------
[2025-03-15 17:00:07] A summary of the alignment counts can be found in /home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/trimmed_Long_11_S109_L004_R1_001/align_summary.txt
[2025-03-15 17:00:07] Run complete: 00:20:16 elapsed

[2025-03-15 17:00:07] Beginning TopHat run (v2.1.1)
-----------------------------------------------
[2025-03-15 17:00:07] Checking for Bowtie
		  Bowtie version:	 2.5.4.0
[2025-03-15 17:00:07] Checking for Bowtie index files (genome)..
[2025-03-15 17:00:07] Checking for reference FASTA file
[2025-03-15 17:00:07] Generating SAM header for /home/temccrac/Programs/data/genomes/TAIR10_index
[2025-03-15 17:00:07] Reading known junctions from GTF file
[2025-03-15 17:00:11] Preparing reads
[2025-03-15 17:00:52] Resuming TopHat pipeline with unmapped reads
[2025-03-15 17:00:52] Mapping left_kept_reads.m2g_um to genome TAIR10_index with Bowtie2 


Completed trimmed_Long_2_S100_L004_R1_001.
trimmed_Long_4_S102_L004_R1_001 now being aligned
Running TopHat2 for trimmed_Long_4_S102_L004_R1_001...


-----------------------------------------------
[2025-03-15 17:01:03] A summary of the alignment counts can be found in /home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/trimmed_Long_2_S100_L004_R1_001/align_summary.txt
[2025-03-15 17:01:03] Run complete: 00:21:12 elapsed

[2025-03-15 17:01:03] Beginning TopHat run (v2.1.1)
-----------------------------------------------
[2025-03-15 17:01:03] Checking for Bowtie
		  Bowtie version:	 2.5.4.0
[2025-03-15 17:01:03] Checking for Bowtie index files (genome)..
[2025-03-15 17:01:03] Checking for reference FASTA file
[2025-03-15 17:01:03] Generating SAM header for /home/temccrac/Programs/data/genomes/TAIR10_index
[2025-03-15 17:01:03] Reading known junctions from GTF file
[2025-03-15 17:01:07] Preparing reads
	 left reads: min. length=20, max. length=125, 11459220 kept reads (1728 discarded)
[2025-03-15 17:02:22] Building transcriptome data files /home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lan

In [None]:
# Reccomended by ChatGPT for concatinating the output bam files from alignment into a matrix
# Do I need to move all bam files to one folder, hehe?
featureCounts -T 8 -a Arabidopsis_thaliana.TAIR10.54.gtf -o gene_counts.txt *.bam


In [18]:
dir_names

['Lane4_FastqFiles',
 'Lane5_FastqFiles',
 'Lane6_FastqFiles',
 'Lane7_FastqFiles',
 '20171212_GSL_105B_Long']

In [9]:
trimmed_sequence_dictionary[dir_names[0]][0]

'/home/temccrac/Programs/data/cellular_clarity/adrabidopsis_data/Lane4_FastqFiles/trimmed_Long_3_S101_L004_R1_001.fastq.gz'

In [None]:
# Command for tophat
tophat2 -o /path/to/output \
  -G ~/genomes/TAIR10/Arabidopsis_thaliana.TAIR10.54.gtf \
  -p 4 \
  ~/genomes/TAIR10/TAIR10_index \
  /path/to/sample.fastq.gz