#### 1. Processing all forward sequences

In [2]:
import pandas as pd
import numpy as np
import os

# Source folder holding the raw batches and destination folder for the filtered ones
input_dir, output_dir = (
    '/home/azureuser/dna_sequencing/Anushka/batches/',
    '/home/azureuser/dna_sequencing/clean_forward_reads/',
)

# Make sure the destination exists; exist_ok=True avoids an error when it is already there
os.makedirs(name=output_dir, exist_ok=True)

# Define the quality check function
def is_quality_good(quality_scores, min_quality=30):
    """Return True when every quality score of a read meets the threshold.

    Parameters
    ----------
    quality_scores : sequence of numbers (list, tuple, numpy array) or None
        Per-base quality scores for a single read.
    min_quality : int or float, optional
        Lowest acceptable score, inclusive. Defaults to 30, the cut-off
        used throughout this notebook.

    Returns
    -------
    bool
        True if the read has at least one score and its minimum score is
        >= ``min_quality``. False for a missing (None) or empty score
        list: ``np.min`` raises on empty input, and a read without scores
        cannot be shown to be good, so it is rejected instead of aborting
        the whole batch.
    """
    if quality_scores is None:
        return False

    scores = np.asarray(quality_scores)
    if scores.size == 0:
        return False

    # Plain Python bool so the result is the same type on every path
    return bool(scores.min() >= min_quality)

# Iterate through all 604 parquet batches (indices 0 to 603 inclusive)
n_batches = 604
failed_batches = []  # indices of batches that raised while being processed

for i in range(n_batches):
    file_path = os.path.join(input_dir, f'reads_batch_{i}.parquet')

    try:
        # Step 1: Load parquet file
        df = pd.read_parquet(file_path)

        # Step 2: Filter by sequence length (>= 100 bp)
        df_quality = df[df['sequence'].str.len() >= 100]

        # Step 3: Filter by Phred quality score (all scores >= 30).
        # astype(bool) guarantees a boolean mask whatever dtype apply() infers
        # (e.g. when no rows survived step 2).
        quality_mask = df_quality['quality'].apply(is_quality_good).astype(bool)
        df_quality = df_quality[quality_mask]

        # Step 4: Save filtered DataFrame
        output_path = os.path.join(output_dir, f'clean_reads_batch_{i}.parquet')
        df_quality.to_parquet(output_path, index=False)

        # Print how many records were retained
        print(f'Saved clean_reads_batch_{i}.parquet ({len(df_quality)} records)')

    except Exception as e:
        # Best-effort run: report the failure, remember the batch, and carry on
        failed_batches.append(i)
        print(f"❌ Failed to process file {i}: {e}")

# Only claim full success when every batch was actually written
if failed_batches:
    print(f"⚠️ {len(failed_batches)} of {n_batches} files failed: {failed_batches}")
else:
    print("✅ All files processed and saved to clean_forward_reads.")

Saved clean_reads_batch_0.parquet (51690 records)
Saved clean_reads_batch_1.parquet (52445 records)
Saved clean_reads_batch_2.parquet (51523 records)
Saved clean_reads_batch_3.parquet (46075 records)
Saved clean_reads_batch_4.parquet (43084 records)
Saved clean_reads_batch_5.parquet (45352 records)
Saved clean_reads_batch_6.parquet (44091 records)
Saved clean_reads_batch_7.parquet (39502 records)
Saved clean_reads_batch_8.parquet (46885 records)
Saved clean_reads_batch_9.parquet (47292 records)
Saved clean_reads_batch_10.parquet (49238 records)
Saved clean_reads_batch_11.parquet (49857 records)
Saved clean_reads_batch_12.parquet (47080 records)
Saved clean_reads_batch_13.parquet (49517 records)
Saved clean_reads_batch_14.parquet (51059 records)
Saved clean_reads_batch_15.parquet (50967 records)
Saved clean_reads_batch_16.parquet (50708 records)
Saved clean_reads_batch_17.parquet (48695 records)
Saved clean_reads_batch_18.parquet (48871 records)
Saved clean_reads_batch_19.parquet (46718

In [3]:
# Count the regular files (sub-directories excluded) sitting directly in output_dir
num_files = 0
for entry_name in os.listdir(output_dir):
    entry_path = os.path.join(output_dir, entry_name)
    if os.path.isfile(entry_path):
        num_files += 1

print(f"Number of files in clean_forward_reads: {num_files}")

Number of files in clean_forward_reads: 604


In [4]:
# Load the first cleaned forward batch to inspect which reads survived filtering
clean_forward_batch0_path = '/home/azureuser/dna_sequencing/clean_forward_reads/clean_reads_batch_0.parquet'
df1 = pd.read_parquet(clean_forward_batch0_path)
df1

Unnamed: 0,id,sequence,quality
0,SRR5177930.19,GCCATAGCCATTGCCATTGCCACTTGGGGCAAAGCCATTTCCCCCA...,"[33, 33, 33, 33, 33, 37, 37, 37, 37, 37, 37, 3..."
1,SRR5177930.28,ATGTGGGATTTTGATATTTATGGTACTGTGTCTATGTGCTGATTGT...,"[33, 33, 33, 33, 33, 37, 37, 37, 37, 37, 37, 3..."
2,SRR5177930.38,ACCTTTATAGGTGGGGATTAGGAGTCCCTTCTGGGCTGGGTGTGGT...,"[33, 33, 33, 33, 33, 37, 37, 37, 37, 37, 37, 3..."
3,SRR5177930.39,GCACAGGTAGCCAGACTCTGATCATGGCTCTGAGGAGGAGCCCTGG...,"[33, 33, 33, 33, 33, 37, 37, 37, 37, 37, 37, 3..."
4,SRR5177930.58,ATCCTGGGTTTTAATGCTAGGGTGGAAAGGTATTTCTGAAGCCTTG...,"[33, 33, 33, 33, 33, 37, 37, 37, 37, 37, 37, 3..."
...,...,...,...
51685,SRR5177930.99988,GAAAGATGTTGTTTTTGGTGAGTTTGACGCTTTTGGGCCTTGGGTG...,"[33, 33, 33, 33, 33, 37, 37, 37, 37, 37, 37, 3..."
51686,SRR5177930.99989,ATGCCGTGGGTTATTTCCTAAGGTTTCCTAGGTTATAGCCTAACCT...,"[33, 33, 33, 33, 33, 37, 37, 37, 37, 37, 37, 3..."
51687,SRR5177930.99995,TAATCGTTTCATATATGATGGAATTGACAGCAACTTTGAACCTGAG...,"[33, 33, 33, 33, 33, 37, 37, 37, 37, 37, 37, 3..."
51688,SRR5177930.99996,ACCACAATTCCAGAAAATGACATAGAGAAGACTGACCCTTGGTTTG...,"[33, 33, 33, 33, 33, 37, 37, 37, 37, 37, 37, 3..."


In [6]:
# Load the matching raw forward batch to compare against the cleaned one
raw_forward_batch0_path = '/home/azureuser/dna_sequencing/Anushka/batches/reads_batch_0.parquet'
df2 = pd.read_parquet(raw_forward_batch0_path)
df2.head(n=20)

Unnamed: 0,id,sequence,quality
0,SRR5177930.1,NTACCTTCAGGCCCCTGGACCCTTGCTCCCCAGCTGGTCCGTCCGG...,"[2, 27, 27, 33, 33, 37, 37, 37, 37, 37, 37, 37..."
1,SRR5177930.2,NTCCCCTCTGGGCACCTCATTCCCAGAGGCATGTAAGGCTGGAAGG...,"[2, 27, 27, 33, 33, 37, 37, 37, 37, 37, 37, 37..."
2,SRR5177930.3,NATGTGAACACCTGAATGAATGAGTGCCCTGAAAATATGACTGGCT...,"[2, 27, 33, 33, 33, 37, 37, 37, 37, 37, 37, 37..."
3,SRR5177930.4,NGCCTGTGGGCCAGGGCCAGAGCCTTCAGGGACCCTTGACTCCCCG...,"[2, 27, 27, 27, 33, 37, 37, 37, 37, 37, 37, 37..."
4,SRR5177930.5,NATTGAGACTGGCCCAACAAACATTCAATCCACTCCACCCATGGAC...,"[2, 27, 33, 33, 33, 37, 37, 37, 37, 37, 37, 37..."
5,SRR5177930.6,NACTCAGTTCTTTTCATGGCCAGACTCTGCCAGTCCCTGGGAGAGC...,"[2, 27, 27, 27, 33, 37, 37, 37, 37, 37, 37, 37..."
6,SRR5177930.7,NAAGTTCCGCACAATACTTTTCAGAAAGAGAAAAGCCATGCAGTTG...,"[2, 27, 27, 33, 33, 37, 37, 37, 37, 37, 37, 37..."
7,SRR5177930.8,NTCTGTTTCTATGTGGAAATAACCTCCTTCATTTCCTGATGCAAAT...,"[2, 27, 27, 27, 27, 37, 37, 37, 37, 33, 14, 37..."
8,SRR5177930.9,NGCCCCCTGTTCTCTAGTTGGCCTGTGCCCCTCTCCCATGTGGAGT...,"[2, 27, 33, 33, 33, 37, 37, 37, 37, 37, 37, 37..."
9,SRR5177930.10,NATTTCTCAAGACTTGCACATTTATATTATGCAAAACACAGCATGA...,"[2, 27, 27, 33, 33, 37, 37, 37, 37, 37, 37, 37..."


In [7]:
# As can be seen in df1, the first read that passes the filters is SRR5177930.19.

#### 2. Processing all backward sequences

In [8]:
import pandas as pd
import numpy as np
import os

# Source folder holding the raw batches and destination folder for the filtered ones
input_dir, output_dir = (
    '/home/azureuser/dna_sequencing/Laavanya/batches/',
    '/home/azureuser/dna_sequencing/clean_backward_reads/',
)

# Make sure the destination exists; exist_ok=True avoids an error when it is already there
os.makedirs(name=output_dir, exist_ok=True)

# Define the quality check function
def is_quality_good(quality_scores, min_quality=30):
    """Return True when every quality score of a read meets the threshold.

    Parameters
    ----------
    quality_scores : sequence of numbers (list, tuple, numpy array) or None
        Per-base quality scores for a single read.
    min_quality : int or float, optional
        Lowest acceptable score, inclusive. Defaults to 30, the cut-off
        used throughout this notebook.

    Returns
    -------
    bool
        True if the read has at least one score and its minimum score is
        >= ``min_quality``. False for a missing (None) or empty score
        list: ``np.min`` raises on empty input, and a read without scores
        cannot be shown to be good, so it is rejected instead of aborting
        the whole batch.
    """
    if quality_scores is None:
        return False

    scores = np.asarray(quality_scores)
    if scores.size == 0:
        return False

    # Plain Python bool so the result is the same type on every path
    return bool(scores.min() >= min_quality)

# Iterate through all 604 parquet batches (indices 0 to 603 inclusive)
n_batches = 604
failed_batches = []  # indices of batches that raised while being processed

for i in range(n_batches):
    file_path = os.path.join(input_dir, f'reads_batch_{i}.parquet')

    try:
        # Step 1: Load parquet file
        df = pd.read_parquet(file_path)

        # Step 2: Filter by sequence length (>= 100 bp)
        df_quality = df[df['sequence'].str.len() >= 100]

        # Step 3: Filter by Phred quality score (all scores >= 30).
        # astype(bool) guarantees a boolean mask whatever dtype apply() infers
        # (e.g. when no rows survived step 2).
        quality_mask = df_quality['quality'].apply(is_quality_good).astype(bool)
        df_quality = df_quality[quality_mask]

        # Step 4: Save filtered DataFrame
        output_path = os.path.join(output_dir, f'clean_reads_batch_{i}.parquet')
        df_quality.to_parquet(output_path, index=False)

        # Print how many records were retained
        print(f'Saved clean_reads_batch_{i}.parquet ({len(df_quality)} records)')

    except Exception as e:
        # Best-effort run: report the failure, remember the batch, and carry on
        failed_batches.append(i)
        print(f"❌ Failed to process file {i}: {e}")

# Only claim full success when every batch was actually written
if failed_batches:
    print(f"⚠️ {len(failed_batches)} of {n_batches} files failed: {failed_batches}")
else:
    print("✅ All files processed and saved to clean_backward_reads.")

Saved clean_reads_batch_0.parquet (17065 records)
Saved clean_reads_batch_1.parquet (17997 records)
Saved clean_reads_batch_2.parquet (16940 records)
Saved clean_reads_batch_3.parquet (10830 records)
Saved clean_reads_batch_4.parquet (8041 records)
Saved clean_reads_batch_5.parquet (7357 records)
Saved clean_reads_batch_6.parquet (8185 records)
Saved clean_reads_batch_7.parquet (13741 records)
Saved clean_reads_batch_8.parquet (12813 records)
Saved clean_reads_batch_9.parquet (12525 records)
Saved clean_reads_batch_10.parquet (14258 records)
Saved clean_reads_batch_11.parquet (14593 records)
Saved clean_reads_batch_12.parquet (14444 records)
Saved clean_reads_batch_13.parquet (15030 records)
Saved clean_reads_batch_14.parquet (16259 records)
Saved clean_reads_batch_15.parquet (15596 records)
Saved clean_reads_batch_16.parquet (14983 records)
Saved clean_reads_batch_17.parquet (14241 records)
Saved clean_reads_batch_18.parquet (14466 records)
Saved clean_reads_batch_19.parquet (13981 re

In [9]:
# Load the first raw backward batch to cross-check the filter results
raw_backward_batch0_path = '/home/azureuser/dna_sequencing/Laavanya/batches/reads_batch_0.parquet'
df3 = pd.read_parquet(raw_backward_batch0_path)
df3.head(n=5)

Unnamed: 0,id,sequence,quality
0,SRR5177930.1,ATGGCCCGAGGGAGACCCCTGCTGTCCGGTGTGCTAGTCCCTTTTT...,"[37, 33, 33, 33, 33, 33, 33, 37, 37, 37, 37, 3..."
1,SRR5177930.2,ATGCTGGCCAGAGCCCAGAGGGAGAGGGCTCATCGGTCCATGGAGA...,"[33, 27, 33, 33, 33, 33, 33, 37, 37, 37, 37, 3..."
2,SRR5177930.3,ATGGTAAAGCATAGGGGCCATGCTAAAGAAACCACCACCAAGGAGA...,"[37, 27, 33, 33, 33, 33, 33, 37, 37, 37, 37, 3..."
3,SRR5177930.4,ATGAAATTAACTTTGGTGTCTGGGACAGTGATATTCTCATTCAAGC...,"[33, 27, 33, 33, 33, 33, 33, 37, 37, 37, 33, 3..."
4,SRR5177930.5,ATCTCAGAAAGGACAGAGGAAACTCTTCCTAATGACTGGCTGATGC...,"[37, 14, 33, 33, 33, 33, 33, 37, 37, 37, 37, 3..."


In [11]:
# Total number of reads in the raw backward batch (row count of df3)
df3.shape[0]

100000

In [10]:
# Filter records with quality < 30
low_quality_records = df3[df3['quality'].apply(lambda q: any(score < 30 for score in q))]

# Display the length of these records
print(f"Number of records with quality < 30: {len(low_quality_records)}")

Number of records with quality < 30: 82935


In [12]:
# Reads left once the low-quality ones are excluded. Computed from the frames
# loaded above instead of numbers hand-copied from earlier cell outputs, so the
# check cannot go stale if the input batch changes.
len(df3) - len(low_quality_records)

17065

In [None]:
# Again, all the low-quality records have been removed: 17065 reads remain, matching the count saved for clean_reads_batch_0 above.