In [65]:
# imports
import gzip
import os
import pandas as pd

# Read statistics

## Number of reads per sample

In [81]:
# FUNCTIONS

# paths to the individual gz files for each sample

def get_paths(directory):
    """
    Retrieve paths to the individual gzip files for each sample.

    Only directory entries whose name ends with ``reads.fastq.gz`` are
    returned; everything else in the directory is ignored.

    Args:
        directory (str): Path to the directory containing the files. A
            trailing path separator is optional.

    Returns:
        list: One path per matching file (``directory`` joined with the file
        name), in the order reported by ``os.listdir``.
    """
    # os.path.join adds the separator only when it is missing, so the result
    # is a valid path whether or not `directory` ends with a slash.
    return [
        os.path.join(directory, file_name)
        for file_name in os.listdir(directory)
        if file_name.endswith('reads.fastq.gz')
    ]


# counting the number of reads in the raw reads files

def count_seqs(gz_paths):
    """
    Count the number of reads in the raw read files.

    Each file is read as gzip-compressed text and its lines are counted; a
    FASTQ record occupies four lines, so the read count is the line count
    divided by four (integer division, any incomplete trailing record is
    not counted).

    Args:
        gz_paths (list): A list of file paths to the raw read files.

    Returns:
        dict: A dictionary where the keys are sample names (the file name up
        to its first ``.``, then up to its first ``_``) and the values are the
        integer number of sequences in each raw read file.
    """
    output = {}
    for file_ in gz_paths:
        with gzip.open(file_, 'rt') as f:
            num_lines = sum(1 for _ in f)
        # e.g. 'A3_21R.hifi_reads.fastq.gz' -> 'A3_21R' -> 'A3'
        file_name = os.path.basename(file_).split(".")[0].split('_')[0]
        # Integer division: a read count is a whole number, not a float.
        output[file_name] = num_lines // 4
    return output

In [78]:
# Root of the raw PacBio data. The value already ends with a slash, so the
# run directories below are appended without an extra separator.
pacbio = "../raw_data/PacBio/"
seq_paths = [f'{pacbio}Suthaus_2022_Full18S/cell1/',
             f'{pacbio}Suthaus_2022_Full18S/cell2/',
             f'{pacbio}Suthaus_2022_Full18S/cellCombined/',
             f'{pacbio}Suthaus_2022_rDNA/cell/']

# num_read_data maps '<run>_<cell>' to the {sample: read count} dict of that
# directory.
num_read_data = {}
for seq_path in seq_paths:
    gz_paths = get_paths(seq_path)
    # Every path ends with '/', so the last split element is empty: the cell
    # directory is at [-2] and the run directory at [-3].
    path_parts = seq_path.split('/')
    run_name = path_parts[-3].split('_')[-1]   # 'Suthaus_2022_Full18S' -> 'Full18S'
    cell_name = path_parts[-2]
    sample_name = run_name + '_' + cell_name
    num_read_data[sample_name] = count_seqs(gz_paths)

In [79]:
# Inspect the per-sample read counts collected for one run/cell combination.
num_read_data['Full18S_cell1']

{'Th16': 12655,
 'NH1': 9175,
 'Sim22': 16365,
 'Mock': 27928,
 'Sim17': 16884,
 'A3': 5621,
 'Th40': 9208,
 'Th38': 17006,
 'NH4': 5027,
 'X17007': 6005}

In [85]:
# One column per run/cell key of num_read_data, one row per sample.
# Samples missing from any run/cell become NaN and are dropped, after which
# the counts can be stored as integers. The chain starts from the frame built
# here, so the cell does not rely on a `df` left over from another cell.
df_reads = (
    pd.DataFrame.from_dict(num_read_data)
    .dropna()
    .astype(int)
)

In [86]:
# Display the read counts per sample (rows) and run/cell (columns).
df_reads

Unnamed: 0,Full18S_cell1,Full18S_cell2,Full18S_cellCombined,rDNA_cell
Th16,12655,12213,24868,15238
NH1,9175,8796,17971,143994
Sim22,16365,16058,32423,9114
Mock,27928,27806,55734,5865
Sim17,16884,15951,32835,33204
A3,5621,5480,11101,65836
Th40,9208,8726,17934,2383
Th38,17006,16068,33074,49846
NH4,5027,4934,9961,25360
X17007,6005,5975,11980,34204


In [87]:
# For every sample, record the run/cell column holding the largest read count.
# The frame is derived from df_reads with .assign instead of being mutated in
# place, so the cell works on a fresh kernel and gives the same result when
# re-run (idxmax never sees a previously added 'most_reads' column).
df = df_reads.assign(most_reads=df_reads.idxmax(axis=1))

In [88]:
# Display the read counts together with the 'most_reads' column.
df

Unnamed: 0,Full18S_cell1,Full18S_cell2,Full18S_cellCombined,rDNA_cell,most_reads
Th16,12655,12213,24868,15238,Full18S_cellCombined
NH1,9175,8796,17971,143994,rDNA_cell
Sim22,16365,16058,32423,9114,Full18S_cellCombined
Mock,27928,27806,55734,5865,Full18S_cellCombined
Sim17,16884,15951,32835,33204,rDNA_cell
A3,5621,5480,11101,65836,rDNA_cell
Th40,9208,8726,17934,2383,Full18S_cellCombined
Th38,17006,16068,33074,49846,rDNA_cell
NH4,5027,4934,9961,25360,rDNA_cell
X17007,6005,5975,11980,34204,rDNA_cell


In [89]:
# Rebuild the table straight from num_read_data, without the dropna / int
# cast, replacing the frame that carried the 'most_reads' column.
# NOTE(review): earlier cells read `df` although this is the first place it is
# assigned from num_read_data — confirm the intended cell order.
df = pd.DataFrame.from_dict(num_read_data)

In [90]:
# Display the table as built directly from num_read_data.
df

Unnamed: 0,Full18S_cell1,Full18S_cell2,Full18S_cellCombined,rDNA_cell
Th16,12655.0,12213.0,24868.0,15238
NH1,9175.0,8796.0,17971.0,143994
Sim22,16365.0,16058.0,32423.0,9114
Mock,27928.0,27806.0,55734.0,5865
Sim17,16884.0,15951.0,32835.0,33204
A3,5621.0,5480.0,11101.0,65836
Th40,9208.0,8726.0,17934.0,2383
Th38,17006.0,16068.0,33074.0,49846
NH4,5027.0,4934.0,9961.0,25360
X17007,6005.0,5975.0,11980.0,34204


## Length of the reads

In [None]:
# NOTE(review): draft cell that repeats the per-file read counting for the
# files currently listed in gz_paths. It previously ended in a top-level
# `return` and used uninitialised accumulators, so it could not run.
# `path` is not used by the loop below.

path = '../raw_data/PacBio/Suthaus_2022_rDNA/cell/A3_21R.hifi_reads.fastq.gz'

output = {}
for file_ in gz_paths:
    with gzip.open(file_, 'rt') as f:
        num_lines = sum(1 for _ in f)
    # Four lines per FASTQ record -> number of reads in this file.
    num_seqs = num_lines // 4
    # e.g. 'A3_21R.hifi_reads.fastq.gz' -> 'A3'
    file_name = os.path.basename(file_).split(".")[0].split('_')[0]
    output[file_name] = num_seqs
output

In [151]:
# Path to a single gzipped FASTQ file, kept as an example input for
# get_reads_length.
path = '../raw_data/PacBio/Suthaus_2022_rDNA/cell/A3_21R.hifi_reads.fastq.gz'



def get_reads_length(path):
    """
    Count the reads of a gzipped FASTQ file in three length classes.

    The file is expected to use the four-lines-per-record FASTQ layout
    (header, sequence, '+', quality); the sequence is taken from the second
    line of every record.

    Args:
        path (str): Path to the gzip-compressed FASTQ file.

    Returns:
        dict: Number of reads per length class, with the keys
        ``'less_3000'`` (length < 3000), ``'3000_6000'``
        (3000 <= length <= 6000) and ``'more_6000'`` (length > 6000).
    """
    reads_length = {'less_3000': 0,
                    '3000_6000': 0,
                    'more_6000': 0
                   }

    with gzip.open(path, 'rt') as f:
        for line_number, line in enumerate(f):
            # Select the sequence line by its position in the record rather
            # than by its content: a quality line made only of letters would
            # otherwise be counted as a read, and a read starting with 'N'
            # would be skipped.
            if line_number % 4 != 1:
                continue
            read_length = len(line.rstrip('\n'))
            if read_length < 3000:
                reads_length['less_3000'] += 1
            elif read_length <= 6000:
                reads_length['3000_6000'] += 1
            else:
                reads_length['more_6000'] += 1
    return reads_length

In [159]:
# Directory holding the per-sample read files of the rDNA run.
directory = '../raw_data/PacBio/Suthaus_2022_rDNA/cell/'

paths = get_paths(directory)

# data maps each sample name to its read-length class counts.
data = {}
for path in paths:
    # Sample name = file name up to its first '.', then up to its first '_'.
    sample_name = path.rsplit('/', 1)[-1].partition('.')[0].partition('_')[0]
    data[sample_name] = get_reads_length(path)


In [160]:
# One column per sample, one row per length class.
df = pd.DataFrame(data)

In [161]:
# Samples as rows, length classes as columns. The frame is rebuilt from `data`
# instead of transposing `df` in place, so re-running this cell does not flip
# the table back.
df = pd.DataFrame.from_dict(data).T

In [162]:
# Replace the short dictionary keys with readable column labels.
df = df.rename(
    columns={
        'less_3000': 'less_than_3000',
        '3000_6000': 'between_3000-6000',
        'more_6000': 'more_than_6000',
    }
)

In [163]:
# Display the number of reads per length class for every sample.
df

Unnamed: 0,less_than_3000,between_3000-6000,more_than_6000
NH4,20983,4376,1
TS,58872,6306,12
Th40,2074,306,3
FS9,15145,18421,9
X17007,33940,262,2
Sim22,8397,716,1
X579,8138,458,2
Mock,2042,3819,4
FS2,974,817,5
NH1,139162,4829,3
