In [1]:
# Get a summary of the records in the file

import pandas as pd
import os

# Metrics kept by default when no explicit selection is passed.
_DEFAULT_SUMMARY_METRICS = (
    "raw total sequences",
    "sequences",
    "1st fragments",
    "reads mapped",
    "reads unmapped",
    "reads duplicated",
    "reads MQ0",
    "reads QC failed",
    "non-primary alignments",
    "supplementary alignments",
    "total length",
    "total first fragment length",
    "bases mapped",
    "bases mapped (cigar)",
    "mismatches",
    "error rate",
    "average length",
    "average first fragment length",
    "maximum length",
    "maximum first fragment length",
    "average quality",
)


def extract_summary_numbers(filename, metrics_to_keep=None):
    """Extract selected "Summary Numbers" (SN) rows from a stats report.

    Rows are recognised by their leading ``SN`` tab-separated tag (the same
    selection as the ``grep ^SN | cut -f 2-`` hint printed in the report
    header) instead of by matching the section header lines verbatim. This
    keeps the function working when the header text differs or when the file
    contains only the SN rows.

    Parameters
    ----------
    filename : str or path-like
        Path to the stats file to read.
    metrics_to_keep : list-like of str, optional
        Metric names (without the trailing colon) to keep. Defaults to
        ``_DEFAULT_SUMMARY_METRICS``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Metric`` and ``Value`` (both strings, whitespace-stripped),
        one row per kept metric in file order. The index holds each row's
        position among all SN rows of the file.

    Raises
    ------
    ValueError
        If the file contains no SN rows at all.
    """
    wanted = _DEFAULT_SUMMARY_METRICS if metrics_to_keep is None else metrics_to_keep

    rows = []
    with open(filename, 'r') as f:
        for line in f:
            fields = line.split('\t')
            # Only "SN<TAB>metric:<TAB>value[<TAB># comment]" rows are relevant;
            # anything else (headers, other sections, malformed rows) is skipped.
            if fields[0] != 'SN' or len(fields) < 3:
                continue
            metric = fields[1].strip().rstrip(':')
            value = fields[2].strip()  # also drops the trailing newline
            rows.append((metric, value))

    if not rows:
        raise ValueError(
            "No 'SN' summary-number rows found in {!r}".format(str(filename))
        )

    df = pd.DataFrame(rows, columns=['Metric', 'Value'])

    # Filter the dataframe to keep only the rows with the specified metrics
    return df[df['Metric'].isin(wanted)]


In [2]:
# Directory containing the stats files
directory = "./Users/byeongyeoncho/kosticlab_ev/MappCountFlow/output"

# List all files with the ".sorted.stats" extension.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# column order of the combined table (and of the CSV) stable between runs.
stats_files = sorted(f for f in os.listdir(directory) if f.endswith(".sorted.stats"))

# Fail early with a clear message instead of letting pd.concat complain
# about having nothing to concatenate.
if not stats_files:
    raise FileNotFoundError(f'No ".sorted.stats" files found in {directory!r}')

# Create an empty list to store dataframes
dfs = []

# Loop through each file and apply the function
for stats_file in stats_files:
    filepath = os.path.join(directory, stats_file)
    df = extract_summary_numbers(filepath)
    # Index by metric name so the per-file tables align row-wise in concat
    dfs.append(df.set_index('Metric'))

# Column labels: the filenames with only the final ".stats" suffix removed
# (os.path.splitext strips one extension, so "p2.sorted.stats" -> "p2.sorted")
column_names = [os.path.splitext(f)[0] for f in stats_files]

# Concatenate dataframes side by side; `keys` adds the file label as the top
# level of a two-level column header above each file's "Value" column
result = pd.concat(dfs, axis=1, keys=column_names)

# Print the result
print(result)

# Write the combined table next to the input files
result.to_csv(os.path.join(directory, "manuscript_mapping_stats.csv"))

                              p2_sim1.sorted     p2.sorted wt_sim2.sorted  \
                                       Value         Value          Value   
Metric                                                                      
raw total sequences                    68171         68171         270929   
sequences                              68171         68171         270929   
1st fragments                          68171         68171         270929   
reads mapped                           67892         67273         269306   
reads unmapped                           279           898           1623   
reads duplicated                           0             0              0   
reads MQ0                                601           532           3529   
reads QC failed                            0             0              0   
non-primary alignments                  3328          3094          17025   
supplementary alignments                 107           307             78   