# 25/09/16: Extracting sort counts from FACSMelody sort reports

When performing the four-way nuclei sorts in experiment 383, sort efficiency and sort count data are recorded in a table within a .pdf sort report.
To get through an entire 1/2 mouse brain, I had to run between 2 and 4 sorts per sample. This means that we have to find a way to:
- parse the pdf files for the sort count and sort efficiency data for each population of nuclei
- use the sort efficiency percentage to work back to an accurate total number of nuclei in the original sample
- add together the counts for each population across multiple reports generated from sorting a single brain

In [1]:
import os
import base64
from IPython.display import HTML
import camelot
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
%%bash
# Use some bash to take a look at the structure of the sort data directory.
# Only the last 20 lines of the tree listing are kept so the cell output stays short.
tree /media/tmurphy/windows_mount_u/experiment-383/experiment_383_FANS_data/EXPERIMENT_383_MAIN_SORTS/SORT_DATA_FILES/ | tail -n 20

│       ├── [01;32m25-08-22-experiment-383-sort-71-917445_917445_Sort_002.pdf[0m
│       ├── [01;32m25-08-22-experiment-383-sort-71-917445_917445_Sort_003.pdf[0m
│       └── [01;32m25-08-22-experiment-383-sort-71-917445_917445_Sort_004.pdf[0m
├── [34;42m25-08-22-experiment-383-sort-72-918278[0m
│   ├── [01;32m918278_sortCounts.txt[0m
│   ├── [01;32m918278_sortNotes.txt[0m
│   ├── [34;42mFCS_files[0m
│   │   └── [01;32m25-08-22-experiment-383-sort-72-918278.zip[0m
│   └── [34;42msort_reports[0m
│       ├── [01;32m25-08-22-experiment-383-sort-72-918278_918278_Sort_001.pdf[0m
│       ├── [01;32m25-08-22-experiment-383-sort-72-918278_918278_Sort_002.pdf[0m
│       ├── [01;32m25-08-22-experiment-383-sort-72-918278_918278_Sort_003.pdf[0m
│       └── [01;32m25-08-22-experiment-383-sort-72-918278_918278_Sort_004.pdf[0m
└── [34;42mexp-383-sort-folder-template[0m
    ├── [34;42mFCS_files[0m
    ├── [01;32m_sortCounts.txt[0m
    ├── [01;32m_sortNotes.txt[0m
    └

In [3]:
# We can see that subdirectories are for samples,
# a lower-level subdirectory called sort_reports contains the PDF files we are after.

# change to the sort data directory
# (the paths built below are absolute, so they do not depend on this chdir)
sort_data_path = '/media/tmurphy/windows_mount_u/experiment-383/experiment_383_FANS_data/EXPERIMENT_383_MAIN_SORTS/SORT_DATA_FILES/'
os.chdir(sort_data_path)

# from the output of the previous cell, get the name of the subdirectory for the last sort
sort_72_dir = '25-08-22-experiment-383-sort-72-918278'
sort_rep_dir = 'sort_reports'

# build the path, check the files
sort_72_path = os.path.join(sort_data_path, sort_72_dir, sort_rep_dir)
print(f'Path to reports: {sort_72_path}')
print('\npdf reports:')

# Collect the report names in a list for easy reference later.
# os.listdir returns entries in arbitrary order, so sort them: this makes
# report_list[0] reliably the lowest-numbered report (Sort_001).
report_list = sorted(
    file for file in os.listdir(sort_72_path) if file.endswith(".pdf")
)
for file in report_list:
    print(file)

Path to reports: /media/tmurphy/windows_mount_u/experiment-383/experiment_383_FANS_data/EXPERIMENT_383_MAIN_SORTS/SORT_DATA_FILES/25-08-22-experiment-383-sort-72-918278/sort_reports

pdf reports:
25-08-22-experiment-383-sort-72-918278_918278_Sort_001.pdf
25-08-22-experiment-383-sort-72-918278_918278_Sort_002.pdf
25-08-22-experiment-383-sort-72-918278_918278_Sort_003.pdf
25-08-22-experiment-383-sort-72-918278_918278_Sort_004.pdf


In [4]:
# Display the first PDF report inline so we can gauge what it looks like
# and which fields we want to recover.

# path to first report
pdf_001_path = os.path.join(sort_72_path, report_list[0])

# read the raw bytes of the pdf and base64-encode them for embedding
base64_pdf = base64.b64encode(Path(pdf_001_path).read_bytes()).decode('utf-8')

pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="1000" height="1000" type="application/pdf"></iframe>'

# display pdf
HTML(pdf_display)



In [5]:
# Try a library I found called camelot.
# It offers several parsing methods ("flavors"); compare how many tables
# the stream and network flavors each find in the first report.

tables_by_flavor = {}
for flavor in ('stream', 'network'):
    tables_by_flavor[flavor] = camelot.read_pdf(pdf_001_path, flavor=flavor)
    print(f'{flavor} method found {len(tables_by_flavor[flavor])} tables in the pdf')

stream_tables = tables_by_flavor['stream']
network_tables = tables_by_flavor['network']

stream method found 3 tables in the pdf
network method found 5 tables in the pdf


In [6]:
# Confirm what kind of object camelot.read_pdf returned.
type(network_tables)

camelot.core.TableList

In [7]:
# Examine the tables found by the stream method:
# dimensions plus Camelot's own parsing report for each one.
for i, table in enumerate(stream_tables, start=1):
    print(
        f'Table number {i} has dimensions {table.shape}',
        f'Parse report: \n{table.parsing_report}',
        '',
        sep='\n',
    )

Table number 1 has dimensions (8, 4)
Parse report: 
{'accuracy': 98.54, 'whitespace': 31.25, 'order': 1, 'page': 1}

Table number 2 has dimensions (11, 9)
Parse report: 
{'accuracy': 99.2, 'whitespace': 17.17, 'order': 2, 'page': 1}

Table number 3 has dimensions (24, 9)
Parse report: 
{'accuracy': 98.54, 'whitespace': 50.93, 'order': 3, 'page': 1}



In [8]:
# Examine the tables found by the network method:
# dimensions plus Camelot's own parsing report for each one.
for i, table in enumerate(network_tables, start=1):
    print(
        f'Table number {i} has dimensions {table.shape}',
        f'Parse report: \n{table.parsing_report}',
        '',
        sep='\n',
    )


Table number 1 has dimensions (4, 6)
Parse report: 
{'accuracy': 96.96, 'whitespace': 41.67, 'order': 1, 'page': 1}

Table number 2 has dimensions (9, 6)
Parse report: 
{'accuracy': 93.26, 'whitespace': 40.74, 'order': 2, 'page': 1}

Table number 3 has dimensions (6, 7)
Parse report: 
{'accuracy': 99.43, 'whitespace': 11.9, 'order': 3, 'page': 1}

Table number 4 has dimensions (12, 2)
Parse report: 
{'accuracy': 97.3, 'whitespace': 0.0, 'order': 4, 'page': 1}

Table number 5 has dimensions (25, 9)
Parse report: 
{'accuracy': 95.43, 'whitespace': 52.0, 'order': 5, 'page': 1}



In [9]:
# Let's have a look at the data.
# Pull the `.df` attribute of every table found by the network method,
# keeping the same order as `network_tables` (so index 2 here is table number 3 above).
table_df_list = [table.df for table in network_tables]

In [10]:
# View the first table
table_df_list[0]

Unnamed: 0,0,1,2,3,4,5
0,918278_Sort_001,,,,,
1,\nCYTOMETER INFO,,,,,
2,User Name:,Admin Admin,Application Name:,BD FACSChorus,Cytometer Serial Number:,R6643420008
3,Experiment Name:,25-08-22-experiment-383-sort-72-918278,Application Data Version:,1.4.3.0,Cytometer Name:,FACSMelody


In [11]:
# View the second table: 
table_df_list[1]

Unnamed: 0,0,1,2,3,4,5
0,918278_Sort_001,,,,,
1,\nCYTOMETER INFO,,,,,
2,User Name:,Admin Admin,,Application Name:,BD FACSChorus,
3,Experiment Name:,,25-08-22-experiment-383-sort-72-918278,Application Data Version:,1.4.3.0,Cytometer Name:
4,\nSORT DETAILS,,,,,
5,Sort Mode:,Purity,Sort Status:,Stopped by User,Start Date Time:,08/22/2025 01:28PM
6,Sort Device:,4-Way Tubes 5.0mL,Nozzle Size:,100 micron,End Date Time:,08/22/2025 01:41PM
7,Total Events:,6089316,Pressure:,22.93 PSI,,
8,Processed Events:,98.8%,Drop Frequency:,34.0 kHz,,


In [12]:
# Third table: this is the SORT STATISTICS table we are after - SUCCESS!
# Display the raw dataframe exactly as camelot returned it.
table_df_list[2]

Unnamed: 0,0,1,2,3,4,5,6
0,,SORT STATISTICS,,,,,
1,Tube,Population,Target Count,Sort Count,Sort Rate,Efﬁciency,Time
2,1,microglia,921600,73711,102,68%,12m 0s
3,2,NeuN+ neurons,1388160,1388160,2024,76%,11m 25s
4,3,SOX10+ oligo,1388160,278113,386,70%,12m 0s
5,4,SOX2+ astrocytes,921600,38865,53,66%,12m 0s


In [13]:
# Tidy the raw SORT STATISTICS table into typed, analysis-ready columns.

# Row 0 of the raw table only holds the "SORT STATISTICS" title and row 1
# holds the real headers, so: headers come from row 1 (with spaces removed),
# data from row 2 onwards.
raw_sort_stats = table_df_list[2]
df = raw_sort_stats.iloc[2:].reset_index(drop=True)
df.columns = [string.replace(" ", "") for string in raw_sort_stats.iloc[1]]

# Rename the Efficiency column for clarity (it's a percentage) and remove
# "Tube", "TargetCount" and "Time", which are not useful here.
### WARNING: the pdf header uses the ligature character ﬁ, not the letters f + i ###
df = (
    df.rename(columns={"Efﬁciency": "EfficiencyPct"})
      .drop(columns=["Tube", "TargetCount", "Time"])
)

# "Population"
# We know that the population names in the reports never change.
# Dictionary mapping of report names to preferred names:
pop_mapping = {
    "microglia": "PU1+",
    "NeuN+ neurons": "NeuN+",
    "SOX10+ oligo": "SOX10+",
    "SOX2+ astrocytes": "SOX2+"
}
df["Population"] = df["Population"].replace(pop_mapping)

# Recode the numeric columns from text to integers
# (strip thousands separators / percent signs first).
df['SortCount'] = df['SortCount'].str.replace(",", "", regex=False).astype(int)
df['SortRate'] = df['SortRate'].astype(int)
df['EfficiencyPct'] = df['EfficiencyPct'].str.replace("%", "", regex=False).astype(int)

# Estimate the nuclei count in the sample:
# SortCount / EfficiencyPct * 100, rounded to a whole number of nuclei.
df['EstTotalCount'] = ((df['SortCount'] / df['EfficiencyPct']) * 100).round().astype(int)

In [14]:
df # perfect!

Unnamed: 0,Population,SortCount,SortRate,EfficiencyPct,EstTotalCount
0,PU1+,73711,102,68,108399
1,NeuN+,1388160,2024,76,1826526
2,SOX10+,278113,386,70,397304
3,SOX2+,38865,53,66,58886


## Combining multiple reports from the same sample
Now that we can extract the right information and wrangle it into the correct format, we need to combine the 3 or 4 dataframes generated during each sort. I think the best way to do this is to write a function that takes a directory path as input, then finds all the files that end with the .pdf file extension. 

Next, it should extract the animal number and the sort number from the file name. These should be used for keeping track of which dataframes to combine.

In [15]:
# First function is for getting all the pdf file paths together in a list
def getPDFreports(directory_path):
    '''
    Collect the PDF sort reports found directly inside a directory.

    Only the file extension is checked (case-insensitively); file names are
    not otherwise validated and sub-directories are not searched.

    directory_path: path to a directory containing .pdf cell sorter reports.
    returns: List of absolute paths to the PDF files, sorted by path.
             An empty list is returned (after printing a message) when no
             path is given, when the path is not an existing directory, or
             when the directory contains no PDF files.
    '''
    # check if path provided
    if not directory_path:
        print("No directory path provided.")
        return []
    # check if path exists
    if not os.path.isdir(directory_path):
        print(f"The path '{directory_path}' is not a directory or does not exist.")
        return []

    # ensure absolute path to directory is used
    directory_path = os.path.abspath(directory_path)

    # os.listdir gives entries in arbitrary order; sort so that the reports
    # always come back in the same (Sort_001, Sort_002, ...) order.
    pdfReportList = sorted(
        os.path.join(directory_path, file)
        for file in os.listdir(directory_path)
        if file.lower().endswith('.pdf')
    )
    # handle cases where no pdfs are found
    if not pdfReportList:
        print(f'No PDF files found in {directory_path}')

    return pdfReportList
    
        

In [16]:
# test first function
test1 = getPDFreports(sort_72_path)
test1

['/media/tmurphy/windows_mount_u/experiment-383/experiment_383_FANS_data/EXPERIMENT_383_MAIN_SORTS/SORT_DATA_FILES/25-08-22-experiment-383-sort-72-918278/sort_reports/25-08-22-experiment-383-sort-72-918278_918278_Sort_001.pdf',
 '/media/tmurphy/windows_mount_u/experiment-383/experiment_383_FANS_data/EXPERIMENT_383_MAIN_SORTS/SORT_DATA_FILES/25-08-22-experiment-383-sort-72-918278/sort_reports/25-08-22-experiment-383-sort-72-918278_918278_Sort_002.pdf',
 '/media/tmurphy/windows_mount_u/experiment-383/experiment_383_FANS_data/EXPERIMENT_383_MAIN_SORTS/SORT_DATA_FILES/25-08-22-experiment-383-sort-72-918278/sort_reports/25-08-22-experiment-383-sort-72-918278_918278_Sort_003.pdf',
 '/media/tmurphy/windows_mount_u/experiment-383/experiment_383_FANS_data/EXPERIMENT_383_MAIN_SORTS/SORT_DATA_FILES/25-08-22-experiment-383-sort-72-918278/sort_reports/25-08-22-experiment-383-sort-72-918278_918278_Sort_004.pdf']

In [17]:
import re
import unicodedata

# Helper: extract ONLY Efficiency (as integer %) even if a column is merged like "Efﬁciency\nTime"
# This was added because occasionally the efficiency and time columns were merged
def _ensure_efficiency_pct_only(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    def _canon(s: str) -> str:
        # normalize ligatures (ﬁ -> fi), lowercase, remove whitespace/newlines,
        # keep letters/digits/% only to make matching robust
        s = unicodedata.normalize("NFKD", str(s)).lower()
        s = re.sub(r"\s+", "", s)
        s = s.replace("％", "%").replace("percent", "%").replace("pct", "%")
        return re.sub(r"[^a-z0-9%]+", "", s)

    raw_cols = list(df.columns)
    canon = {c: _canon(c) for c in raw_cols}

    # Case A: already a clean Efficiency column (any variant spelling/ligature)
    eff_col = next((c for c, k in canon.items() if k.startswith("efficiency") and "time" not in k), None)
    # Case B: merged "Efficiency...Time" into one column
    merged_col = next((c for c, k in canon.items() if k.startswith("efficiency") and "time" in k), None)

    src = eff_col or merged_col
    if src is None:
        raise ValueError(f"Could not find an Efficiency column; saw: {raw_cols}")

    # Pull just the percent number from values (works for '66%', '66 % 9m 9s', etc.)
    eff = df[src].astype(str).str.extract(r"(\d+(?:\.\d+)?)\s*%")[0]
    df["EfficiencyPct"] = pd.to_numeric(eff, errors="coerce").round().astype("Int64")

    # Fallback: if no % was found but entries are numeric-like
    if df["EfficiencyPct"].isna().all():
        df["EfficiencyPct"] = (
            df[src].astype(str)
                   .str.replace("%", "", regex=False)
                   .pipe(pd.to_numeric, errors="coerce")
                   .round()
                   .astype("Int64")
        )

    # We don’t need Time at all—drop the source column and any Time column
    df = df.drop(columns=[src], errors="ignore")
    df = df.drop(columns=["Time"], errors="ignore")
    return df

In [18]:
def extractSortStatistics(pdfReportList, *, camelot_flavor="network"):
    '''
    Extract and tidy the "Sort Statistics" table from BD FACS Chorus PDF sort reports.

    Each report is read with Camelot and the third table found (index 2) is
    taken as the sort statistics table. Its title row is dropped, the next row
    becomes the header (ligatures expanded, spaces removed), population names
    are mapped to marker names and the numeric columns are parsed into
    nullable integers. When both SortCount and EfficiencyPct are present,
    EstTotalCount = SortCount * 100 / EfficiencyPct is added (left missing
    where the efficiency is 0 or missing).

    A report that cannot be read or parsed is announced with a "[WARN]" line
    and skipped, so the returned list can be shorter than pdfReportList.
    Callers that sum the results should check for this, otherwise totals for
    the sample are underestimated.

    pdfReportList: list of absolute paths to PDF sort reports for one sample
    camelot_flavor: Camelot parsing flavor handed to camelot.read_pdf.
                    Defaults to "network", the flavor this function has always
                    used in practice.
    returns: list[pd.DataFrame], one cleaned DataFrame per successfully parsed report
    '''
    # mapping for Population names
    pop_mapping = {
        "microglia": "PU1+",
        "NeuN+ neurons": "NeuN+",
        "SOX10+ oligo": "SOX10+",
        "SOX2+ astrocytes": "SOX2+",
    }

    def _norm_header(s: str) -> str:
        # Normalize header text: fix ligatures ('Efﬁciency' -> 'Efficiency')
        # and remove spaces (original letter case is kept)
        s = unicodedata.normalize("NFKD", str(s))
        return s.replace(" ", "")

    sortStatsDfList = []

    for file in pdfReportList:
        try:
            # Read tables; strip embedded newlines to reduce merged-cell issues
            tables = camelot.read_pdf(file, flavor=camelot_flavor, pages="all", strip_text="\n")

            if len(tables) <= 2:
                raise ValueError(f"Expected at least 3 tables; found {len(tables)}")

            # Pull the "Sort Statistics" table (index 2 in these reports)
            df = tables[2].df

            # Drop junk first row, then promote next row to header
            df = df.iloc[1:].reset_index(drop=True)
            new_cols = [_norm_header(c) for c in df.iloc[0].astype(str).tolist()]
            df = df.iloc[1:].reset_index(drop=True)
            df.columns = new_cols

            # Ensure we ONLY keep/produce EfficiencyPct; ignore Time entirely
            df = _ensure_efficiency_pct_only(df)

            # Drop columns we don't want (Time already removed if present)
            df = df.drop(columns=["Tube", "TargetCount", "Time"], errors="ignore")

            # Map population names if the column is present
            if "Population" in df.columns:
                df["Population"] = df["Population"].replace(pop_mapping)

            # Vectorized numeric parsing
            if "SortCount" in df.columns:
                df["SortCount"] = (
                    df["SortCount"].astype(str)
                                   .str.replace(",", "", regex=False)
                                   .pipe(pd.to_numeric, errors="coerce")
                                   .astype("Int64")
                )

            if "SortRate" in df.columns:
                df["SortRate"] = pd.to_numeric(df["SortRate"], errors="coerce").astype("Int64")

            if "EfficiencyPct" in df.columns:
                # ensure nullable int regardless of what the helper returned
                df["EfficiencyPct"] = df["EfficiencyPct"].astype("Int64")

            # Estimated total nuclei in the sample; a 0% efficiency would mean
            # division by zero, so treat it as missing instead.
            if {"SortCount", "EfficiencyPct"}.issubset(df.columns):
                denom = df["EfficiencyPct"].replace(0, pd.NA)
                df["EstTotalCount"] = ((df["SortCount"] * 100) / denom).round().astype("Int64")

            sortStatsDfList.append(df)

        except Exception as e:
            # Deliberate best-effort: one unreadable report should not stop
            # the others from being processed, but it must be visible.
            print(f"[WARN] Skipping {file}: {e}")

    return sortStatsDfList

In [19]:
# test second function
extractSortStatistics(test1)[1]

Unnamed: 0,Population,SortCount,SortRate,EfficiencyPct,EstTotalCount
0,PU1+,53382,97,66,80882
1,NeuN+,1110821,2020,75,1481095
2,SOX10+,213708,388,69,309722
3,SOX2+,30395,55,65,46762


In [20]:
test2 = extractSortStatistics(test1) # it works

In [21]:
def collapseByPopulation(df_list):
    '''
    Sum EstTotalCount per population across the per-report tables of one sample.

    df_list: list of DataFrames, each with "Population" and "EstTotalCount" columns
    returns: DataFrame with columns Population and EstTotalCount (nullable Int64),
             one row per population in the fixed order PU1+, NeuN+, SOX10+, SOX2+.
             A population missing from every report gets 0.
    raises: ValueError if df_list is empty.

    Population labels outside the fixed list are not part of the result;
    a "[WARN]" line is printed so that they do not disappear unnoticed.
    '''
    # Known fixed order for this experiment
    order = ["PU1+", "NeuN+", "SOX10+", "SOX2+"]

    df_list = list(df_list)
    if not df_list:
        raise ValueError("No sort statistics tables to collapse (df_list is empty).")

    # Stack all per-report frames
    df = pd.concat(df_list, ignore_index=True)

    # reindex() below silently discards unknown labels (e.g. a population name
    # that was not covered by the name mapping), so flag them first.
    unexpected = [p for p in df["Population"].dropna().unique() if p not in order]
    if unexpected:
        print(f"[WARN] Ignoring unexpected population label(s): {unexpected}")

    # Sum by population and enforce the desired order, filling any missing with 0
    out = (df.groupby("Population", as_index=False, sort=False)["EstTotalCount"].sum()
             .set_index("Population")
             .reindex(order, fill_value=0)
             .reset_index())

    # Nullable integer for clean downstream use
    out["EstTotalCount"] = out["EstTotalCount"].astype("Int64")
    return out

In [22]:
collapseByPopulation(test2)

Unnamed: 0,Population,EstTotalCount
0,PU1+,300484
1,NeuN+,5392996
2,SOX10+,1154194
3,SOX2+,174691


In [23]:
# --- helper: pick a 6-digit sample id from the PDFs in a folder ---
# Takes the most common 6-digit number in all the filenames in one sub-folder to 
# account for typos in the filenames. 

def _extract_sample_id_from_pdfs(pdf_paths) -> str | None:
    """
    Given a list of PDF paths, find 6-digit numbers in filenames.
    Returns the most frequent 6-digit string (or the first found if tie).
    Returns None if no 6-digit number is found.
    """
    sixes = []
    pat = re.compile(r"(\d{6})") # regex to identify 6 digit num.
    for p in pdf_paths:
        m = pat.findall(Path(p).name)
        if m:
            sixes.extend(m)
    if not sixes:
        return None
    counts = Counter(sixes).most_common()
    # most frequent 6-digit id
    return counts[0][0]

In [24]:
# Function to loop through a user-provided data directory and find sub-folders with sort reports in

def batchProcessReports(root_dir,
                       *,
                       out_dir=None,
                       table_index=2,
                       camelot_flavor="network",
                       camelot_kwargs=None,
                       overwrite=True):
    '''
    Traverse a user-supplied experiment 'root_dir' and process every
    '<sample folder>/sort_reports' sub-folder that contains PDF reports
    generated by BD Biosciences cell sorters. For each sample, the sort
    statistics table of every PDF is parsed with extractSortStatistics, the
    estimated counts from the different sorts of the same sample are summed
    with collapseByPopulation, a 'sample_id' column is added, and the totals
    are saved as '<sample folder name>.csv'.

    Samples without PDFs, or for which no report could be parsed, are skipped
    with a message. If only some reports of a sample could be parsed, a
    warning is printed because the totals are then underestimates.

    Parameters
    ----------
    root_dir : str or Path
        Directory whose immediate children are sample folders (each with sort_reports/).
    out_dir : str or Path or None
        Where to write CSVs. If None, write into each sample folder.
    table_index : int
        Accepted for API compatibility; currently not forwarded by this function.
    camelot_flavor : str
        Camelot flavor, passed on to extractSortStatistics.
    camelot_kwargs : dict or None
        Accepted for API compatibility; currently not forwarded by this function.
    overwrite : bool
        Overwrite existing CSV files if present.

    Returns
    -------
    dict[sample_folder_name, collapsed_df_with_sample_id]

    Raises
    ------
    NotADirectoryError
        If root_dir is not an existing directory.
    '''
    root = Path(root_dir).resolve()
    if not root.is_dir():
        raise NotADirectoryError(f"{root} is not a directory")

    out_base = Path(out_dir).resolve() if out_dir else None
    if out_base:
        out_base.mkdir(parents=True, exist_ok=True)

    results = {}

    # glob order is filesystem dependent; sort for a reproducible run order
    for sort_dir in sorted(root.glob("*/sort_reports")):
        if not sort_dir.is_dir():
            continue

        sample_dir = sort_dir.parent
        sample_folder_name = sample_dir.name

        # collect PDFs
        pdfs = sorted(p.resolve() for p in sort_dir.glob("*.pdf"))
        if not pdfs:
            print(f"[INFO] No PDFs in {sort_dir}; skipping {sample_folder_name}")
            continue
        pdf_paths = [str(p) for p in pdfs]

        # derive sample_id from the filenames (most frequent 6-digit number)
        sample_id = _extract_sample_id_from_pdfs(pdf_paths)
        if sample_id is None:
            print(f"[WARN] No 6-digit sample id found in PDFs under {sort_dir}. Using folder name.")
            sample_id = sample_folder_name  # fallback

        # extract
        df_list = extractSortStatistics(pdf_paths, camelot_flavor=camelot_flavor)
        if not df_list:
            # nothing to sum; do not let one bad sample abort the whole batch
            print(f"[WARN] No report could be parsed in {sort_dir}; skipping {sample_folder_name}")
            continue
        if len(df_list) < len(pdfs):
            print(f"[WARN] Only {len(df_list)} of {len(pdfs)} reports were parsed for "
                  f"{sample_folder_name}; its totals are underestimates.")

        # collapse
        collapsed = collapseByPopulation(df_list)

        # add sample_id column to the collapsed 4-row df
        collapsed.insert(0, "sample_id", str(sample_id))

        # decide output path (still name CSV after parent folder)
        target_dir = out_base if out_base else sample_dir
        out_path = target_dir / f"{sample_folder_name}.csv"

        if out_path.exists() and not overwrite:
            print(f"[INFO] Exists, not overwriting: {out_path}")
        else:
            collapsed.to_csv(out_path, index=False)
            print(f"[OK] Wrote {out_path}")

        results[sample_folder_name] = collapsed

    return results

In [30]:
#  try it out
outDir1 = Path.home() / "phd_work/experiment_383/exp_383_flowCytometry_analysis/output/per_sample_cell_counts"
# create the output folder (and any missing parents); no error if it already exists
outDir1.mkdir(parents=True, exist_ok=True)

# Keep whatever the function returns in a variable, so that this cell's
# output is just the progress messages printed while it runs.
per_sample_counts = batchProcessReports(sort_data_path,
                                        out_dir=outDir1,
                                        table_index=2,
                                        camelot_flavor="network",
                                        camelot_kwargs=None,
                                        overwrite=True
                                        )

[OK] Wrote /home/tmurphy/phd_work/experiment_383/exp_383_flowCytometry_analysis/output/per_sample_cell_counts/25-05-07-experiment-383-sort-1-918310.csv
[OK] Wrote /home/tmurphy/phd_work/experiment_383/exp_383_flowCytometry_analysis/output/per_sample_cell_counts/25-05-07-experiment-383-sort-2-917423.csv
[OK] Wrote /home/tmurphy/phd_work/experiment_383/exp_383_flowCytometry_analysis/output/per_sample_cell_counts/25-05-08-experiment-383-sort-3-918309.csv
[OK] Wrote /home/tmurphy/phd_work/experiment_383/exp_383_flowCytometry_analysis/output/per_sample_cell_counts/25-05-09-experiment-383-sort-4-916462.csv
[OK] Wrote /home/tmurphy/phd_work/experiment_383/exp_383_flowCytometry_analysis/output/per_sample_cell_counts/25-05-09-experiment-383-sort-5-918277.csv
[OK] Wrote /home/tmurphy/phd_work/experiment_383/exp_383_flowCytometry_analysis/output/per_sample_cell_counts/25-05-13-experiment-383-sort-6-917432.csv
[OK] Wrote /home/tmurphy/phd_work/experiment_383/exp_383_flowCytometry_analysis/output/p

In [31]:
# Function to concatenate csv files 
def merge_csvs(csv_directory, *, axis=0, out_dir=None, filename="combinedDf.csv", keep_order=True):
    """
    Concatenate all CSV files in `csv_directory` along the given axis and write a single CSV.
    - axis=0: row-wise stack (headers align by name). Index is reset.
    - axis=1: column-wise merge (each DF's index is reset first).

    The output file itself is never used as an input. This matters when the
    result is written into `csv_directory` (the default): without the
    exclusion, running the function a second time would merge the previous
    result back in and duplicate every row.

    csv_directory: directory holding the .csv files to combine.
    out_dir: directory for the combined file; defaults to `csv_directory`.
    filename: name of the combined file (".csv" is appended if missing).
    keep_order: True keeps the order in which the directory listing yields the
                files (not guaranteed to be stable); False sorts the file
                paths lexicographically for a reproducible order.

    Returns the combined DataFrame.
    Raises NotADirectoryError, FileNotFoundError (no input CSVs) or
    ValueError (axis not 0 or 1).
    """
    csv_dir = Path(csv_directory).resolve()
    if not csv_dir.is_dir():
        raise NotADirectoryError(f"{csv_dir} is not a directory")

    # Work out the output path first so it can be excluded from the inputs
    out_base = Path(out_dir).resolve() if out_dir else csv_dir
    # Ensure .csv suffix
    out_name = filename if str(filename).lower().endswith(".csv") else f"{filename}.csv"
    out_path = (out_base / out_name).resolve()

    # Collect CSV files, leaving out a combined file from an earlier run
    files = [p for p in csv_dir.glob("*.csv") if p.resolve() != out_path]
    if not keep_order:
        files = sorted(files)  # stable, lexicographic order

    if not files:
        raise FileNotFoundError(f"No .csv files found in {csv_dir}")

    # Read all CSVs
    dfs = [pd.read_csv(p) for p in files]

    # Concatenate
    if axis == 0:
        combined = pd.concat(dfs, axis=0, ignore_index=True)  # header appears once when writing
    elif axis == 1:
        dfs = [d.reset_index(drop=True) for d in dfs]
        combined = pd.concat(dfs, axis=1)
    else:
        raise ValueError("axis must be 0 (rows) or 1 (columns)")

    # Write once with a single header
    out_base.mkdir(parents=True, exist_ok=True)
    combined.to_csv(out_path, index=False)

    return combined

In [33]:
outDir2 = Path.home() / "phd_work/experiment_383/exp_383_flowCytometry_analysis/output/combined_counts"
# create the folder together with any missing parent folders;
# no error if it already exists
outDir2.mkdir(parents=True, exist_ok=True)

# keep_order=False -> input files are combined in sorted (reproducible) order
combined_counts = merge_csvs(outDir1,
                             axis=0,
                             out_dir=outDir2,
                             filename="Exp_383_FANS_combined_counts",
                             keep_order=False)
combined_counts

Unnamed: 0,sample_id,Population,EstTotalCount
0,918310,PU1+,144824
1,918310,NeuN+,4359464
2,918310,SOX10+,668362
3,918310,SOX2+,249886
4,917423,PU1+,256364
...,...,...,...
283,917445,SOX2+,358748
284,918278,PU1+,300484
285,918278,NeuN+,5392996
286,918278,SOX10+,1154194
