In [1]:
# Python library to handle paths / filenames / listing directories
import os
# Python Regular Expression library
import re
# The Python data analysis library: https://pandas.pydata.org/
import pandas as pd

In [2]:
# Subdirectory holding the dummy read files. The files themselves are empty,
# which is fine for generating a manifest.
BASEDIR = "reads/"

In [3]:
# First let's grab the contents of our directory.
# os.listdir returns entries in arbitrary order, so sort them: this keeps the
# listing (and everything derived from it below) reproducible from run to run.
filenames = sorted(os.listdir(BASEDIR))
filenames

['CC11CM0_S1_L001_R1_001.fastq.gz',
 'CC11CM0_S1_L001_R2_001.fastq.gz',
 'CC11CM10_S11_L001_R1_001.fastq.gz',
 'CC11CM10_S11_L001_R2_001.fastq.gz',
 'CC11CM11_S12_L001_R1_001.fastq.gz',
 'CC11CM11_S12_L001_R2_001.fastq.gz',
 'CC11CM12_S13_L001_R1_001.fastq.gz',
 'CC11CM12_S13_L001_R2_001.fastq.gz',
 'CC11CM13_S14_L001_R1_001.fastq.gz',
 'CC11CM13_S14_L001_R2_001.fastq.gz',
 'CC11CM14_S15_L001_R1_001.fastq.gz',
 'CC11CM14_S15_L001_R2_001.fastq.gz',
 'CC11CM15_S16_L001_R1_001.fastq.gz',
 'CC11CM15_S16_L001_R2_001.fastq.gz',
 'CC11CM16_S17_L001_R1_001.fastq.gz',
 'CC11CM16_S17_L001_R2_001.fastq.gz',
 'CC11CM17_S18_L001_R1_001.fastq.gz',
 'CC11CM17_S18_L001_R2_001.fastq.gz',
 'CC11CM18_S19_L001_R1_001.fastq.gz',
 'CC11CM18_S19_L001_R2_001.fastq.gz',
 'CC11CM19_S20_L001_R1_001.fastq.gz',
 'CC11CM19_S20_L001_R2_001.fastq.gz',
 'CC11CM1_S2_L001_R1_001.fastq.gz',
 'CC11CM1_S2_L001_R2_001.fastq.gz',
 'CC11CM20_S21_L001_R1_001.fastq.gz',
 'CC11CM20_S21_L001_R2_001.fastq.gz',
 'CC11CM21_S22_L001_

In [4]:
# The filename pattern we built over at https://pythex.org, written one field per line.
# Adjacent raw-string literals are joined by Python into a single pattern string.
re_fn = re.compile(
    r'^(?P<specimen>[A-Za-z0-9]+)'  # letters/digits at the start -> named group "specimen"
    r'_(S\d+)'                      # "S" followed by digits (unnamed group)
    r'_(L\d+)'                      # "L" followed by digits (unnamed group)
    r'_(?P<read>R\d)'               # "R" plus a single digit -> named group "read"
    r'_(\d+)'                       # trailing run of digits (unnamed group)
    r'\.fastq\.gz$'                 # literal ".fastq.gz" anchored at the end
)

In [5]:
# Let's try it out on the first filename only.
# Indexing the list directly replaces the original loop-and-break. If the
# directory listing is empty this raises IndexError immediately, instead of
# leaving `filename_match` unbound (or stale from an earlier run of this cell).
filename = filenames[0]
filename_match = re_fn.search(filename)
filename_match

<re.Match object; span=(0, 31), match='CC11CM0_S1_L001_R1_001.fastq.gz'>

In [6]:
# Match.groupdict() returns a dict of only the *named* groups of the pattern --
# in this case "specimen" and "read". The unnamed groups are not included.
filename_match.groupdict()

{'specimen': 'CC11CM0', 'read': 'R1'}

In [7]:
# A single named group can also be pulled out by name. Match.group('specimen')
# is the method-call spelling of indexing the match object with 'specimen'.
filename_match.group('specimen')

'CC11CM0'

In [8]:
# Build the manifest: one row per specimen (indexed by specimen name), with a
# 'specimen' column plus one column per read label (R1, R2) holding the path
# to the matching file.
#
# Rows are collected in a plain dict and converted to a DataFrame in one step,
# rather than enlarging an initially empty DataFrame cell by cell with .loc,
# which forces pandas to re-grow the frame for every new row and column.
records = {}
for filename in filenames:
    # Use our regular expression to break the filename into parts
    filename_match = re_fn.search(filename)
    if filename_match is None:
        # Show any filename where our regular expression fails, then skip it
        print(filename)
        continue
    specimen = filename_match['specimen']
    # The first file seen for a specimen creates its record, starting with the name
    record = records.setdefault(specimen, {'specimen': specimen})
    # Store the path under the proper read (R1 or R2). If two files share the same
    # specimen and read, the later file overwrites the earlier one.
    record[filename_match['read']] = os.path.join(BASEDIR, filename)
# Columns follow first-seen key order; a read missing for a specimen becomes NaN
manifest = pd.DataFrame(list(records.values()), index=list(records))
manifest
    
    

Unnamed: 0,specimen,R1,R2
CC11CM0,CC11CM0,reads/CC11CM0_S1_L001_R1_001.fastq.gz,reads/CC11CM0_S1_L001_R2_001.fastq.gz
CC11CM10,CC11CM10,reads/CC11CM10_S11_L001_R1_001.fastq.gz,reads/CC11CM10_S11_L001_R2_001.fastq.gz
CC11CM11,CC11CM11,reads/CC11CM11_S12_L001_R1_001.fastq.gz,reads/CC11CM11_S12_L001_R2_001.fastq.gz
CC11CM12,CC11CM12,reads/CC11CM12_S13_L001_R1_001.fastq.gz,reads/CC11CM12_S13_L001_R2_001.fastq.gz
CC11CM13,CC11CM13,reads/CC11CM13_S14_L001_R1_001.fastq.gz,reads/CC11CM13_S14_L001_R2_001.fastq.gz
...,...,...,...
CC11CM96,CC11CM96,reads/CC11CM96_S97_L001_R1_001.fastq.gz,reads/CC11CM96_S97_L001_R2_001.fastq.gz
CC11CM97,CC11CM97,reads/CC11CM97_S98_L001_R1_001.fastq.gz,reads/CC11CM97_S98_L001_R2_001.fastq.gz
CC11CM98,CC11CM98,reads/CC11CM98_S99_L001_R1_001.fastq.gz,reads/CC11CM98_S99_L001_R2_001.fastq.gz
CC11CM99,CC11CM99,reads/CC11CM99_S100_L001_R1_001.fastq.gz,reads/CC11CM99_S100_L001_R2_001.fastq.gz
