In [5]:
"""
Data parsers for files generated from different types of assays:
- ELISA (Enzyme-linked immunosorbent assay):
    -> data files are generated by a microplate reader from 96-well plates
- Softmax:
    -> another microplate reader for ELISA assays that allows for more extensive 
       data analysis for 96 and 384-well plates
- FACS (Fluorescence-activated cell sorting):
    -> data files are generated by different FACS machines and preprocessed with FlowJo. 
    -> the output data files are parsed
- Biacore
- Biacore4000
- MSD
Some machine-generated data is preprocessed with dedicated software before it is parsed, e.g. FlowJo for FACS.
"""

from StringIO import StringIO
import re
import os
import csv
import pprint

# init pprint: shared pretty-printer (2-space indent) used by the manual test
# snippets in this file to dump the parsed plates
pp = pprint.PrettyPrinter(indent=2)

In [6]:
def straight_elisa_parse_file(data_file):
    """ Parsing Straight ELISA files. Used for a single and multiple plates in one file.

    A plate starts at a line of the form "Plate:<TAB>name<TAB>...". Every
    following line made of a leading tab, an optional non-alphabetic cell and
    exactly 12 tab-separated non-negative numbers is one row of that plate.
    All other lines are ignored.

    Args:
        data_file: seekable file-like object to parse; it is rewound before reading
    Returns:
        plates: a dictionary mapping each plate name to the flat list of its
            values (floats) in file order. A plate header without value rows
            yields an empty list; a repeated plate name starts a fresh list.
    Raises:
        ValueError: if a row of values appears before any "Plate:" header
    """
    data_file.seek(0)
    # Turn carriage returns into newlines so files with CR or CRLF line
    # endings are split into lines as well (extra blank lines are ignored).
    data_file = StringIO(data_file.read().replace('\r', '\n'))
    plate_pattern = re.compile(r"^Plate:\t([^\t]+)\t")
    values_pattern = re.compile(r"^\t[^\t()A-Za-z]*((\t\d+(\.\d+)?){%d})\s*$" % 12)
    plates = {}
    # List collecting the values of the plate currently being read
    current_values = None
    for line in data_file:
        plate_match = plate_pattern.match(line)
        if plate_match:
            current_values = []
            plates[plate_match.group(1).strip()] = current_values
            continue
        values_match = values_pattern.match(line)
        if not values_match:
            continue
        if current_values is None:
            # Fail with a clear message instead of an unbound-variable error
            raise ValueError("Row of values found before any 'Plate:' header")
        cells = values_match.group(1).strip().split("\t")
        current_values.extend(float(cell) for cell in cells)
    return plates

# Manual smoke test: parse a sample file with straight_elisa_parse_file and
# pretty-print the resulting {plate name: [values]} dictionary.
# test a single 96 ELISA plate
#file_name = 'Data/1_straight_elisa.txt'
# test multiple 96 ELISA plates | NOTE: make sure to test with an original file (not a split one)
file_name = 'Data/9_splitted_384.txt'
with open(file_name) as data_file: 
    plates = straight_elisa_parse_file(data_file)
    pp.pprint(plates)

{ 'A3792.Ag-79-01': [ 0.1133,
                      0.116,
                      0.101,
                      0.1139,
                      0.1051,
                      0.1179,
                      0.1309,
                      0.1234,
                      0.1148,
                      0.121,
                      0.1157,
                      0.1172,
                      0.1273,
                      0.1197,
                      0.1164,
                      0.1178,
                      0.1102,
                      0.0914,
                      0.117,
                      0.1223,
                      0.1287,
                      0.1292,
                      0.1287,
                      0.1244,
                      0.1293,
                      0.108,
                      0.1143,
                      0.1014,
                      0.0968,
                      0.1014,
                      0.1115,
                      0.112,
                      0.1049,
                

In [7]:
def softmax_parse_file(data_file, cols=24):
    """ Parsing 384 ELISA files. Used for a single and multiple plates in one file.

    A plate starts at a line of the form "Plate:<TAB>name<TAB>...". Every
    following line made of a leading tab, an optional non-alphabetic cell and
    exactly `cols` tab-separated non-negative numbers is one row of that
    plate. All other lines are ignored.

    Args:
        data_file: file-like object to parse; it is read from its current position
        cols: number of value columns per plate row (defaults to 24)
    Returns:
        plates: a dictionary mapping each plate name to the flat list of its
            values (floats) in file order. Plates without any value row are
            not included; rows under a repeated plate name are appended to
            the existing list.
    Raises:
        ValueError: if a row of values appears before any "Plate:" header
    """
    # Turn carriage returns into newlines so files with CR or CRLF line
    # endings are split into lines as well (extra blank lines are ignored).
    data_file = StringIO(data_file.read().replace('\r', '\n'))
    plate_pattern = re.compile(r"^Plate:\t([^\t]+)\t")
    values_pattern = re.compile(r"^\t[^\t()A-Za-z]*((\t\d+(\.\d+)?){%d})\s*$" % cols)
    plates = {}
    current_plate_name = None
    for line in data_file:
        plate_match = plate_pattern.match(line)
        if plate_match:
            current_plate_name = plate_match.group(1).strip()
            continue
        value_match = values_pattern.match(line)
        if not value_match:
            continue
        if current_plate_name is None:
            # Fail with a clear message instead of an IndexError on an empty list
            raise ValueError("Row of values found before any 'Plate:' header")
        cells = value_match.group(1).strip().split("\t")
        plates.setdefault(current_plate_name, []).extend(float(cell) for cell in cells)
    return plates

# Manual smoke test: parse a sample 384-well file with softmax_parse_file and
# pretty-print the resulting {plate name: [values]} dictionary.
# The commented-out assignments are alternative sample files.
# test a single 384 ELISA plate
#file_name = 'Data/2_elisa_384.txt'
file_name = 'Data/2a_multiple_elisa_384.txt'
#file_name = 'Data/5_elisa_384_no_temp.txt'
with open(file_name) as data_file:
    plates = softmax_parse_file(data_file)
    pp.pprint(plates)

{ 'PTH cross screen S3210-01 and 02': [ 0.0548,
                                        0.0511,
                                        0.0546,
                                        0.0551,
                                        0.117,
                                        0.0568,
                                        0.1388,
                                        0.0957,
                                        0.3267,
                                        0.2228,
                                        0.1439,
                                        0.0761,
                                        0.2641,
                                        0.13,
                                        0.0924,
                                        0.1247,
                                        0.1051,
                                        0.0799,
                                        0.1259,
                                        0.1028,
                                        0.4

In [8]:
def get_matrix_pos(col, row, cols=24):
    """ From col and row show position in the matrix.
    Args:
        col: 1-based column index
        row: 1-based row index
        cols: number of columns in one matrix row (defaults to 24)
    Returns:
        The 1-based position of the cell when cells are numbered row by row.
    """
    return col + cols * (row - 1)

def split_4_quadrants(data_file):
    """ Split every plate of a 384 file into its 4 rectangular quadrants:
            UL (Upper Left) | UR (Upper Right)
            LL (Lower Left) | LR (Lower Right)
    Each quadrant spans 12 columns x 8 rows of the 24-column plate matrix.
    Args:
        data_file: ELISA 384-well plate file, parsed with softmax_parse_file
    Returns:
        A dictionary mapping "<quadrant>_<plate name>" to the list of the
        quadrant's values, ordered row by row and left to right.
    """
    parsed_plates = softmax_parse_file(data_file)
    # Inclusive, 1-based (first, last) bounds: columns first, then rows
    bounds = {'UL': ((1, 12), (1, 8)),
              'UR': ((13, 24), (1, 8)),
              'LL': ((1, 12), (9, 16)),
              'LR': ((13, 24), (9, 16))
              }
    result = {}
    for source_name, well_values in parsed_plates.iteritems():
        for label, (col_bounds, row_bounds) in bounds.iteritems():
            first_col, last_col = col_bounds
            first_row, last_row = row_bounds
            # quadrant plate name is: "quadrant_plate"
            bucket = result.setdefault("{0}_{1}".format(label, source_name), [])
            for row in range(first_row, last_row + 1):
                for col in range(first_col, last_col + 1):
                    # matrix positions are 1-based, list indexes 0-based
                    bucket.append(well_values[get_matrix_pos(col, row) - 1])
    return result

# Manual smoke test: split a sample 384-well file with split_4_quadrants and
# pretty-print the resulting {"<quadrant>_<plate name>": [values]} dictionary.
# test a single 384 ELISA - splitting into 4 quadrants
file_name = 'Data/2_elisa_384.txt'
with open(file_name) as data_file: 
    plates = split_4_quadrants(data_file)
    pp.pprint(plates)

{ 'LL_PTH cross screen S3210-01 and 02': [ 0.083,
                                           0.0516,
                                           0.0504,
                                           0.0519,
                                           0.0531,
                                           0.0517,
                                           0.0736,
                                           0.0519,
                                           0.0553,
                                           0.0527,
                                           0.0763,
                                           0.0558,
                                           0.0844,
                                           0.0494,
                                           0.051,
                                           0.0502,
                                           0.0537,
                                           0.0527,
                                           0.0747,
                                 

In [9]:
def prepare_default_dict(plates):
    """ Build the empty result dictionary for the quadrant split.
    Quadrant labels are used to tell the resulting plates apart; user-defined
    plate names will be used in the future.
    Args:
        plates: a list of plate names being parsed
    Returns:
        A dictionary with one "<quadrant>_<plate name>" key per plate and
        quadrant label (UL, UR, LL, LR), each mapped to its own empty list.
    """
    labels = ['UL', 'UR', 'LL', 'LR']
    return dict(
        ("{0}_{1}".format(label, name), [])
        for name in plates
        for label in labels
    )

def get_matrix(rows, cols):
    """ List every 1-based coordinate pair of a matrix.
    Args:
        rows: upper bound (inclusive) of the second element of each pair
        cols: upper bound (inclusive) of the first element of each pair
    Return:
        A list of (i, j) tuples with i running from 1 to cols (outer loop)
        and j running from 1 to rows (inner loop).
    """
    pairs = []
    for outer in range(1, cols + 1):
        for inner in range(1, rows + 1):
            pairs.append((outer, inner))
    return pairs

def update_pos_value(plate_name, quadrant, pos, values, quadrant_plates):
    """ Append the value found at a 1-based position to a quadrant plate.
    Args:
        plate_name: name of the source plate
        quadrant: quadrant label; the target key is "<quadrant>_<plate_name>"
        pos: 1-based position of the value inside values
        values: list of values of the source plate
        quadrant_plates: dictionary of lists, updated in place
    Returns:
        None (the result of list.append).
    """
    target_key = "{0}_{1}".format(quadrant, plate_name)
    # Look the target list up first, then read the value (pos is 1-based)
    bucket = quadrant_plates[target_key]
    return bucket.append(values[pos - 1])

def split_96_quadrants(data_file):
    """ Split every 384 plate into four interleaved plates in the following format:
          1 2 3 4 ...
        1 A B A B ...
        2 C D C D ...
        .
        .
        where A == Plate 1, B == Plate 2, C == Plate 3, D == Plate 4
    Args:
        data_file: ELISA 384-well plate file, parsed with softmax_parse_file
    Returns:
        A dictionary mapping "<quadrant>_<plate name>" to the list of values
        assigned to that quadrant, collected row by row and left to right.
    """
    parsed_plates = softmax_parse_file(data_file)
    # start from one empty list per plate and quadrant label
    quadrant_plates = prepare_default_dict(parsed_plates.keys())
    # every (row, column) pair of the 24-column x 16-row matrix, row by row
    cells = get_matrix(24, 16)
    # (column is odd, row is odd) -> quadrant label
    # NOTE(review): with this mapping 'UR' collects odd-column/even-row wells
    # (C in the diagram) and 'LL' collects even-column/odd-row wells (B in
    # the diagram) - confirm the labels are intended this way.
    label_by_parity = {
        (True, True): 'UL',
        (True, False): 'UR',
        (False, True): 'LL',
        (False, False): 'LR',
    }
    for source_name, well_values in parsed_plates.iteritems():
        for pos_y, pos_x in cells:
            label = label_by_parity[(pos_x % 2 != 0, pos_y % 2 != 0)]
            matrix_pos = get_matrix_pos(pos_x, pos_y)
            update_pos_value(source_name, label, matrix_pos, well_values, quadrant_plates)

    return quadrant_plates

# Manual smoke test: split a sample 384-well file with split_96_quadrants and
# pretty-print the resulting {"<quadrant>_<plate name>": [values]} dictionary.
file_name = 'Data/2_elisa_384.txt'
with open(file_name) as data_file:
    plates = split_96_quadrants(data_file)
    pp.pprint(plates)

{ 'LL_PTH cross screen S3210-01 and 02': [ 0.0511,
                                           0.0551,
                                           0.0568,
                                           0.0957,
                                           0.2228,
                                           0.0761,
                                           0.13,
                                           0.1247,
                                           0.0799,
                                           0.1028,
                                           0.0716,
                                           0.0636,
                                           0.0526,
                                           0.0521,
                                           0.0511,
                                           0.0535,
                                           0.0538,
                                           0.0538,
                                           0.0594,
                                 

In [10]:
# Matches a FACS sample name of the form "<prefix>_<letter><digits>_<well>",
# where <well> is an uppercase letter followed by two digits (e.g. "A01");
# the well is captured in the named group "well".
FACS_PATTERN = re.compile(r'[\w\d_-]+_[A-Z]\d+_(?P<well>[A-Z]\d{2})')
# Captures an antigen identifier: "Ag" followed by one or more digits.
ANTIGEN_PATTERN = re.compile(r'(Ag\d+)')
# Row letters; the index of a letter is its zero-based row number.
ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"


def well_name_position(well_name, columns=12):
    '''Convert a well name to its 1-based position on the plate.

    Accepted (case-insensitive) forms are an optional row letter A-W followed
    by one or two digits, e.g. "A01", "H12" or the absolute position "96".

    Args:
        well_name: the well name to convert
        columns: number of wells per plate row, used when a row letter is given
    Returns:
        row index * columns + column for lettered names (row A has index 0),
        otherwise the number itself.
    Raises:
        ValueError: if well_name does not have one of the accepted forms
    '''
    parsed = re.match(r"^(?P<row>[A-W])?(?P<column>\d{1,2})$", well_name.upper())
    if parsed is None:
        raise ValueError("Invalid well_name: %s" % str(well_name))
    row_letter = parsed.group("row")
    if not row_letter:
        # No row letter: the number is already an absolute position
        return int(parsed.group("column"))
    row_index = ALPHABET.index(row_letter)
    column_number = int(parsed.group("column"))
    return (row_index * columns) + column_number


def antigen_header(antigen_header):
    """ Return antigen name.
    Args:
        antigen_header: column header text to search
    Returns:
        The first ANTIGEN_PATTERN match found in the header, or the header
        itself unchanged when it contains no match.
    """
    found = ANTIGEN_PATTERN.search(antigen_header)
    if found is None:
        return antigen_header
    return found.group(1)


def facs_parse_file(data):
    """ Parsing ACCURI and FLOWJO FACS files.

    The file is read as delimited text whose dialect is sniffed from its
    content. The first non-empty row is the header: its first cell is ignored
    and every other cell is reduced to an antigen name with antigen_header.
    Each following row whose first cell matches FACS_PATTERN contributes one
    value per antigen column; other rows are skipped.

    Args:
        data: seekable file-like object to parse; it must expose a `name`
            attribute, whose base name without extension is used as plate name
    Returns:
        plates: a dictionary mapping "<plate name> - <antigen name>" to the
            list of that column's values (floats) in row order
    Raises:
        ValueError: if a well name is rejected by well_name_position or a
            cell cannot be converted to float
    """
    # The parameter is named `data` for backward compatibility. Bind it to a
    # local name so the function reads its own argument instead of relying on
    # a module-level `data_file` variable happening to exist.
    data_file = data
    data_file.seek(0)
    filename = os.path.split(data_file.name)[1]
    # Turn carriage returns into newlines so CR / CRLF files are split into rows
    content = data_file.read().replace('\r', '\n')
    reader = csv.reader(StringIO(content), dialect=csv.Sniffer().sniff(content))
    rows = [row for row in reader if row]
    default_name = os.path.splitext(filename)[0]
    antigen_names = [antigen_header(c) for c in rows[0][1:]]
    plates = {}
    for row in rows[1:]:
        match = FACS_PATTERN.search(row[0])
        if not match:
            continue
        groups = match.groupdict()
        # Only validates the well name (raises ValueError when it cannot be
        # parsed); the computed position itself is not used.
        well_name_position(groups['well'])
        # Falls back to the file name when the pattern has no "plate" group
        plate_name = groups.get('plate', default_name)
        for column, ag_name in zip(row[1:], antigen_names):
            full_plate_name = "{0} - {1}".format(plate_name, ag_name)
            plates.setdefault(full_plate_name, []).append(float(column))
    return plates

# Manual smoke test: parse a sample FACS export with facs_parse_file and
# pretty-print the resulting {"<plate> - <antigen>": [values]} dictionary.
# The commented-out assignment is an alternative sample file.
# file_name = 'Data/6_facs_96.txt'
file_name = 'Data/7_facs_384.txt'
with open(file_name) as data_file:
    plates = facs_parse_file(data_file)
    pp.pprint(plates)

{ '7_facs_384 - CHOKO 22': [ 139.0,
                             132.0,
                             140.0,
                             101.0,
                             1311.0,
                             1851.0,
                             125.0,
                             1199.0,
                             383.0,
                             4100.0,
                             3663.0,
                             303.0,
                             293.0,
                             118.0,
                             184.0,
                             877.0,
                             2878.0,
                             1656.0,
                             182.0,
                             262.0,
                             165.0,
                             215.0,
                             229.0,
                             5221.0,
                             130.0,
                             119.0,
                             122.0,
                    

In [11]:
def biacore_parse_file(data_file):
    """ Parsing Biacore files.

    The file is read as delimited text whose dialect is sniffed from its
    content. Every non-empty row must hold exactly three cells: plate name,
    well name and value.

    Args:
        data_file: seekable file-like object to parse; it is rewound before reading
    Returns:
        plates: a dictionary mapping each plate name to the list of its
            values (floats) in row order
    Raises:
        ValueError: if a row does not have 3 columns, its well name is
            rejected by well_name_position, or its value is not a number
    """
    data_file.seek(0)
    # Turn carriage returns into newlines so CR / CRLF files are split into rows
    text = data_file.read().replace('\r', '\n')
    dialect = csv.Sniffer().sniff(text)
    records = [record for record in csv.reader(StringIO(text), dialect=dialect) if record]
    plates = {}
    for record in records:
        if len(record) != 3:
            raise ValueError("Every row should have 3 columns")
        raw_name, well, raw_value = record
        plate_name = raw_name.strip()
        # Only validates the well name; the computed position is not used
        well_name_position(well)
        plates.setdefault(plate_name, []).append(float(raw_value))
    return plates


def biacore4000_parse_file(data_file):
    """ Parsing Biacore4000 files.

    The file is read as delimited text whose dialect is sniffed from its
    content. The first non-empty row is skipped as header. In every other
    row, column 8 holds the sample name, column 6 the antigen and column 4
    the value (1-based). Rows whose sample name does not end in
    "_<character><two digits>" are skipped.

    Args:
        data_file: seekable file-like object to parse; it is rewound before reading
    Returns:
        plates: a dictionary mapping "<sample name without suffix> - <antigen>"
            to the list of matching values (floats) in row order
    """
    data_file.seek(0)
    # Turn carriage returns into newlines so CR / CRLF files are split into rows
    text = data_file.read().replace('\r', '\n')
    dialect = csv.Sniffer().sniff(text)
    records = [record for record in csv.reader(StringIO(text), dialect=dialect) if record]
    plates = {}
    # skip the first line
    for record in records[1:]:
        sample_name = record[7]
        antigen = record[5]
        raw_value = record[3]
        parsed = re.match(r'^(.+)_(\w\d{2})$', sample_name)
        if parsed is None:
            continue
        key = '{0} - {1}'.format(parsed.group(1), antigen)
        plates.setdefault(key, []).append(float(raw_value))
    return plates


# Manual smoke tests: parse one sample file with each Biacore parser and
# pretty-print the resulting dictionaries, each under a banner line.
print "Biacore: " + "====="*10
# Alternative sample file:
#file_name = 'Data/3_biacore.csv'
file_name_biacore = 'Data/3a_biacore_multiple.csv'
with open(file_name_biacore) as data_file:
    plates = biacore_parse_file(data_file)
    pp.pprint(plates)


print "Biacore4000: " + "====="*10

file_name_biacore_4000 = 'Data/4_biacore_4000.txt'
with open(file_name_biacore_4000) as data_file:
    plates = biacore4000_parse_file(data_file)
    pp.pprint(plates)
    
    

{ '260.B1.8B2.Ag104-01': [ 7.7,
                           8.0,
                           470.5,
                           856.9,
                           871.9,
                           818.7,
                           716.5,
                           514.9,
                           432.4,
                           594.8,
                           587.4,
                           938.0,
                           245.0,
                           428.5,
                           587.7,
                           226.9,
                           195.9,
                           400.3,
                           329.1,
                           452.0,
                           436.3,
                           449.8,
                           739.6,
                           827.5,
                           543.9,
                           269.4,
                           147.1,
                           125.2,
                           412.8,
                  

In [12]:
def msd_parse_file(data_file):
    """ Parsing MSD files.

    A plate starts at a line of the form "Plate #  :  name". Every following
    line made of one or more uppercase letters and exactly 12
    whitespace-separated integers (optionally negative) is one row of that
    plate. All other lines are ignored.

    Args:
        data_file: file-like object to parse; it is read from its current position
    Returns:
        plates: a dictionary mapping each plate name to the flat list of its
            values (floats) in file order. Plates without any value row are
            not included; rows under a repeated plate name are appended to
            the existing list.
    Raises:
        ValueError: if a row of values appears before any "Plate #" header
    """
    # Turn carriage returns into newlines so files with CR or CRLF line
    # endings are split into lines as well (extra blank lines are ignored).
    data_file = StringIO(data_file.read().replace('\r', '\n'))
    plate_pattern = re.compile(r"^Plate #\s+:\s+([^\s]+)\s*$")
    values_pattern = re.compile(r"^[A-Z]+((\s+-?\d+){%d})\s*$" % 12)
    plates = {}
    current_plate_name = None
    for line in data_file:
        plate_match = plate_pattern.match(line)
        if plate_match:
            current_plate_name = plate_match.group(1)
            continue
        values_match = values_pattern.match(line)
        if not values_match:
            continue
        if current_plate_name is None:
            # Fail with a clear message instead of an IndexError on an empty list
            raise ValueError("Row of values found before any 'Plate #' header")
        cells = values_match.group(1).strip().split()
        plates.setdefault(current_plate_name, []).extend(float(cell) for cell in cells)
    return plates

# Manual smoke test: parse a sample MSD file with msd_parse_file and print,
# for every plate, its name, the number of parsed values and the values.
# The commented-out assignment is an alternative sample file.
# file_name = 'Data/8_msd_single_plate.txt'
file_name = 'Data/8a_msd_multiple_plates.txt'
with open(file_name) as data_file:
    plates = msd_parse_file(data_file)
    for plate, values in plates.iteritems():
        print plate, " : ", len(values)
        print values

test-plate-1  :  96
[46.0, 41.0, 44.0, 58.0, 39.0, 49.0, 41.0, 36.0, 42.0, 40.0, 43.0, 34.0, 32.0, 38.0, 36.0, 27.0, 34.0, 37.0, 40.0, 29.0, 263.0, 41.0, 85.0, 63.0, 51.0, 68.0, 52.0, 81.0, 69.0, 57.0, 48.0, 33.0, 93.0, 97.0, 38.0, 79.0, 936.0, 138.0, 224.0, 42.0, 123.0, 46.0, 216.0, 42.0, 227.0, 80.0, 52.0, 37.0, 62.0, 291.0, 42.0, 46.0, 53.0, 36.0, 35754.0, 37.0, 32.0, 37.0, 40.0, 34.0, -2.0, 3.0, 2.0, -3.0, 0.0, -2.0, 4.0, -7.0, -7.0, 1.0, -3.0, -8.0, -5.0, -3.0, -2.0, -2.0, -7.0, -1.0, -3.0, -7.0, -9.0, -6.0, -7.0, -10.0, 1.0, -2.0, 0.0, 1.0, -6.0, -2.0, -2.0, 2.0, -4.0, -5.0, 1.0, -4.0]
test-plate-2  :  96
[52.0, 100.0, 52.0, 81.0, 57.0, 79.0, 47.0, 49.0, 43.0, 39.0, 44.0, 42.0, 45.0, 48.0, 49.0, 70.0, 45.0, 44.0, 48.0, 46.0, 201.0, 39.0, 71.0, 60.0, 64.0, 67.0, 64.0, 78.0, 69.0, 55.0, 56.0, 46.0, 79.0, 69.0, 43.0, 59.0, 774.0, 94.0, 90.0, 48.0, 76.0, 51.0, 93.0, 49.0, 83.0, 63.0, 54.0, 42.0, 59.0, 100.0, 42.0, 50.0, 46.0, 38.0, 70.0, 42.0, 38.0, 41.0, 40.0, 43.0, 5.0, 0.0, 9.0, 7