# Create Sample Sheets and Define Groups/Conditions

In [None]:
import pandas as pd
import numpy as np
import os 

In [None]:
# Root directory of the dataset to process. Keep exactly one DATA_PATH
# assignment active and leave the alternatives commented out.
# DATA_PATH = os.getcwd()

DATA_PATH = '/data/expression_atlas/v1/GSE122459/'
# DATA_PATH = '/data/expression_atlas/v1/GSE110914/'
# DATA_PATH = '/data/expression_atlas/v1/GSE162828/'
# DATA_PATH = '/data/expression_atlas/v1/GSE120178/'
# DATA_PATH = '/data/expression_atlas/v1/GSE102371/'
# DATA_PATH = '/data/expression_atlas/v1/GSE112087/'
# DATA_PATH = '/data/expression_atlas/v1/GSE139358/'

# Output file name: '<last component of DATA_PATH>_metadata.csv'. It has no
# directory part, so it is resolved relative to the working directory.
METADATA_FH = '{}_metadata.csv'.format(DATA_PATH.rstrip('/').rsplit('/', 1)[-1])

# Sample-sheet column that is exported as 'accession' in the metadata file.
SRX_COLUMN = 'experiment_accession'
# SRX_COLUMN = 'run_accession'

# New condition column -> existing sample-sheet column whose cells are
# searched for the keys of SAMPLE_CONDITIONS.
SAMPLE_CONDITION_COLUMNS = {'condition_1': 'sample_description'}

# SAMPLE_CONDITION_COLUMNS = {
#     'condition_1': 'sample_description',
#     'condition_2': 'sample_description',
#     }

# New group column -> existing sample-sheet column whose cells are searched
# for the keys of SAMPLE_GROUPS.
SAMPLE_GROUP_COLUMNS = {'group_1': 'instrument_model'}

# SAMPLE_GROUP_COLUMNS = {

#     }

# # GSE139358
# SAMPLE_CONDITIONS = {
#     'condition_1' : {
#         'LDG_0':'TREAT_1',
#         'NDN_0':'TREAT_1',
#         'LDG_SLE':'TREAT_1',
#         'NDN_SLE':'TREAT_1', 
#         'NDN_ctrl':'CONTROL',
#         },
#     'condition_2' : {
#         'NDN':'A',
#         'LDG':'B',
#         }
#     }

# # GSE112087
# SAMPLE_CONDITIONS = {
#     'condition_1' : {
#         'MON_SLE':'TREAT_1', 
#         'MS_NORM':'CONTROL',
#         }
#     }

# GSE122459
# Per condition column: {key searched for in the source cell: label written
# to the condition column}. Keys are applied in the order listed.
SAMPLE_CONDITIONS = {
    'condition_1': {
        'SLE': 'TREAT_1',
        'healthy': 'CONTROL',
    },
}

# # GSE110914
# SAMPLE_CONDITIONS = {
#     'condition_1' : {
#         'T1D':'TREAT_2',
#         'preT1D':'TREAT_1', 
#         'HC':'CONTROL',
#         }
#     }

# # GSE162828
# SAMPLE_CONDITIONS = {
#     'condition_1': {
#         'SLE':'TREAT_1',
#         'Healthy':'CONTROL',
#         }
#     }

# # GSE120178
# SAMPLE_CONDITIONS = {
#     'condition_1' : {
#         'A':'TREAT_1',
#         'E':'TREAT_1',
#         'L':'TREAT_1',
#         'B':'CONTROL',  
#         }
#     }

# # GSE102371
# SAMPLE_CONDITIONS = {
#   'condition_1' : {
#     'islet preparation from normo':'CONTROL',
#     'islet preparation from long':'TREAT_1',
#     'islet preparation from short':'TREAT_1',
#         }
#     }

# GSE122459
# Per group column: {key searched for in the source cell: label written to
# the group column}. Keys are applied in the order listed.
SAMPLE_GROUPS = {
    'group_1': {
        'Illumina HiSeq 2500': 'A',
        'NextSeq 500': 'B',
    },
}

# SAMPLE_GROUPS = {

#     }

# Sample-sheet columns carried over into the metadata file, in output order.
KEEP_COLUMNS = [
    'single_end',
    'strandedness',
    'experiment_accession',
    'submission_accession',
    'library_layout',
    'library_selection',
    'library_source',
    'library_strategy',
    'library_name',
    'instrument_model',
    'instrument_platform',
    'read_count',
    'tax_id',
    'sample_title',
    'experiment_title',
    'sample_description',
]

def filter_condition(cell: str, condition_key: str) -> bool:
    '''Decide whether a metadata cell matches a key of SAMPLE_CONDITIONS.

    Modify the matching strategy below to suit the dataset; alternative
    strategies are left as comments.

    Args:
        cell (str) contents of single cell from metadata dataframe
        condition_key (str) key to search cell

    Returns:
        (bool) True if the cell matches condition_key. False otherwise,
            including when the cell is not a string (e.g. a missing value
            that pandas read as NaN).
    '''
    # A non-string cell cannot match any key; report "no match" instead of
    # failing with AttributeError on .startswith.
    if not isinstance(cell, str):
        return False
    return cell.startswith(condition_key)
    # return condition_key in cell
    # return cell.endswith(condition_key)

def filter_group(cell: str, group_key: str) -> bool:
    '''Decide whether a metadata cell matches a key of SAMPLE_GROUPS.

    Modify the matching strategy below to suit the dataset; alternative
    strategies are left as comments.

    Args:
        cell (str) contents of single cell from metadata dataframe
        group_key (str) key to search cell

    Returns:
        (bool) True if the cell matches group_key. False otherwise,
            including when the cell is not a string (e.g. a missing value
            that pandas read as NaN).
    '''
    # A non-string cell cannot match any key; report "no match" instead of
    # failing with AttributeError on .startswith.
    if not isinstance(cell, str):
        return False
    return cell.startswith(group_key)
    # return group_key in cell
    # return cell.endswith(group_key)

In [None]:
# Load the sample sheet written by the nf-core RNA-seq run under DATA_PATH.
samplesheet_valid = pd.read_csv(
    os.path.join(DATA_PATH, 'rnaseq_output/pipeline_info/samplesheet.valid.csv')
)

# Show the whole table, then the raw sample descriptions, for inspection.
print(samplesheet_valid.to_string())
print(samplesheet_valid['sample_description'].tolist())


In [None]:
# Match sample conditions and groups to the keys in SAMPLE_CONDITIONS and SAMPLE_GROUPS.

def _assign_labels(frame, column_map, label_map, matcher):
    '''Add one label column to frame for every entry of column_map.

    Args:
        frame (pd.DataFrame) sample sheet; modified in place
        column_map (dict) new column name -> existing column to search
        label_map (dict) new column name -> {search key: label to write}
        matcher (callable) matcher(cell, key) -> bool deciding a match

    Raises:
        ValueError if any row is left without a label in a new column
    '''
    for target, source in column_map.items():
        # Create an object column up front: writing string labels into the
        # float64 column produced by a bare np.nan is deprecated by pandas.
        frame[target] = pd.Series(np.nan, index=frame.index, dtype=object)
        for key, label in label_map[target].items():
            # Keys are applied in dict order, so on rows matching several
            # keys the label of the last matching key is kept.
            matched = frame[source].map(lambda cell, key=key: matcher(cell, key))
            frame.loc[matched, target] = label
        unmatched = frame.loc[frame[target].isnull(), source]
        if not unmatched.empty:
            # Raise explicitly (rather than assert) so the check is not
            # stripped under `python -O` and reports the offending values.
            raise ValueError(
                'No key for %r matched these %r values: %s'
                % (target, source, unmatched.unique().tolist())
            )


_assign_labels(samplesheet_valid, SAMPLE_CONDITION_COLUMNS, SAMPLE_CONDITIONS, filter_condition)
_assign_labels(samplesheet_valid, SAMPLE_GROUP_COLUMNS, SAMPLE_GROUPS, filter_group)


In [None]:
# Push metadata to csv.

# Column order: accession, condition columns, group columns, then the
# remaining KEEP_COLUMNS.
metadata = samplesheet_valid[
    [SRX_COLUMN]
    + list(SAMPLE_CONDITIONS)
    + list(SAMPLE_GROUPS)
    + [name for name in KEEP_COLUMNS if name != SRX_COLUMN]
].copy()
metadata = metadata.rename(columns={SRX_COLUMN: 'accession'})

# The nf-core rnaseq sample sheet lists samples per SRR*, while the analysis
# groups them per SRX*; collapse the rows that are identical after selection.
metadata = metadata.drop_duplicates()

# Group columns holding a single distinct value are dropped. Only columns
# whose name starts with 'group' are considered; condition columns are
# always kept.
single_groups = [
    name
    for name in metadata.columns
    if name.startswith('group') and metadata[name].nunique() == 1
]
metadata = metadata.drop(columns=single_groups)

metadata.to_csv(METADATA_FH, index=False)
metadata