# Create Sample Sheets and Define Groups/Conditions
* User input needed:
    * Define functions, condition levels, and group levels in cell below.
    * Define specific samples to drop.

In [None]:
import pandas as pd
import numpy as np
import os 

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


### Define functions, condition levels, and group levels in metadata dataframe.
* Description of required metadata columns to be defined in SAMPLE_CONDITIONS:
    * sample_type_1 -> description of primary designation of cell type, tissue, etc.
        * Ex.:
            * BLOOD, PBMC, SYNOVIUM, ILEUM...
    * sample_type_* -> secondary description of designation of cell type, tissue, etc.
        * Ex.:
            * Subdivided tissues, etc.
    * disease_* -> description of related disease for each sample. 
        * Ex.:
            * SYSTEMIC_LUPUS_ERYTHEMATOSUS, RHEUMATOID_ARTHRITIS, SJOGRENS_SYNDROME, ULCERATIVE_COLITIS, NA (if multiple diseases)...
    * sample_condition_* -> description of condition (HEALTHY/DISEASE) for each sample
        * sample_condition_1 -> primary disease phenotype ex. SLE, RA, T1D, etc.
        * Intentionally partially redundant with the condition_* columns below: the condition_* columns are used to create the design matrix, whereas sample_condition_* is stored as metadata.
    * condition_* -> description of condition (HEALTHY/DISEASE) for each sample
        * condition_1 -> primary disease phenotype, ex. SLE, RA, T1D, T2D, etc.
        * condition_* -> SNPs, drug treatments, etc.
* Description of variables that need to be set in below cell for creating design matrix:
    * SRX_COLUMN -> the SRX accession is usually present in 'experiment_accession' column, but sometimes present in 'run_accession' column depending on project.
    * SAMPLE_CONDITION_COLUMNS -> dictionary with key='condition_name' and value='column to search for different condition levels'.
    * SAMPLE_GROUP_COLUMNS -> similar to above, dictionary with key='group_name' and value='column to search for different group levels'.
    * SAMPLE_CONDITIONS -> dictionary of dictionaries, with outer key='condition_name', inner key='string or pattern to search for in cell', and inner value='condition level to set'.
    * SAMPLE_GROUPS -> dictionary of dictionaries, with outer key='group_name', inner key='string or pattern to search for in cell', and inner value='group level to set'.
* Description of functions that need to be configured in below cell:
    * filter_condition -> described in docstring.
    * filter_group -> described in docstring.

In [None]:
# Run directory; the run name is the last component of the current working
# directory.
DATA_PATH = f"/data/expression_atlas/runs/{os.getcwd().rsplit('/', 1)[-1]}/"

# Results path of the form <DATA_PATH>de_results/<run name>.
RESULTS_PATH = f"{DATA_PATH}de_results/{DATA_PATH.rstrip('/').rsplit('/', 1)[-1]}"

# Make sure the de_results directory exists under the run directory.
if not os.path.exists(DATA_PATH + 'de_results'):
    os.mkdir(DATA_PATH + 'de_results')

# CSV file the final metadata table is written to.
METADATA_FH = f"{DATA_PATH}{os.getcwd().rsplit('/', 1)[-1]}_metadata.csv"

# Samplesheet column holding the SRX accession ('experiment_accession' for
# most projects, 'run_accession' for some).
SRX_COLUMN = 'experiment_accession'

# Example setup for SAMPLE_CONDITION_COLUMNS, SAMPLE_GROUP_COLUMNS, SAMPLE_CONDITIONS, and SAMPLE_GROUPS.

# SAMPLE_CONDITION_COLUMNS = {
#     'condition-1': 'sample_description',
#     'sample_type_1': 'sample_description',
#     'disease_1': 'sample_description',
#     'sample_condition_1': 'sample_description',
#     'sample_condition_2': 'sample_description',
#     }

# SAMPLE_GROUP_COLUMNS = {
#     }

# SAMPLE_CONDITIONS = {
#     'condition-1' : { 
#         'DR': 'DISEASE_1',
#         'DM': 'DISEASE_1',
#         'Control': 'CONTROL',
#         },
#     'sample_type_1' : {
#         '':'PBMC',
#         },
#     'sample_condition_2': {
#         'DR': 'TYPE_2_DIABETES_RETINOPATHY',
#         'DM': 'TYPE_2_DIABETES_MELLITUS',
#         'Control': 'HEALTHY',
#         },
#     'sample_condition_1': {
#         'DR': 'TYPE_2_DIABETES',
#         'DM': 'TYPE_2_DIABETES',
#         'Control': 'HEALTHY',
#         },
#     'disease_1' : {
#         'DR':'TYPE_2_DIABETES',
#         'DM':'TYPE_2_DIABETES',
#         'Control':'TYPE_2_DIABETES',
#         },
#     }


# SAMPLE_GROUPS = {
#     }

# Samplesheet columns carried over unchanged into the metadata file.
KEEP_COLUMNS = [
    # Library / sequencing layout
    'single_end',
    'strandedness',
    # Accessions
    'experiment_accession',
    'submission_accession',
    # Library preparation
    'library_layout',
    'library_selection',
    'library_source',
    'library_strategy',
    'library_name',
    # Instrument
    'instrument_model',
    'instrument_platform',
    # Sample-level details
    'read_count',
    'tax_id',
    'sample_title',
    'experiment_title',
    'sample_description',
]

def filter_condition(cell: str, condition_key: str) -> bool:
    '''Decide whether a metadata cell matches a search key from SAMPLE_CONDITIONS.

    Modify the matching rule below to suit the project's metadata (substring,
    prefix, or suffix match).

    Args:
        cell (str) contents of single cell from metadata dataframe; non-string
            values (e.g. NaN or None for missing metadata) never match
        condition_key (str) key to search cell

    Returns:
        (bool) True if the condition level for this key applies to the cell
    '''
    # Missing values are not strings; `key in cell` would raise TypeError on
    # them, so treat them as "no match" instead.
    if not isinstance(cell, str):
        return False
    # Alternative rules:
    # return cell.startswith(condition_key)
    # return cell.endswith(condition_key)
    return condition_key in cell

def filter_group(cell: str, group_key: str) -> bool:
    '''Decide whether a metadata cell matches a search key from SAMPLE_GROUPS.

    Modify the matching rule below to suit the project's metadata (substring,
    prefix, or suffix match).

    Args:
        cell (str) contents of single cell from metadata dataframe; non-string
            values (e.g. NaN or None for missing metadata) never match
        group_key (str) key to search cell

    Returns:
        (bool) True if the group level for this key applies to the cell
    '''
    # Missing values are not strings; `key in cell` would raise TypeError on
    # them, so treat them as "no match" instead.
    if not isinstance(cell, str):
        return False
    # Alternative rules:
    # return cell.startswith(group_key)
    # return cell.endswith(group_key)
    return group_key in cell

In [None]:
# Load the samplesheet produced by the nf-core RNA-seq run into a dataframe.

samplesheet_valid = pd.read_csv(
    os.path.join(
        DATA_PATH,
        'rnaseq_output/pipeline_info/samplesheet.valid.csv',
    ),
)
samplesheet_valid


In [None]:
# Merge run_table into samplesheet_valid if extra metadata from run_table is needed.

# run_table = pd.read_csv(os.path.join(DATA_PATH, 'rnaseq_output/pipeline_info/SraRunTable.txt'))
# run_table
# samplesheet_valid = samplesheet_valid.merge(
#     run_table.loc[:,['Run',]].rename(
#         {'Run': 'run_accession'},
#         axis=1,
#     ), 
#     on='run_accession',
# )
# samplesheet_valid

### Drop samples from metadata.
* Samples are dropped based on "SRX" id in metadata dataframe.

In [None]:
# Exclude unwanted samples from all further analysis.

# Accessions (values of SRX_COLUMN) of the samples to drop.
samples_to_remove = []

# Index labels of the rows whose accession is listed above.
sample_indices = samplesheet_valid.loc[
    samplesheet_valid[SRX_COLUMN].isin(samples_to_remove)
].index

# Remove those rows from the samplesheet in place.
samplesheet_valid.drop(index=sample_indices, inplace=True)

samplesheet_valid

In [None]:
# Derive every condition and group column by matching the search keys in
# SAMPLE_CONDITIONS / SAMPLE_GROUPS against the source column configured in
# SAMPLE_CONDITION_COLUMNS / SAMPLE_GROUP_COLUMNS.

def _assign_levels(df, level_columns, level_maps, matcher):
    '''Fill one derived column per entry of level_columns, modifying df in place.

    Args:
        df (pd.DataFrame) samplesheet to annotate
        level_columns (dict) derived column name -> source column to search
        level_maps (dict) derived column name -> {search key: level to set}
        matcher (callable) matcher(cell, key) -> bool, e.g. filter_condition

    Raises:
        ValueError if any sample is left without a level in a derived column
    '''
    for target, source in level_columns.items():
        # Start from an all-missing object column so string levels can be
        # assigned without relying on implicit float64 -> object upcasting.
        df[target] = pd.Series(np.nan, index=df.index, dtype=object)
        # Keys are applied in dictionary order; when several keys match the
        # same cell, the last matching key determines the level.
        for key, level in level_maps[target].items():
            hits = df[source].map(lambda cell, key=key: matcher(cell, key))
            df.loc[hits.astype(bool), target] = level
        # Every sample must receive a level. Raise explicitly rather than
        # assert, since assertions are skipped when Python runs with -O.
        unmatched = df[target].isnull()
        if unmatched.any():
            print(target, source, df[target])
            raise ValueError(
                "%d sample(s) matched no key for '%s' in column '%s'; "
                "unmatched values: %s"
                % (
                    int(unmatched.sum()),
                    target,
                    source,
                    df.loc[unmatched, source].unique().tolist(),
                )
            )


_assign_levels(
    samplesheet_valid, SAMPLE_CONDITION_COLUMNS, SAMPLE_CONDITIONS, filter_condition
)
_assign_levels(
    samplesheet_valid, SAMPLE_GROUP_COLUMNS, SAMPLE_GROUPS, filter_group
)


### Save metadata to csv.

In [None]:
# Assemble the metadata table and write it to METADATA_FH.

# Column order: accession, derived condition columns, derived group columns,
# then the retained samplesheet columns.
metadata = samplesheet_valid[
    [SRX_COLUMN]
    + list(SAMPLE_CONDITIONS)
    + list(SAMPLE_GROUPS)
    + [c for c in KEEP_COLUMNS if c != SRX_COLUMN]
].copy()

metadata.rename({SRX_COLUMN: 'accession'}, inplace=True, axis=1)

# Collapse technical replicates into each other by summing read counts across
# runs that share an accession.
if 'read_count' in metadata.columns:
    metadata['read_count'] = metadata.groupby('accession')['read_count'].transform('sum')

# Sample dataframe from output of nf-core rnaseq lists distinct samples by SRR*,
# but groups samples by SRX* for analysis, so rows that are now identical are
# reduced to one.
metadata.drop_duplicates(inplace=True)

# Drop group/condition columns that have only a single level.
# NOTE(review): the previous test was
#     not c.startswith('group') or c.startswith('condition')
# in which the second clause could never change the result, so 'condition*'
# columns were never examined. Both prefixes are now checked, which appears
# to be the intent - confirm that downstream steps do not rely on
# single-level condition columns being present.
single_groups = [
    c for c in metadata.columns
    if c.startswith(('group', 'condition')) and metadata[c].nunique() == 1
]

metadata.drop(single_groups, axis=1, inplace=True)

metadata.to_csv(METADATA_FH, index=False)
metadata