# Organize samples
We leverage the sample tables from CCLE, Sanger, and DepMap to make sure we have a consistent set of samples to work with. We export a file containing the cell line names used by the Broad, CCLE, and Sanger (`formatted/cell-line-names.raw.txt`), which is then manually checked and saved as `formatted/cell-line-names.formatted.txt`.

In [1]:
import pandas as pd

# Sample annotation tables from Sanger (GDSC model list) and CCLE.
gdsc_model_info = pd.read_csv('source/gdsc/model_list_20200204.csv')
ccle_model_info = pd.read_csv('source/ccle-2019/data_clinical_sample.txt', sep='\t', comment='#')

# Series mapping old CCLE IDs (index) to their new CCLE IDs (values), read from
# the "Cell line name changes" sheet.
# NOTE(review): only the first 45 rows of the sheet are used -- presumably the
# remaining rows are not part of the mapping; confirm against the spreadsheet.
fibroblast_maps = (
    pd.read_excel('source/ccle-2019/41586_2019_1186_MOESM4_ESM.xlsx',
                  sheet_name='Cell line name changes')
    .iloc[:45, :]
    .set_index('old_CCLE_ID')
    .loc[:, 'new_CCLE_ID']
)

# Sanger model_id -> CCLE ID, with old CCLE IDs replaced by their new ones.
# The replacement is applied to the final Series rather than through
# `frame[col].replace(..., inplace=True)`: an in-place call on a selected column
# is chained mutation, which warns in pandas 2.x and does not update the parent
# frame at all under Copy-on-Write.
gdsc_to_ccle = (
    gdsc_model_info.loc[:, ['model_id', 'CCLE_ID']]
    .dropna()
    .set_index('model_id')['CCLE_ID']
    .replace(fibroblast_maps)
)

depmap = pd.read_csv('source/depmap/sample_info.csv')
# Assign the result back for the same reason as above.
depmap['CCLE_Name'] = depmap['CCLE_Name'].replace(fibroblast_maps)

# Add the (Sanger ID -> CCLE name) pairs from DepMap whose Sanger ID is not
# already a key of the GDSC-derived mapping.
depmap_maps = depmap.loc[:, ['CCLE_Name', 'Sanger_Model_ID']].dropna()
not_in_gdsc = ~depmap_maps['Sanger_Model_ID'].isin(gdsc_to_ccle.index)
depmap_maps = depmap_maps[not_in_gdsc].set_index('Sanger_Model_ID')['CCLE_Name']
gdsc_to_ccle = pd.concat([gdsc_to_ccle, depmap_maps])

## Generate unique samples

In [2]:
# Sorted, de-duplicated identifiers from each source table.
ccle_unique = (
    ccle_model_info['SAMPLE_ID']
    .replace(fibroblast_maps)
    .drop_duplicates()
    .sort_values()
)
gdsc_unique = (
    gdsc_model_info['model_id']
    .drop_duplicates()
    .sort_values()
)
broad_unique = (
    depmap['DepMap_ID']
    .drop_duplicates()
    .sort_values()
)

# DepMap IDs listed in the CCLE table that do not occur in the DepMap table.
ccle_depmap_ids = ccle_model_info['DEPMAPID'].dropna()
other_broad = ccle_depmap_ids[~ccle_depmap_ids.isin(broad_unique)]

## Concat by sample name type

In [3]:
# Pool the Broad/DepMap identifiers reported by the three source tables.
broad_sources = [
    depmap['DepMap_ID'],
    ccle_model_info['DEPMAPID'],
    gdsc_model_info['BROAD_ID'],
]
broad_ids = (
    pd.concat(broad_sources)
    .dropna()
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)

# Keep only identifiers that are exactly 10 characters long.
# NOTE(review): presumably this filters out malformed or multi-valued entries;
# confirm against the source tables.
is_ten_chars = broad_ids.str.len().eq(10)
broad_ids = broad_ids[is_ten_chars].reset_index(drop=True)

# Empty (all '') lookup frame indexed by Broad ID.
all_broad = pd.DataFrame('', index=broad_ids, columns=['ccle', 'sanger'])

In [4]:
# Pool the CCLE names reported by the three source tables, after replacing old
# CCLE IDs with their new ones.
ccle_sources = [
    ccle_model_info['SAMPLE_ID'],
    gdsc_model_info['CCLE_ID'],
    depmap['CCLE_Name'],
]
ccle_names_pooled = (
    pd.concat([names.replace(fibroblast_maps) for names in ccle_sources])
    .dropna()
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)

# Empty (all '') lookup frame indexed by CCLE name.
all_ccle = pd.DataFrame('', index=ccle_names_pooled, columns=['sanger', 'broad'])

In [5]:
# Pool the Sanger model IDs reported by the GDSC and DepMap tables.
sanger_sources = [gdsc_model_info['model_id'], depmap['Sanger_Model_ID']]
sanger_ids = (
    pd.concat(sanger_sources)
    .dropna()
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)

# Empty (all '') lookup frame indexed by Sanger model ID.
all_sanger = pd.DataFrame('', index=sanger_ids, columns=['ccle', 'broad'])

In [6]:
def _with_current_ccle_names(frame):
    """Return a copy of *frame* whose `ccle_name` column has old CCLE IDs replaced by new ones."""
    return frame.assign(ccle_name=frame['ccle_name'].replace(fibroblast_maps))


# Bring the three source tables onto shared column names
# (`broad`, `ccle_name`, `sanger`) so they can be stacked.
depmap_formatted = _with_current_ccle_names(
    depmap[['DepMap_ID', 'CCLE_Name', 'Sanger_Model_ID']].rename(
        columns={'CCLE_Name': 'ccle_name', 'Sanger_Model_ID': 'sanger', 'DepMap_ID': 'broad'}
    )
)

ccle_formatted = _with_current_ccle_names(
    ccle_model_info[['SAMPLE_ID', 'DEPMAPID']].rename(
        columns={'SAMPLE_ID': 'ccle_name', 'DEPMAPID': 'broad'}
    )
)

sanger_formatted = _with_current_ccle_names(
    gdsc_model_info[['model_id', 'CCLE_ID', 'BROAD_ID']].rename(
        columns={'model_id': 'sanger', 'CCLE_ID': 'ccle_name', 'BROAD_ID': 'broad'}
    )
)

In [7]:
# Stack the DepMap and Sanger tables first ...
depmap_sanger = pd.concat([depmap_formatted, sanger_formatted])

# ... then append the CCLE rows whose name is not already present in that stack.
already_present = ccle_formatted['ccle_name'].isin(depmap_sanger['ccle_name'])
missing_ccle = ccle_formatted.loc[~already_present]
depmap_sanger = pd.concat([depmap_sanger, missing_ccle])

# Keep one row per (broad, ccle_name) pair. Sorting first puts missing values
# last, so within a pair a row that has a Sanger ID is kept over one without.
# NOTE(review): drop_duplicates treats missing values as equal, so rows that
# lack both `broad` and `ccle_name` collapse to a single row -- confirm that
# this is intended.
depmap_sanger = depmap_sanger.sort_values(['broad', 'ccle_name', 'sanger'])
depmap_sanger = depmap_sanger.drop_duplicates(['broad', 'ccle_name'], keep='first')

# Raw name table; the checks further down read a manually curated version of it.
depmap_sanger.to_csv('formatted/cell-line-names.raw.txt', sep='\t', index=False)

## Check

In [9]:
# Load the manually curated name table.
checked = pd.read_csv(
    'formatted/cell-line-names.formatted.txt',
    sep='\t',
)

In [10]:
# A pooled CCLE name counts as covered when it appears in the curated table as
# either the primary name or the alternative name.
pooled_ccle_names = all_ccle.reset_index()['index']
in_primary_ccle = pooled_ccle_names.isin(checked['ccle_name'])
in_alt_ccle = pooled_ccle_names.isin(checked['alt_ccle'])
idx_ccle = in_primary_ccle | in_alt_ccle
idx_ccle.value_counts()

True     1927
False       1
Name: index, dtype: int64

In [11]:
# Show the pooled CCLE names that are absent from the curated table.
all_ccle.reset_index().loc[~idx_ccle, 'index']

1673    SR786_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE;SR786...
Name: index, dtype: object

In [12]:
# A pooled Broad ID counts as covered when it appears in the curated table as
# either the primary ID or the alternative ID.
pooled_broad_ids = all_broad.reset_index()['index']
in_primary_broad = pooled_broad_ids.isin(checked['broad'])
in_alt_broad = pooled_broad_ids.isin(checked['alt_broad'])
idx_broad = in_primary_broad | in_alt_broad
idx_broad.value_counts()

True    1823
Name: index, dtype: int64

In [13]:
# List Broad IDs that occur more than once in the curated table.
broad_counts = checked['broad'].value_counts()
broad_counts[broad_counts.gt(1)]

Series([], Name: broad, dtype: int64)

In [14]:
# Tabulate how many Sanger IDs occur once, twice, ... in the curated table.
sanger_counts = checked['sanger'].value_counts()
sanger_counts.value_counts()

1    1580
Name: sanger, dtype: int64