In [1]:
import pathlib
import re

import numpy as np
import pandas as pd
import tqdm.notebook

# 1. Training data

In [2]:
root = pathlib.Path('data/raw/')
files = list(root.glob('*.tsv'))

In [3]:
def is_normal(barcode):
    # Parsing the TCGA sample barcode to find whether tumor (01-09), normal (10-19),
    # or control (20-29). See https://www.biostars.org/p/313063/#313066.
    sample_type = barcode[13:15]
    if sample_type < '10':
        return False
    elif sample_type >= '10' and sample_type < '20':
        return True
    else:
        raise ValueError

In [4]:
full_data = pd.DataFrame()

for file in files:
    data = pd.read_csv(file, sep='\t')
    reshaped = (
        data
        .set_index('gene')
        .stack()
        .unstack('gene')
        .reset_index()
        .rename(columns={'index': 'sample_barcode'})
        .assign(source=file.stem)
    )
    reshaped.columns.rename(None, inplace=True)
    
    full_data = pd.concat([full_data, reshaped])

In [5]:
full_data = (
    full_data
    .assign(cancer=lambda df: df['sample_barcode'].apply(is_normal).astype(int))
)

full_data.head(2)

Unnamed: 0,sample_barcode,A1BG,A1CF,A2BP1,A2LD1,A2ML1,A2M,A4GALT,A4GNT,AAA1,...,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,psiTPTE22,tAKR,source,cancer
0,TCGA-2W-A8YY-01A-11R-A37O-07,71.1123,0.0,0.7153,101.1159,58.6552,2818.2439,114.4492,0.3577,0.0,...,1256.0801,0.3577,740.701,3442.0601,1228.8984,903.0758,2.5036,0.0,CESC,0
1,TCGA-4J-AA1J-01A-21R-A38B-07,27.2619,0.4272,4.2717,110.205,3340.0256,4097.7873,965.8266,2.1358,0.0,...,928.663,15.378,747.1166,4852.6271,643.3148,812.0461,8.1162,0.0,CESC,0


# 2. Test data

## 2.1. Breast cancer single cell gene expression

In [6]:
breast_cancer_df = pd.read_csv(
    'GSE75688_GEO_processed_Breast_Cancer_raw_TPM_matrix.txt.gz', 
    sep='\t'
)

breast_cancer_labels_df = pd.read_csv('GSE75688_final_sample_information.txt.gz', sep='\t')

In [7]:
breast_cancer_clean_labels = (
    breast_cancer_labels_df
    .query('type == "SC"')
    .assign(cancer=lambda df: (df['index'] == 'Tumor').astype(int))
    .rename(columns={'sample': 'sample_barcode'})
    .filter(items=['sample_barcode', 'cancer'])
)

breast_cancer_clean_labels.head(2)

Unnamed: 0,sample_barcode,cancer
0,BC01_02,1
1,BC01_03,1


In [8]:
filtered_breast_cancer_df = (
    breast_cancer_df
    .groupby('gene_name')
    .mean()
    .stack()
    .unstack('gene_name')
    .reset_index()
    .rename(columns={'index': 'sample_barcode'})
    .merge(breast_cancer_clean_labels, how='right', on='sample_barcode')
    .assign(source='breast')
)

filtered_breast_cancer_df.head(2)

Unnamed: 0,sample_barcode,5S_rRNA,7SK,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2ML1-AS1,...,snoZ247,snoZ278,snoZ40,snoZ5,snoZ6,snosnR60_Z15,snosnR66,yR211F11.2,cancer,source
0,BC01_02,0.0,0.0,28.01,0.0,0.0,0.0,0.0,18.13,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,breast
1,BC01_03,0.0,0.0,6.92,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,breast


In [9]:
# Check the frequencies match Kim et al.
filtered_breast_cancer_df['cancer'].value_counts()

1    317
0    198
Name: cancer, dtype: int64

## 2.2. Melanoma single cell gene expression

In [10]:
melanoma_df = pd.read_csv('GSE72056_melanoma_single_cell_revised_v2.txt.gz', 
                          sep='\t')

In [11]:
formatted_melanoma_df = (
    melanoma_df
    # Average to clean the two genes with duplicates (MATCH1 and MATCH2)
    .groupby('Cell')
    .mean()
    .stack()
    .unstack('Cell')
    .reset_index()
    # Rename well-known error due to MS Excel
    .rename(columns={'1-Dec': 'DEC1', 'index': 'sample_barcode'})
    .assign(
        cancer=lambda df: df['malignant(1=no,2=yes,0=unresolved)']
                            .map({1: 0, 2: 1, 0: np.nan}),
        source='melanoma'
    )
    .drop(columns=['tumor', 'malignant(1=no,2=yes,0=unresolved)', 
                   'non-malignant cell type (1=T,2=B,3=Macro.4=Endo.,5=CAF;6=NK)'])
)

formatted_melanoma_df.columns.name = None

formatted_melanoma_df.head(2)

Unnamed: 0,sample_barcode,DEC1,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A4GALT,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,cancer,source
0,Cy72_CD45_H02_S758_comb,0.0,0.0,0.0,0.23266,0.0,0.0,0.39835,0.0,0.0,...,0.0,2.8663,4.2161,1.0786,1.2449,0.0,0.0,0.0,0.0,melanoma
1,CY58_1_CD45_B02_S974_comb,0.0,0.0,0.0,0.21412,0.0,0.0,0.31267,0.0,0.0,...,0.0,0.0,1.3057,1.751,1.06,0.0,0.0,0.0,0.0,melanoma


In [12]:
# Check the frequencies match Kim et al.
formatted_melanoma_df['cancer'].value_counts()

0.0    3256
1.0    1257
Name: cancer, dtype: int64

# 3. Harmonize train and test data

They don't report exactly the same genes. Only ~90% of train genes overlap with the other two.

In [13]:
train_genes = set(full_data.iloc[0, 1:-2].index)
breast_cancer_genes = set(filtered_breast_cancer_df.iloc[0, 1:-1].index)
melanoma_genes = set(formatted_melanoma_df.iloc[0, 1:-1].index)

In [14]:
shared_genes = list(
    train_genes
    .intersection(breast_cancer_genes)
    .intersection(melanoma_genes)
)

len(shared_genes)

18416

In [15]:
train_df = full_data[['sample_barcode', 'source', 'cancer'] + shared_genes]

train_df.head(2)

Unnamed: 0,sample_barcode,source,cancer,C3orf30,TMEM31,FAM57B,ZNF366,NSMCE1,FAM150B,CTSB,...,TRIM55,KRT19,SNORD35B,RASGEF1B,CELA1,PERP,ITGAM,HIBADH,TMEM176A,LAP3
0,TCGA-2W-A8YY-01A-11R-A37O-07,CESC,0,0.0,0.3577,1.7883,6.4378,1175.608,1.073,29436.6953,...,0.0,9517.8827,0.0,254.2918,0.0,7296.495,248.2117,1050.7868,375.8941,2053.2904
1,TCGA-4J-AA1J-01A-21R-A38B-07,CESC,0,0.4272,0.4272,0.5297,12.3879,794.1051,2.563,19197.3516,...,0.4272,54491.6702,0.0,174.7117,0.0,17494.2332,98.6758,811.1918,678.3426,2566.4246


In [16]:
test_df = pd.concat([
    filtered_breast_cancer_df[['sample_barcode', 'source', 'cancer'] + shared_genes],
    formatted_melanoma_df[['sample_barcode', 'source', 'cancer'] + shared_genes]
])

test_df.head(2)

Unnamed: 0,sample_barcode,source,cancer,C3orf30,TMEM31,FAM57B,ZNF366,NSMCE1,FAM150B,CTSB,...,TRIM55,KRT19,SNORD35B,RASGEF1B,CELA1,PERP,ITGAM,HIBADH,TMEM176A,LAP3
0,BC01_02,breast,1.0,0.0,0.0,0.0,0.0,10.15,0.0,93.22,...,0.0,321.25,0.0,0.0,0.0,11.5,0.0,63.86,0.0,47.68
1,BC01_03,breast,1.0,0.0,0.0,0.0,0.0,73.42,0.0,30.79,...,0.0,531.77,0.0,0.0,0.0,53.5,0.0,19.04,0.0,10.73


In [19]:
train_df.to_csv('data/filtered/train.tsv.gz', sep='\t', index=False)
test_df.to_csv('data/filtered/test.tsv.gz', sep='\t', index=False)