In [48]:
from pathlib import Path
import pandas as pd
import tidytcells as tt

In [49]:
# Data root: the "tcr_data" folder two levels above the current working directory.
data_dir = Path.cwd().parents[1].joinpath("tcr_data")

# Input (raw) and output (preprocessed) locations for the "mira" dataset.
raw_dir = data_dir.joinpath("raw", "mira")
preprocessed_dir = data_dir.joinpath("preprocessed", "mira")

## Load data

In [50]:
# Read the two raw peptide-detail tables from the raw data directory.
ci_path = raw_dir / "peptide-detail-ci.csv"
cii_path = raw_dir / "peptide-detail-cii.csv"
ci, cii = pd.read_csv(ci_path), pd.read_csv(cii_path)

### Data summary

In [51]:
# Row counts of each raw table.
ci_size, cii_size = len(ci), len(cii)

# Number of distinct "Amino Acids" values in each raw table.
ci_groups, cii_groups = (frame["Amino Acids"].nunique() for frame in (ci, cii))

print(
    f"""
    Data sizes:
    ci: {ci_size}
    cii:{cii_size}

    Group sizes:
    ci: {ci_groups}
    cii:{cii_groups}
    """
)


    Data sizes:
    ci: 154320
    cii:6809

    Group sizes:
    ci: 269
    cii:56
    


## Clean data

In [52]:
# Preview the first rows of the raw `ci` table to inspect its columns.
ci.head()

Unnamed: 0,TCR BioIdentity,TCR Nucleotide Sequence,Experiment,ORF Coverage,Amino Acids,Start Index in Genome,End Index in Genome
0,CASSAQGTGDRGYTF+TCRBV27-01+TCRBJ01-02,GAGTCGCCCAGCCCCAACCAGACCTCTCTGTACTTCTGTGCCAGCA...,eAV93,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073
1,CASSLVATGNTGELFF+TCRBV07-09+TCRBJ02-02,CGCACAGAGCAGGGGGACTCGGCCATGTATCTCTGTGCCAGCAGCT...,eOX56,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073
2,CASSKGTVSGLSG+TCRBV21-01+TCRBJ02-07,GAGATCCAGTCCACGGAGTCAGGGGACACAGCACTGTATTTCTGTG...,eAV93,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073
3,CALKVGADTQYF+TCRBV30-01+TCRBJ02-03,CTGAGTTCTAAGAAGCTCCTTCTCAGTGACTCTGGCTTCTATCTCT...,eQD124,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073
4,CASSLWASGRGGTGELFF+TCRBV27-01+TCRBJ02-02,AGCCCCAACCAGACCTCTCTGTACTTCTGTGCCAGCAGTTTATGGG...,eAV93,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073


In [53]:
# Preview the first rows of the raw `cii` table to inspect its columns.
cii.head()

Unnamed: 0,TCR BioIdentity,TCR Nucleotide Sequence,Experiment,ORF Coverage,Amino Acids,Start Index in Genome,End Index in Genome
0,CASSGSSQGPLWETQYF+TCRBV05-06+TCRBJ02-05,TTGTTGCTGGGGGACTCGGCCCTCTATCTCTGTGCCAGCAGCGGGT...,eLH58,surface glycoprotein,"KVFRSSVLHSTQDLFLPFF,MFVFLVLLPLVSSQCVNLT,NLTTRT...",21563,21739
1,CARRAADTQYF+TCRBV02-01+TCRBJ02-03,NCTCTGAAGATCCGGTCCACAAAGCTGGAGGACTCAGCCATGTACT...,eNL187,surface glycoprotein,"KVFRSSVLHSTQDLFLPFF,MFVFLVLLPLVSSQCVNLT,NLTTRT...",21563,21739
2,CATSRGVSGYTF+TCRBV19-01+TCRBJ01-02,CTCACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCTATCTCT...,eNL187,surface glycoprotein,"KVFRSSVLHSTQDLFLPFF,MFVFLVLLPLVSSQCVNLT,NLTTRT...",21563,21739
3,CASSPGTGDYEQYF+TCRBV05-01+TCRBJ02-07,GTGAGCACCTTGGAGCTGGGGGACTCGGCCCTTTATCTTTGCGCCA...,eNL192,surface glycoprotein,"KVFRSSVLHSTQDLFLPFF,MFVFLVLLPLVSSQCVNLT,NLTTRT...",21563,21739
4,CSARGQYSNQPQHF+TCRBV20-X+TCRBJ01-05,GTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTG...,eNL187,surface glycoprotein,"KVFRSSVLHSTQDLFLPFF,MFVFLVLLPLVSSQCVNLT,NLTTRT...",21563,21739


### Reformat

In [54]:
def reformat(df):
    """Split "TCR BioIdentity" into separate CDR3B / TRBV / TRBJ columns.

    Each "TCR BioIdentity" value is split on "+" into exactly three fields,
    which become the "CDR3B", "TRBV" and "TRBJ" columns (in that order).

    Parameters
    ----------
    df : pd.DataFrame
        Frame with at least the columns "TCR BioIdentity" and "Amino Acids".
        It is NOT modified; a new frame is returned.

    Returns
    -------
    pd.DataFrame
        New frame with columns ["Amino Acids", "CDR3B", "TRBV", "TRBJ"] and
        the same index as ``df``.

    Raises
    ------
    ValueError
        If the split does not yield exactly three fields.

    Notes
    -----
    Missing "TCR BioIdentity" values propagate as missing values in the three
    new columns instead of raising.
    """
    # Vectorised split; a single-character pattern is treated as a literal,
    # so "+" is not interpreted as a regular expression.
    parts = df["TCR BioIdentity"].str.split("+", expand=True)
    parts.columns = ["CDR3B", "TRBV", "TRBJ"]

    # Build a fresh frame so the caller's DataFrame never gains extra columns
    # and later column assignments do not target a slice of the input.
    return pd.concat([df[["Amino Acids"]], parts], axis=1)


# Apply the same reformatting to both tables (ci first, then cii).
ci, cii = (reformat(frame) for frame in (ci, cii))

### Standardise

In [55]:
def standardise_tcr_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with its TCR and epitope columns standardised.

    The columns "TRBV" and "TRBJ" are mapped through ``tt.tr.standardise``
    (called with ``enforce_functional=True``), "CDR3B" through
    ``tt.junction.standardise`` and "Amino Acids" through
    ``tt.aa.standardize``. Column order and index are preserved, and ``df``
    itself is not modified.
    """

    def standardise_gene(symbol):
        return tt.tr.standardise(symbol, enforce_functional=True)

    # NOTE(review): the head() previews show "Amino Acids" values that are
    # comma-separated peptide lists. Presumably tt.aa.standardize rejects such
    # strings, so those rows would be removed by the later dropna step --
    # confirm that this is intended.
    return df.assign(
        **{
            "TRBV": df["TRBV"].map(standardise_gene),
            "CDR3B": df["CDR3B"].map(tt.junction.standardise),
            "TRBJ": df["TRBJ"].map(standardise_gene),
            "Amino Acids": df["Amino Acids"].map(tt.aa.standardize),
        }
    )


# One shared helper replaces the two copy-pasted per-table blocks, and assign()
# avoids setting columns in place on a frame produced by column selection.
ci = standardise_tcr_columns(ci)
cii = standardise_tcr_columns(cii)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


### Drop empty rows

In [56]:
# Remove every row that has a missing value in any column.
ci = ci.dropna(axis="index", how="any")
cii = cii.dropna(axis="index", how="any")

### Rename and reorder columns

In [57]:
def rename_and_reorder_columns(df: pd.DataFrame):
    """Select the TCR/epitope columns in canonical order with a fresh index.

    Returns a new frame with columns ["TRBV", "CDR3B", "TRBJ", "Epitope"],
    where "Epitope" is the input's "Amino Acids" column, and with the index
    reset to 0..n-1. Any other columns of ``df`` are dropped.
    """
    column_order = ["TRBV", "CDR3B", "TRBJ", "Amino Acids"]
    return (
        df.loc[:, column_order]
        .reset_index(drop=True)
        .rename(columns={"Amino Acids": "Epitope"})
    )


# Bring both tables into the shared column layout (ci first, then cii).
ci, cii = map(rename_and_reorder_columns, (ci, cii))

### Combine

In [58]:
# Stack ci on top of cii and renumber the rows from 0.
combined = pd.concat([ci, cii], axis=0, ignore_index=True)

In [59]:
# Total number of rows and number of distinct epitopes in the combined table.
combined_size = combined.shape[0]
combined_groups = combined["Epitope"].nunique()

print(
    f"""
    Combined size:      {combined_size}
    Combined group size:{combined_groups}
    """
)


    Combined size:      44629
    Combined group size:149
    


In [60]:
# For every row, the number of rows in `combined` that share its epitope.
rows_per_epitope = combined["Epitope"].map(combined["Epitope"].value_counts())

# Keep only epitopes with more than 200 rows, then sort rows by epitope and
# TCR fields and renumber the index from 0.
filtered = combined.loc[rows_per_epitope > 200].sort_values(
    by=["Epitope", "TRBV", "CDR3B", "TRBJ"], ignore_index=True
)

## Save preprocessed data

In [61]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists; this lets the notebook run end to end on a fresh checkout.
preprocessed_dir.mkdir(parents=True, exist_ok=True)

# Write the filtered table without the row index.
filtered.to_csv(preprocessed_dir / "filtered.csv", index=False)

## Partition data

In [62]:
# Size of the smallest epitope group; every epitope contributes this many
# rows to the test set, so the test set is balanced across epitopes.
epitope_groups = filtered.groupby("Epitope")
minimum_testing_epitope_group_size = epitope_groups.size().min()

# Draw the per-epitope test rows with a fixed seed.
test_data = epitope_groups.sample(
    n=minimum_testing_epitope_group_size,
    random_state=420,
)

# Everything that was not drawn for testing stays available for training.
remainder = filtered.drop(index=test_data.index)

# Shuffle the test rows (fixed seed) so they are no longer grouped by epitope.
test_data = test_data.sample(frac=1, random_state=420)

In [63]:
# Rows left per epitope once the test rows have been taken out (ascending).
epitope_sizes = filtered.groupby("Epitope").size().sort_values()
remainder_group_sizes = epitope_sizes.sub(minimum_testing_epitope_group_size)

# Only epitopes with more than 1000 remaining rows are used for training.
training_epitope_groups = remainder_group_sizes.loc[remainder_group_sizes.gt(1000)]
minimum_training_epitope_group_size = training_epitope_groups.min()

# Restrict the remainder to the training epitopes, then subsample every
# epitope down to the smallest training group size (fixed seed).
is_training_epitope = remainder["Epitope"].isin(training_epitope_groups.index)
training_data_before_subsampling = remainder.loc[is_training_epitope]
training_data = training_data_before_subsampling.groupby("Epitope").sample(
    n=minimum_training_epitope_group_size,
    random_state=420,
)

In [64]:
# Write the test split first, then the training split, without the row index.
for file_name, split in (("test.csv", test_data), ("train.csv", training_data)):
    split.to_csv(preprocessed_dir / file_name, index=False)