In [33]:
from pathlib import Path
import pandas as pd
import tidytcells as tt

In [34]:
# Input and output locations for the MIRA data: the raw CSV exports are read
# from `raw_dir`, and the cleaned tables are written to `preprocessed_dir`.
# NOTE(review): both are absolute paths specific to one machine, so anyone else
# running this notebook has to edit them; consider deriving both from a single
# configurable data directory.
raw_dir = Path("/home/yutanagano/UCLOneDrive/MBPhD/projects/tcr_embedder/data/raw/mira")
preprocessed_dir = Path(
    "/home/yutanagano/UCLOneDrive/MBPhD/projects/tcr_embedder/data/preprocessed/mira"
)

## Load data

In [35]:
# Read the two raw peptide-detail tables from `raw_dir`. They are processed
# side by side as `ci` and `cii` until the "Combine" step.
# NOTE(review): presumably "ci"/"cii" stand for MHC class I / class II — confirm.
ci = pd.read_csv(raw_dir / "peptide-detail-ci.csv")
cii = pd.read_csv(raw_dir / "peptide-detail-cii.csv")

### Data summary

In [36]:
# Number of rows in each raw table.
ci_size, cii_size = len(ci), len(cii)

# Number of distinct "Amino Acids" entries in each raw table.
ci_groups, cii_groups = (
    table["Amino Acids"].nunique() for table in (ci, cii)
)

print(
    f"""
    Data sizes:
    ci: {ci_size}
    cii:{cii_size}

    Group sizes:
    ci: {ci_groups}
    cii:{cii_groups}
    """
)


    Data sizes:
    ci: 154320
    cii:6809

    Group sizes:
    ci: 269
    cii:56
    


## Clean data

In [37]:
# Preview the first rows of `ci` to see the raw column layout before cleaning.
ci.head()

Unnamed: 0,TCR BioIdentity,TCR Nucleotide Sequence,Experiment,ORF Coverage,Amino Acids,Start Index in Genome,End Index in Genome
0,CASSAQGTGDRGYTF+TCRBV27-01+TCRBJ01-02,GAGTCGCCCAGCCCCAACCAGACCTCTCTGTACTTCTGTGCCAGCA...,eAV93,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073
1,CASSLVATGNTGELFF+TCRBV07-09+TCRBJ02-02,CGCACAGAGCAGGGGGACTCGGCCATGTATCTCTGTGCCAGCAGCT...,eOX56,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073
2,CASSKGTVSGLSG+TCRBV21-01+TCRBJ02-07,GAGATCCAGTCCACGGAGTCAGGGGACACAGCACTGTATTTCTGTG...,eAV93,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073
3,CALKVGADTQYF+TCRBV30-01+TCRBJ02-03,CTGAGTTCTAAGAAGCTCCTTCTCAGTGACTCTGGCTTCTATCTCT...,eQD124,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073
4,CASSLWASGRGGTGELFF+TCRBV27-01+TCRBJ02-02,AGCCCCAACCAGACCTCTCTGTACTTCTGTGCCAGCAGTTTATGGG...,eAV93,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073


In [38]:
# Preview the first rows of `cii` to see the raw column layout before cleaning.
cii.head()

Unnamed: 0,TCR BioIdentity,TCR Nucleotide Sequence,Experiment,ORF Coverage,Amino Acids,Start Index in Genome,End Index in Genome
0,CASSGSSQGPLWETQYF+TCRBV05-06+TCRBJ02-05,TTGTTGCTGGGGGACTCGGCCCTCTATCTCTGTGCCAGCAGCGGGT...,eLH58,surface glycoprotein,"KVFRSSVLHSTQDLFLPFF,MFVFLVLLPLVSSQCVNLT,NLTTRT...",21563,21739
1,CARRAADTQYF+TCRBV02-01+TCRBJ02-03,NCTCTGAAGATCCGGTCCACAAAGCTGGAGGACTCAGCCATGTACT...,eNL187,surface glycoprotein,"KVFRSSVLHSTQDLFLPFF,MFVFLVLLPLVSSQCVNLT,NLTTRT...",21563,21739
2,CATSRGVSGYTF+TCRBV19-01+TCRBJ01-02,CTCACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCTATCTCT...,eNL187,surface glycoprotein,"KVFRSSVLHSTQDLFLPFF,MFVFLVLLPLVSSQCVNLT,NLTTRT...",21563,21739
3,CASSPGTGDYEQYF+TCRBV05-01+TCRBJ02-07,GTGAGCACCTTGGAGCTGGGGGACTCGGCCCTTTATCTTTGCGCCA...,eNL192,surface glycoprotein,"KVFRSSVLHSTQDLFLPFF,MFVFLVLLPLVSSQCVNLT,NLTTRT...",21563,21739
4,CSARGQYSNQPQHF+TCRBV20-X+TCRBJ01-05,GTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTG...,eNL187,surface glycoprotein,"KVFRSSVLHSTQDLFLPFF,MFVFLVLLPLVSSQCVNLT,NLTTRT...",21563,21739


### Reformat

In [39]:
def reformat(df):
    """Split the combined "TCR BioIdentity" field into separate TCR columns.

    Each "TCR BioIdentity" value is split on "+" and the resulting parts are
    stored, in order, as "CDR3B", "TRBV" and "TRBJ".

    Parameters
    ----------
    df : pd.DataFrame
        Table containing at least the columns "TCR BioIdentity" and
        "Amino Acids". It is left unmodified.

    Returns
    -------
    pd.DataFrame
        A new table with the columns "Amino Acids", "CDR3B", "TRBV" and
        "TRBJ" (in that order) on the same index as ``df``.

    Raises
    ------
    ValueError
        If the longest split does not consist of exactly three parts.
    """
    # Vectorised split; a single-character pattern is treated literally, so
    # "+" is not interpreted as a regex quantifier.
    parts = df["TCR BioIdentity"].str.split("+", expand=True)
    if parts.shape[1] != 3:
        raise ValueError(
            'expected "TCR BioIdentity" values of the form CDR3B+TRBV+TRBJ, '
            f"but splitting on '+' produced {parts.shape[1]} column(s)"
        )
    parts.columns = ["CDR3B", "TRBV", "TRBJ"]

    # Build a fresh frame rather than adding columns to the caller's table and
    # returning a slice of it: the input stays untouched, and later column
    # assignments on the result do not write into a view of another frame.
    return pd.concat([df[["Amino Acids"]], parts], axis=1)


# Replace each raw table with its reformatted version (the "Amino Acids"
# column plus the separate CDR3B / TRBV / TRBJ columns).
ci = reformat(ci)
cii = reformat(cii)

### Standardise

In [40]:
# Standardise the TCR columns of both tables in place. Gene symbols go through
# `tt.tcr.standardise` with `enforce_functional=True`; junction sequences go
# through `tt.junction.standardise`.
# NOTE(review): presumably both helpers yield a missing value for entries they
# cannot standardise (the "Drop empty rows" step filters on `notna()`) —
# confirm against the tidytcells documentation.
for _table in (ci, cii):
    for _column, _standardiser in (
        ("TRBV", lambda x: tt.tcr.standardise(x, enforce_functional=True)),
        ("CDR3B", tt.junction.standardise),
        ("TRBJ", lambda x: tt.tcr.standardise(x, enforce_functional=True)),
    ):
        _table[_column] = _table[_column].map(_standardiser)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


### Drop empty rows

In [42]:
def drop_empty_rows(df):
    """Keep only rows with a V gene, a CDR3 and a single plain epitope.

    A row is kept when all of the following hold:

    * "TRBV" is not missing,
    * "CDR3B" is not missing,
    * "Amino Acids" consists solely of upper-case letters A-Z. Values that
      contain anything else (for example a comma-separated list of several
      peptides) and missing values are dropped.

    "TRBJ" is not checked, so rows with a missing J gene are retained.

    Parameters
    ----------
    df : pd.DataFrame
        Table with the columns "TRBV", "CDR3B" and "Amino Acids".

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` that pass the checks, with their original index.
    """
    has_v_gene = df["TRBV"].notna()
    has_junction = df["CDR3B"].notna()
    # na=False turns a missing epitope into an explicit False instead of
    # leaving a NaN in the mask and relying on `&` to coerce it.
    is_single_epitope = df["Amino Acids"].str.match("^[A-Z]+$", na=False)
    return df[has_v_gene & has_junction & is_single_epitope]


# Filter both tables down to rows that pass the `drop_empty_rows` checks.
ci = drop_empty_rows(ci)
cii = drop_empty_rows(cii)

### Rename and reorder columns

In [43]:
def rename_and_reorder_columns(df: pd.DataFrame):
    """Return the table in its final column layout with a fresh index.

    The columns are selected in the order "TRBV", "CDR3B", "TRBJ",
    "Amino Acids"; "Amino Acids" is then renamed to "Epitope" and the index
    is replaced by a default 0..n-1 range. The input table is not modified.
    """
    column_order = ["TRBV", "CDR3B", "TRBJ", "Amino Acids"]
    return (
        df.loc[:, column_order]
        .rename(columns={"Amino Acids": "Epitope"})
        .reset_index(drop=True)
    )


# Bring both tables into the final TRBV / CDR3B / TRBJ / Epitope layout.
ci = rename_and_reorder_columns(ci)
cii = rename_and_reorder_columns(cii)

### Combine

In [44]:
# Stack the two cleaned tables into one; `ignore_index=True` renumbers the
# rows 0..n-1 so the combined index is unique.
combined = pd.concat((ci, cii), ignore_index=True)

In [45]:
# Row count and number of distinct epitopes after cleaning and combining.
combined_size, combined_groups = len(combined), combined["Epitope"].nunique()

print(
    f"""
    Combined size:      {combined_size}
    Combined group size:{combined_groups}
    """
)


    Combined size:      23132
    Combined group size:148
    


In [74]:
# Keep only epitopes that have more than 200 rows, then sort by epitope and
# TCR columns and renumber the index from 0. The validation split further down
# draws 100 rows per epitope, so this threshold leaves every retained epitope
# with more than 100 rows for training.
filtered = (
    combined.groupby("Epitope")
    .filter(lambda x: len(x) > 200)
    .sort_values(by=["Epitope", "TRBV", "CDR3B", "TRBJ"], ignore_index=True)
)

## Save preprocessed data

In [75]:
# Write the filtered table (without the index column) to `preprocessed_dir`.
filtered.to_csv(preprocessed_dir / "filtered.csv", index=False)

## Separate into training and validation

In [83]:
# Draw 100 rows per epitope (fixed seed) as the validation set; every row of
# `filtered` that was not drawn becomes training data. The membership test
# works on index labels, which are unique because `filtered` was built with
# `ignore_index=True`.
valid = filtered.groupby("Epitope").sample(100, random_state=420)
train = filtered[~filtered.index.isin(valid.index)]

In [86]:
# Write the per-epitope split (without the index column) to `preprocessed_dir`.
valid.to_csv(preprocessed_dir / "valid.csv", index=False)
train.to_csv(preprocessed_dir / "train.csv", index=False)

## Separate into training and validation (alternative: random 10% split of the combined data)

In [16]:
# NOTE(review): this cell and the ones after it carry older execution counts
# (In [16] / In [17]) than the per-epitope split earlier in the notebook
# (In [83] / In [86]) and operate on `combined` rather than `filtered`. It looks
# like a superseded approach — confirm whether it should still be run.

# Fraction of all combined rows to reserve for validation, and the resulting
# row count (truncated towards zero by `int`).
valid_frac = 0.1
valid_rows = int(valid_frac * len(combined))

# Shuffle every row with a fixed seed so the split below is reproducible.
shuffled = combined.sample(frac=1, random_state=420)


def get_clean_ordered_subset(df: pd.DataFrame):
    """Drop singleton epitopes and return the remaining rows in sorted order.

    Only rows whose "Epitope" value occurs at least twice in ``df`` are kept.
    The result is sorted by "Epitope", "TRBV", "CDR3B" and "TRBJ" and carries
    a fresh 0..n-1 index.
    """

    def occurs_at_least_twice(epitope_rows):
        return len(epitope_rows) >= 2

    sort_keys = ["Epitope", "TRBV", "CDR3B", "TRBJ"]
    repeated = df.groupby("Epitope").filter(occurs_at_least_twice)
    return repeated.sort_values(by=sort_keys, ignore_index=True)


# The first `valid_rows` shuffled rows form the validation set and the rest
# the training set; each part is cleaned and sorted independently, so an
# epitope is judged a singleton within its own part.
valid = get_clean_ordered_subset(shuffled.iloc[:valid_rows])
train = get_clean_ordered_subset(shuffled.iloc[valid_rows:])

In [17]:
# NOTE(review): these are the same "train.csv" / "valid.csv" paths that the
# per-epitope split earlier in the notebook writes to. On a top-to-bottom run
# this cell executes last, so the files end up holding the random split instead.
# Confirm which split is intended and remove or rename the other.
train.to_csv(preprocessed_dir / "train.csv", index=False)
valid.to_csv(preprocessed_dir / "valid.csv", index=False)