As we saw in IEDB_analyze, only a few entries have a calculated value but no curated value. That is why we decided to use only the curated values: they are set far more often and are also more trustworthy.

In [4]:
import pandas as pd
import numpy as np

In [7]:
# Directory that holds the IEDB CSV exports (relative to this notebook).
path_prefix = '../../data/IEDB'

# Receptor / epitope / assay columns requested for every export.
_receptor_and_assay_columns = [
    "Receptor - IEDB Receptor ID",
    "Epitope - Name",
    "Epitope - Source Organism",
    "Assay - Type",
    # Possibly interesting because EPIC-TRACE makes use of the allele, see
    # https://github.com/DaniTheOrange/EPIC-TRACE/blob/main/src/construct_long.py
    "Assay - MHC Allele Names",
]

# Curated columns of chain 1.
_chain_1_columns = [
    "Chain 1 - Type",
    "Chain 1 - Curated V Gene",
    "Chain 1 - Curated J Gene",
    "Chain 1 - Protein Sequence",  # a very long sequence
    # The CDR sequences also have start/end properties (curated and
    # calculated), which are integers; they are not selected here.
    "Chain 1 - CDR3 Curated",
    "Chain 1 - CDR1 Curated",
    "Chain 1 - CDR2 Curated",
]

# Curated columns of chain 2 (the only chain requested for the beta export).
_chain_2_columns = [
    "Chain 2 - Type",
    "Chain 2 - Curated V Gene",
    "Chain 2 - Curated J Gene",
    "Chain 2 - Protein Sequence",  # a very long sequence
    "Chain 2 - CDR3 Curated",
    "Chain 2 - CDR1 Curated",
    "Chain 2 - CDR2 Curated",
]

# Column selection for the export containing both chains.
curatedPropertiesAll = (
    _receptor_and_assay_columns + _chain_1_columns + _chain_2_columns
)

# Column selection for the beta-only export (no chain 1 columns).
curatedPropertiesBeta = _receptor_and_assay_columns + _chain_2_columns


# Load the two IEDB exports, reading only the curated columns selected in
# curatedPropertiesAll / curatedPropertiesBeta. `usecols` makes pandas raise
# if one of the requested headers is missing from the CSV.
df_all = pd.read_csv(
    f"{path_prefix}/IEDB_positive_only.csv",
    usecols=curatedPropertiesAll,
)
df_beta = pd.read_csv(
    f"{path_prefix}/IEDB_positive_beta_only.csv",
    usecols=curatedPropertiesBeta,
)


Now a bit of renaming has to be done because we want column names like this:

- TCR_name
-	TRAV
-	TRAJ
-	TRA_CDR3
-	TRBV
-	TRBJ
-	TRB_CDR3
-	TRAC
-	TRBC
-	TRA_leader
-	TRB_leader
-	Linker
-	Link_order
-	TRA_5_prime_seq
-	TRA_3_prime_seq
-	TRB_5_prime_seq
-	TRB_3_prime_seq
-	Epitope
-	Score_TRA
-	Score_TRB


In [9]:
# Translate the IEDB export headers into the short target column names.
# Columns that are not listed keep their original header.

# Receptor id plus the beta chain (IEDB "Chain 2"); present in both frames.
_beta_renames = {
    'Receptor - IEDB Receptor ID': 'TCR_name',
    'Chain 2 - Curated V Gene': 'TRBV',
    'Chain 2 - Curated J Gene': 'TRBJ',
    'Chain 2 - CDR3 Curated': 'TRB_CDR3',
}

# Alpha chain (IEDB "Chain 1"); only present in the frame with both chains.
_alpha_renames = {
    'Chain 1 - Curated V Gene': 'TRAV',
    'Chain 1 - Curated J Gene': 'TRAJ',
    'Chain 1 - CDR3 Curated': 'TRA_CDR3',
}

# Both frames are modified in place, so the existing objects are updated.
df_all.rename(columns={**_alpha_renames, **_beta_renames}, inplace=True)

## Same for beta only
df_beta.rename(columns=_beta_renames, inplace=True)

# Not referenced by any code in this part of the notebook; kept in case a
# later cell relies on it.
new_column_names = []

## Columns that have no source in the IEDB data. They are added filled with
## np.nan, and every column not listed in the keep-lists below is dropped on
## export.
column_names = [
    'TRAC',
    'TRBC',
    'TRA_leader',
    'TRB_leader',
    'Linker',
    'Link_order',
    'TRA_5_prime_seq',
    'TRA_3_prime_seq',
    'TRB_5_prime_seq',
    'TRB_3_prime_seq',
]

# Export layout for the frame with both chains: the NaN placeholder columns
# first, followed by the renamed alpha/beta columns and the receptor id.
columns_to_keep = [
    *column_names,  # nan columns
    'TRAV', 'TRAJ', 'TRA_CDR3',  # renamed alpha columns
    'TRBV', 'TRBJ', 'TRB_CDR3',  # renamed beta columns
    'TCR_name',
]

# Export layout for the beta-only frame: same as above without the alpha
# chain columns.
columns_to_keep_beta = [
    *column_names,  # nan columns
    'TRBV', 'TRBJ', 'TRB_CDR3',  # renamed beta columns
    'TCR_name',
]

# Add every placeholder column to both frames, filled with np.nan.
_nan_placeholders = dict.fromkeys(column_names, np.nan)
df_all = df_all.assign(**_nan_placeholders)
df_beta = df_beta.assign(**_nan_placeholders)

# Write only the selected columns, in the order given by the keep-lists,
# without the pandas index.
df_all.to_csv(
    '../../data/IEDB/IEDB_positive_stitchr_all.csv',
    columns=columns_to_keep,
    index=False,
)
df_beta.to_csv(
    '../../data/IEDB/IEDB_positive_stitchr_beta.csv',
    columns=columns_to_keep_beta,
    index=False,
)
