## This notebook analyzes the IEDB attributes that are available in both a curated and a calculated variant.

In [4]:
import pandas as pd

# Directory that holds the IEDB export, relative to this notebook.
path_prefix = '../../data/IEDB'
df = pd.read_csv(f"{path_prefix}/IEDB.csv")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" after the summary.
df.info()


  df = pd.read_csv(f"{path_prefix}/IEDB.csv")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211234 entries, 0 to 211233
Data columns (total 29 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   Receptor - IEDB Receptor ID  211234 non-null  int64 
 1   Epitope - Name               211234 non-null  object
 2   Epitope - Source Organism    207297 non-null  object
 3   Assay - Type                 211234 non-null  object
 4   Assay - MHC Allele Names     211111 non-null  object
 5   Chain 1 - Type               57216 non-null   object
 6   Chain 1 - Curated V Gene     38939 non-null   object
 7   Chain 1 - Calculated V Gene  8615 non-null    object
 8   Chain 1 - Curated J Gene     36000 non-null   object
 9   Chain 1 - Calculated J Gene  8455 non-null    object
 10  Chain 1 - Protein Sequence   772 non-null     object
 11  Chain 1 - CDR3 Curated       56773 non-null   object
 12  Chain 1 - CDR3 Calculated    729 non-null     object
 13  Chain 1 - CDR1

Now that the data is loaded into a pandas DataFrame, it is ready to be analyzed. We want to know whether there are entries that have only calculated, but no curated, values.

In [3]:
# Presence masks: True where the curated / calculated variant of a
# Chain 2 attribute holds a value (i.e. is not null).
v_gene_curated_mask = df['Chain 2 - Curated V Gene'].notna()
v_gene_calculated_mask = df['Chain 2 - Calculated V Gene'].notna()

j_gene_curated_mask = df['Chain 2 - Curated J Gene'].notna()
j_gene_calculated_mask = df['Chain 2 - Calculated J Gene'].notna()

cdr3_curated_mask = df['Chain 2 - CDR3 Curated'].notna()
cdr3_calculated_mask = df['Chain 2 - CDR3 Calculated'].notna()

# Per attribute: rows with a curated value, rows with a calculated value,
# rows with both, and rows with a calculated value but no curated one.
v_gene_curated_count = v_gene_curated_mask.sum()
v_gene_calculated_count = v_gene_calculated_mask.sum()
v_gene_curated_and_calculated_count = (v_gene_calculated_mask & v_gene_curated_mask).sum()
v_gene_only_calculated_count = (v_gene_calculated_mask & ~v_gene_curated_mask).sum()

j_gene_curated_count = j_gene_curated_mask.sum()
j_gene_calculated_count = j_gene_calculated_mask.sum()
j_gene_curated_and_calculated_count = (j_gene_calculated_mask & j_gene_curated_mask).sum()
j_gene_only_calculated_count = (j_gene_calculated_mask & ~j_gene_curated_mask).sum()

cdr3_curated_count = cdr3_curated_mask.sum()
cdr3_calculated_count = cdr3_calculated_mask.sum()
cdr3_curated_and_calculated_count = (cdr3_calculated_mask & cdr3_curated_mask).sum()
cdr3_only_calculated_count = (cdr3_calculated_mask & ~cdr3_curated_mask).sum()

# Report every count, one line each, in attribute order (V gene, J gene, CDR3).
_report_rows = (
    ("V Gene curated count:", v_gene_curated_count),
    ("V Gene calculated count:", v_gene_calculated_count),
    ("V Gene curated and calculated count:", v_gene_curated_and_calculated_count),
    ("V Gene only calculated count:", v_gene_only_calculated_count),
    ("J Gene curated count:", j_gene_curated_count),
    ("J Gene calculated count:", j_gene_calculated_count),
    ("J Gene curated and calculated count:", j_gene_curated_and_calculated_count),
    ("J Gene only calculated count:", j_gene_only_calculated_count),
    ("CDR3 curated count:", cdr3_curated_count),
    ("CDR3 calculated count:", cdr3_calculated_count),
    ("CDR3 curated and calculated count:", cdr3_curated_and_calculated_count),
    ("CDR3 only calculated count:", cdr3_only_calculated_count),
)
for _label, _value in _report_rows:
    print(_label, _value)

V Gene curated count: 159649
V Gene calculated count: 12331
V Gene curated and calculated count: 11724
V Gene only calculated count: 607
J Gene curated count: 157016
J Gene calculated count: 12127
J Gene curated and calculated count: 11512
J Gene only calculated count: 615
CDR3 curated count: 180899
CDR3 calculated count: 137692
CDR3 curated and calculated count: 137289
CDR3 only calculated count: 403


There are fewer than 1700 entries that have only calculated values (at most 607 + 615 + 403 = 1625 across V gene, J gene, and CDR3). We drop them because calculated values are not as accurate as curated values.