# Prepare the dataset for the BCR classifier.
# Follows the preprocessing steps in https://github.com/naity/protein-transformer/blob/main/notebooks/bcr_preprocessing.ipynb
# and saves the resulting dataset to BCR_classifier/data_dir/bcr.parquet

In [None]:
import pandas as pd


In [None]:
# Raw IEDB export. Download
# https://www.iedb.org/downloader.php?file_name=doc/bcr_full_v3.zip
# and unzip it into data_dir before running this cell.
# header=1: column names are taken from row index 1 (0-based) of the file.
raw_csv = "../data_dir/bcr_full_v3.csv"
df = pd.read_csv(raw_csv, header=1)
print(df.shape)
df.head()



In [None]:
# Keep only human BCRs, identified by the NCBI taxon 9606 IRI.
human_iri = "http://purl.obolibrary.org/obo/NCBITaxon_9606"
is_human = df["Organism IRI"] == human_iri
bcr = df.loc[is_human]
bcr.shape


In [None]:
# Clean up: discard rows that have no "Source Organism" value.
bcr = bcr.dropna(subset=["Source Organism"])


In [None]:
# Show the 20 most frequent source organisms (value_counts sorts descending).
organism_counts = bcr["Source Organism"].value_counts()
organism_counts.head(20)

In [None]:
# COVID BCRs: rows whose source organism is exactly "SARS-CoV2"
# (names that merely start with "SARS-CoV2" are deliberately left out).
# .copy() yields an independent frame, so adding the "target" column below
# does not write into a slice of `bcr` (avoids SettingWithCopyWarning).
covid = bcr.loc[bcr["Source Organism"] == "SARS-CoV2"].copy()
covid["target"] = "SARS-CoV2"
covid.shape

In [None]:
# HIV-1 BCRs: rows whose source organism name contains the literal
# substring "HIV-1" (regex=False: plain substring match, not a pattern).
# .copy() yields an independent frame, so adding the "target" column below
# does not write into a slice of `bcr` (avoids SettingWithCopyWarning).
hiv1 = bcr.loc[bcr["Source Organism"].str.contains("HIV-1", regex=False)].copy()
hiv1["target"] = "HIV-1"
hiv1.shape

In [None]:
# Stack the two labelled subsets into a single frame; the original row
# index of each subset is kept as-is.
labelled_frames = [covid, hiv1]
data = pd.concat(labelled_frames)
data.shape

In [None]:
# Persist the combined, labelled table as a parquet file.
parquet_path = "../data_dir/bcr.parquet"
data.to_parquet(parquet_path)

In [None]:
from sklearn.preprocessing import LabelEncoder

# The data was generated following
# https://github.com/naity/protein-transformer/blob/main/notebooks/bcr_preprocessing.ipynb
# and saved to BCR_classifier/data_dir/bcr.parquet.
bcr = pd.read_parquet("../data_dir/bcr.parquet")

# Encode the string "target" column as integer class ids in a new "label"
# column. `le.classes_` keeps the id -> target name mapping for later decoding.
le = LabelEncoder()
bcr["label"] = le.fit_transform(bcr["target"])
bcr.head()

In [None]:
from sklearn.model_selection import train_test_split

# Target proportions of the full dataset: 70% train / 10% val / 20% test.
TEST_FRACTION = 0.2
VAL_FRACTION = 0.1
SPLIT_SEED = 42

# Hold out the test set first. Stratify on the label so the test set has the
# same class balance as the full dataset (the second split is stratified too).
train_val, test = train_test_split(
    bcr,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=bcr["label"],
)

# The second split only sees the remaining 80% of the rows, so the validation
# share must be rescaled: 0.1 / (1 - 0.2) = 0.125 of train_val is 10% overall.
val_share_of_train_val = VAL_FRACTION / (1 - TEST_FRACTION)
train, val = train_test_split(
    train_val,
    test_size=val_share_of_train_val,
    random_state=SPLIT_SEED,
    stratify=train_val["label"],
)

# Write each split without the pandas index column.
train.to_csv("../data_dir/bcr_train.csv", index=False)
val.to_csv("../data_dir/bcr_val.csv", index=False)
test.to_csv("../data_dir/bcr_test.csv", index=False)