# EDA beta concatenated

First, we clean the dataset by dropping columns that are unused or specific to the alpha chain.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import wandb
import os
from dotenv import load_dotenv

In [None]:
# -----------------------------------------------------------------------------
# W&B Setup
# -----------------------------------------------------------------------------
# Loads the .env file, reads the W&B project name from MAIN_PROJECT_NAME,
# downloads the beta dataset artifact and reads the train/test/validation
# splits plus the precomputed levenshtein table into DataFrames.
load_dotenv()
PROJECT_NAME = os.getenv("MAIN_PROJECT_NAME")
print(f"PROJECT_NAME: {PROJECT_NAME}")

# Dataset selection and locations
precision = "gene" # gene or allele
levenshtein_data_path = "/home/ubuntu/BA_ZHAW/data/EDA/beta/"
download_path = "/home/ubuntu/BA_ZHAW/data/WnB_Download/beta"
output_path = '/home/ubuntu/BA_ZHAW/data/EDA/beta/'
output_file_name = f'beta_{precision}_levenshtein.tsv'
dataset_name = f"beta_{precision}"

run = wandb.init(project=PROJECT_NAME, job_type="download dataset", entity="ba-zhaw")
config = wandb.config

# Download corresponding artifact (= dataset) from W&B.
# The run is closed in `finally` so a failed lookup/download does not leave it open.
try:
    artifact = run.use_artifact(f"{dataset_name}:latest")
    data_dir = artifact.download(download_path)
finally:
    run.finish()

train_file_path = f"{data_dir}/{precision}/train.tsv"
test_file_path = f"{data_dir}/{precision}/test.tsv"
val_file_path = f"{data_dir}/{precision}/validation.tsv"

# "Unnamed: 0" is the column pandas creates for a header-less first column
# (typically a saved index). It is removed from every split, not only the test
# split, so the splits share the same columns; errors="ignore" makes this a
# no-op where the column is absent.
index_artifact_columns = ["Unnamed: 0"]
df_train = pd.read_csv(train_file_path, sep="\t").drop(columns=index_artifact_columns, errors="ignore")
df_test = pd.read_csv(test_file_path, sep="\t").drop(columns=index_artifact_columns, errors="ignore")
df_validation = pd.read_csv(val_file_path, sep="\t").drop(columns=index_artifact_columns, errors="ignore")

# Rows used for training and validation together
df_seen = pd.concat([df_train, df_validation])

# Precomputed levenshtein table; remove if levenshtein data isn't available
df_levenshtein = pd.read_csv(os.path.join(levenshtein_data_path, output_file_name), sep="\t")

In [None]:

output_filename = f"/home/ubuntu/BA_ZHAW/data/EDA/beta/ydata_profiling_{precision}.html"

# Stack all three splits into one frame. ignore_index=True gives every row a
# unique label; without it the row labels of the individual splits repeat.
df = pd.concat([df_train, df_test, df_validation], ignore_index=True)

# Columns that are not of interest when analysing the beta chain.
# errors='ignore' skips any column that is not present.
columns_to_drop = ["TCR_name", "TRAV", "TRAJ", "TRA_CDR3", "TRAC", "TRBC"]
df = df.drop(columns=columns_to_drop, errors='ignore')

print(f"the whole dataset has {len(df)} entries and the following columns are considered for this notebook:")
df.head()

## Pandas Profiling
The pandas-profiling package was renamed to ydata-profiling.
That is why this notebook uses ydata-profiling to generate the profiling reports.

In [None]:
from ydata_profiling import ProfileReport

# Profiling report over the combined beta-chain frame `df`
report_title = "Profiling Report"
profile = ProfileReport(df, title=report_title)

In [None]:
# Write the report to `output_filename`; `profile.to_widgets()` would show it inline instead.
profile.to_file(output_file=output_filename)

In [None]:
# Profiling report for the levenshtein table.
profile = ProfileReport(df_levenshtein, title=f"Report paired levenshtein {precision}")
levenshtein_data_path = "/home/ubuntu/BA_ZHAW/data/EDA/beta/"

# The report is an HTML document, so it is saved with an .html extension
# (the previous .tsv name did not match the content that is written).
levenshtein_report_file = os.path.join(levenshtein_data_path, f"profiling_{precision}_levenshtein.html")
profile.to_file(levenshtein_report_file)

## Tasks

In [None]:
print("Each entry has a minimum levenshtein to the seen data. From this values, we take the mean.")

# One sub-frame per task
df_levenshtein_TPP2 = df_levenshtein.loc[df_levenshtein["task"] == "TPP2"]
df_levenshtein_TPP3 = df_levenshtein.loc[df_levenshtein["task"] == "TPP3"]

# Report the mean of the per-entry minimum distances for CDR3 (TRB) and epitope, task by task
for task_name, task_frame in (("TPP2", df_levenshtein_TPP2), ("TPP3", df_levenshtein_TPP3)):
    mean_cdr_distance = task_frame['min_levenshtein_cdr_to_seen'].mean()
    mean_epitope_distance = task_frame['min_levenshtein_epitope_to_seen'].mean()
    print(f"mean levenshtein distance of TRB {task_name}: {mean_cdr_distance}")
    print(f"mean levenshtein distance of Epitope {task_name}: {mean_epitope_distance}")

## Epitopes

In [None]:
# Distinct epitope values (missing values excluded) versus rows that carry an epitope at all
epitope_column = df["Epitope"]
unique_epitope_count = epitope_column.nunique()
epitope_count = epitope_column.count()
print(f"there are {epitope_count} epitopes and {unique_epitope_count} distinct epitopes")

In [None]:
# Number of epitopes that occur exactly once: keep=False marks every member of a repeated group
occurs_only_once = ~df["Epitope"].duplicated(keep=False)
print(int(occurs_only_once.sum()))

In [None]:
# Number of test rows whose task is TPP3
is_tpp3 = df_test["task"] == "TPP3"
print(int(is_tpp3.sum()))

Now a visualisation of how the epitopes are distributed.

In [None]:
# Number of rows per epitope
epitope_counts = df['Epitope'].value_counts()

# Keep only the epitopes that occur more than 100 times
is_frequent = epitope_counts > 100
filtered_epitope_counts = epitope_counts[is_frequent]

# Bar chart of the frequent epitopes
fig, ax = plt.subplots(figsize=(10, 6))
filtered_epitope_counts.plot(kind='bar', color='teal', ax=ax)
ax.set_xlabel('Epitope')
ax.set_ylabel('Count')
ax.set_title('Distribution of Epitopes with More Than 100 Entries')
plt.xticks(rotation=45)
plt.show()

## CDR-3 Regions (beta)

In [None]:
# Distinct TRB_CDR3 sequences (missing values are not counted)
unique_cdr_count = df["TRB_CDR3"].nunique()
print(f"there are {unique_cdr_count} unique CDR-3 beta regions")

Distribution of TRB_CDR3 Sequence Lengths

In [None]:
# Calculate TRB_CDR3 sequence lengths.
# The vectorised .str.len() yields NaN for a missing sequence, whereas
# apply(len) raises a TypeError on it.
df['TRB_CDR3_Length'] = df['TRB_CDR3'].str.len()

plt.figure(figsize=(10, 6))
# Missing lengths are left out of the histogram; they cannot be binned
plt.hist(df['TRB_CDR3_Length'].dropna())
plt.title('Distribution of TRB_CDR3 Sequence Lengths')
plt.xlabel('TRB_CDR3 Length')
plt.ylabel('Frequency')
plt.show()

## V and J region (beta)

How many entries have both a V **and** a J region?

In [None]:
# Rows where both the V gene and the J gene are present
has_v_and_j = df["TRBV"].notna() & df["TRBJ"].notna()
paired_VJ_count = has_v_and_j.sum()
print(f"There are {paired_VJ_count} entries which have the V and J region")

Visual representation of the distribution of V and J regions.

In [None]:
# Define a function to categorize each entry
def categorize_row(row):
    """Label an entry by which of its TRBV / TRBJ values are present.

    Args:
        row: Object indexable by 'TRBV' and 'TRBJ' (e.g. a DataFrame row).

    Returns:
        'Both V & J Regions', 'Only V Region', 'Only J Region' or 'Neither',
        depending on which of the two values are non-null.
    """
    has_v = bool(pd.notnull(row['TRBV']))
    has_j = bool(pd.notnull(row['TRBJ']))

    # Lookup keyed by (V present, J present)
    labels = {
        (True, True): 'Both V & J Regions',
        (True, False): 'Only V Region',
        (False, True): 'Only J Region',
        (False, False): 'Neither',
    }
    return labels[(has_v, has_j)]

# Label every row with its V/J presence category
df['Category'] = df.apply(categorize_row, axis=1)

# Number of rows per category, most frequent first
category_counts = df['Category'].value_counts()

# Pie chart of the category shares
pie_colors = ['skyblue', 'lightgreen', 'orange', 'lightcoral']
fig, ax = plt.subplots(figsize=(10, 6))
ax.pie(
    category_counts,
    labels=category_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=pie_colors,
)
ax.set_title('Distribution of Entries by V & J Region Presence')
plt.show()

Exploring how many unique combinations of TRBV and TRBJ genes there are could be informative. It may help in understanding the diversity of the T-cell receptor beta chains represented in the data.

In [None]:
# Number of rows for every TRBV/TRBJ pair that occurs in the data
unique_combinations = (
    df.groupby(['TRBV', 'TRBJ'])
    .size()
    .reset_index(name='Count')
)

# The 20 most frequent pairs
top_combinations = unique_combinations.sort_values(by='Count', ascending=False).head(20)
combination_labels = top_combinations['TRBV'] + '-' + top_combinations['TRBJ']

# Horizontal bar chart, one bar per pair
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(combination_labels, top_combinations['Count'], color='purple')
ax.set_xlabel('Count')
ax.set_ylabel('TRBV-TRBJ Combinations')
ax.set_title('Top 20 TRBV and TRBJ Combinations')
plt.show()

### V Region
How many entries have a V region, and how many different V regions are there?

In [None]:
# Distinct V genes, ignoring missing values
distinct_v_genes = df['TRBV'].dropna().unique()
unique_V_count = len(distinct_v_genes)
print(f"There are {unique_V_count} unique V regions")

In [None]:
# Number of rows that carry a (non-null) TRBV value.
V_count = df["TRBV"].notnull().sum()
# The message names the V region; it previously said "J region" for this V count.
print(f"There are {V_count} entries for V region")

### J Region
How many entries have a J region, and how many different J regions are there?

In [None]:
# Distinct J genes, ignoring missing values
distinct_j_genes = df["TRBJ"].dropna().unique()
unique_J_count = len(distinct_j_genes)
print(f"There are {unique_J_count} unique J regions")

In [None]:
# Number of rows that carry a (non-null) TRBJ value
J_count = df["TRBJ"].count()
print(f"There are {J_count} entries for J region")

## MHC
How many entries have an MHC value?

In [None]:
# Rows whose MHC value is present
has_mhc = df["MHC"].notna()
paired_MHC_count = has_mhc.sum()
print(f"There are {paired_MHC_count} entries which have the MHC region")

In [None]:
# Distinct MHC values, ignoring missing values
distinct_mhc_values = df["MHC"].dropna().unique()
unique_MHC_count = len(distinct_mhc_values)
print(f"There are {unique_MHC_count} unique MHC values")

In [None]:
# Number of rows that carry a (non-null) MHC value
MHC_count = df["MHC"].count()
print(f"There are {MHC_count} entries MHC")

Distribution of MHC

In [None]:
# Count the occurrences of each MHC
mhc_counts = df['MHC'].value_counts()

# Keep only MHC values with more than `mhc_min_entries` entries. The same
# threshold is used in the title, which previously said 10 while the filter
# used 100.
mhc_min_entries = 100
filtered_mhc_counts = mhc_counts[mhc_counts > mhc_min_entries]

# Plotting
plt.figure(figsize=(10, 6))
filtered_mhc_counts.plot(kind='bar', color='teal')
plt.xlabel('MHC')
plt.ylabel('Count')
plt.title(f'Distribution of MHC with More Than {mhc_min_entries} Entries')
plt.xticks(rotation=45)
plt.show()