# EDA paired concatenated

First we do some cleaning of the dataset because there are some unused columns.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the paired, concatenated Stitchr dataset (tab-separated).
# low_memory=False turns off pandas' chunk-wise parsing so that column dtypes
# are inferred from the whole file at once.
df = pd.read_csv(
    "../../data/customDatasets/Stitchr_paired_concatenated.tsv",
    sep="\t",
    low_memory=False,
)

# Columns that are not needed for the profiling report; `drop` returns a new
# frame, so `df` itself keeps every column.
unused_columns = ["TCR_name", "TRA_nt", "TRB_nt", "TRA_aa", "TRB_aa", "TRAC", "TRBC"]
df_for_profiling = df.drop(columns=unused_columns)

print(f"the whole dataset has {len(df)} entries and the following columns are considered for this notebook:")
# Preview the reduced frame so that the displayed columns match the message
# printed above (previewing `df` would still show the dropped columns).
df_for_profiling.head()

## Pandas Profiling
The `pandas-profiling` package has been renamed to `ydata-profiling`.
That is why this notebook uses `ydata-profiling` to profile the data.

In [None]:
from ydata_profiling import ProfileReport
# Build the profiling report from the reduced frame (minimal mode requested).
profile = ProfileReport(
    df_for_profiling,
    title="Profiling Report",
    minimal=True,
)

# Render the report as notebook widgets, then write it to an HTML file.
profile.to_widgets()
profile_output_path = "ydata_profiling_outputs/stitchr_paired.html"
profile.to_file(profile_output_path)

## Epitopes

In [None]:
epitope_column = df["Epitope"]

# Both nunique() and count() skip missing values.
unique_epitope_count = epitope_column.nunique()
epitope_count = epitope_column.count()

print(f"there are {epitope_count} epitopes and {unique_epitope_count} unique epitopes")

Now a visualisation of how the epitopes are distributed.

In [None]:
# Frequency of every epitope (missing values are not counted)
epitope_counts = df['Epitope'].value_counts()

# Keep only the epitopes that occur more than 100 times
filtered_epitope_counts = epitope_counts.loc[epitope_counts > 100]

# Bar chart of the remaining epitopes, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
filtered_epitope_counts.plot.bar(ax=ax, color='teal')
ax.set_xlabel('Epitope')
ax.set_ylabel('Count')
ax.set_title('Distribution of Epitopes with More Than 100 Entries')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

## CDR-3 Regions (alpha)

In [None]:
# Number of distinct, non-missing TRA_CDR3 sequences
unique_cdr_count = df["TRA_CDR3"].nunique()
print(f"there are {unique_cdr_count} unique CDR-3 alpha regions")

Distribution of TRA_CDR3 sequence lengths

In [None]:
# Calculate TRA_CDR3 sequence lengths.
# `.str.len()` is vectorised and yields NaN for missing sequences, whereas
# `.apply(len)` raises a TypeError as soon as it meets a missing (float NaN) entry.
df['TRA_CDR3_Length'] = df['TRA_CDR3'].str.len()

plt.figure(figsize=(10, 6))
# Missing lengths are dropped so the histogram range stays finite.
plt.hist(df['TRA_CDR3_Length'].dropna())
plt.title('Distribution of TRA_CDR3 Sequence Lengths')
plt.xlabel('TRA_CDR3 Length')
plt.ylabel('Frequency')
plt.show()

## CDR-3 Regions (beta)

In [None]:
# Number of distinct, non-missing TRB_CDR3 sequences
unique_cdr_count = df["TRB_CDR3"].nunique()
print(f"there are {unique_cdr_count} unique CDR-3 beta regions")

Distribution of TRB_CDR3 sequence lengths

In [None]:
# Calculate TRB_CDR3 sequence lengths.
# `.str.len()` is vectorised and yields NaN for missing sequences, whereas
# `.apply(len)` raises a TypeError as soon as it meets a missing (float NaN) entry.
df['TRB_CDR3_Length'] = df['TRB_CDR3'].str.len()

plt.figure(figsize=(10, 6))
# Missing lengths are dropped so the histogram range stays finite.
plt.hist(df['TRB_CDR3_Length'].dropna())
plt.title('Distribution of TRB_CDR3 Sequence Lengths')
plt.xlabel('TRB_CDR3 Length')
plt.ylabel('Frequency')
plt.show()

## V and J region (alpha)

How many entries have both a V **and** a J region?

In [None]:
# An entry counts as paired when neither its TRAV nor its TRAJ value is missing.
has_alpha_v = df["TRAV"].notna()
has_alpha_j = df["TRAJ"].notna()
paired_VJ_count = (has_alpha_v & has_alpha_j).sum()
print(f"There are {paired_VJ_count} entries which have the V and J region")

visual representation of the distribution of V and J regions

In [None]:
# Define a function to categorize each entry
def categorize_row(row, v_col='TRAV', j_col='TRAJ'):
    """Classify one entry by which of its V and J gene annotations are present.

    Parameters
    ----------
    row : mapping-like
        Anything that supports ``row[v_col]`` and ``row[j_col]``, e.g. a
        DataFrame row as passed by ``df.apply(..., axis=1)``.
    v_col : str, optional
        Name of the V gene column. Defaults to ``'TRAV'`` (alpha chain).
    j_col : str, optional
        Name of the J gene column. Defaults to ``'TRAJ'`` (alpha chain).

    Returns
    -------
    str
        ``'Both V & J Regions'``, ``'Only V Region'``, ``'Only J Region'``
        or ``'Neither'``.
    """
    # pd.notnull treats None and NaN alike as missing.
    has_v = pd.notnull(row[v_col])
    has_j = pd.notnull(row[j_col])

    if has_v and has_j:
        return 'Both V & J Regions'
    if has_v:
        return 'Only V Region'
    if has_j:
        return 'Only J Region'
    return 'Neither'

# Label every entry with its V/J presence category
df['Category'] = df.apply(categorize_row, axis=1)

# Tally the entries per category
category_counts = df['Category'].value_counts()

# Pie chart of the category shares, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.pie(
    category_counts,
    labels=category_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=['skyblue', 'lightgreen', 'orange', 'lightcoral'],
)
ax.set_title('Distribution of Entries by V & J Region Presence')
plt.show()

Exploring how many unique combinations of TRAV and TRAJ genes there are could be informative. This might help in understanding the diversity of T-cell receptor alpha chains represented in the data.

In [None]:
# Number of entries for every observed (TRAV, TRAJ) pair
unique_combinations = (
    df.groupby(['TRAV', 'TRAJ'])
    .size()
    .reset_index(name='Count')
)

# The 20 most frequent pairs, most frequent first
top_combinations = unique_combinations.sort_values(by='Count', ascending=False).head(20)

# Horizontal bar chart labelled "<TRAV>-<TRAJ>"
combination_labels = top_combinations['TRAV'] + '-' + top_combinations['TRAJ']
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(combination_labels, top_combinations['Count'], color='purple')
ax.set_xlabel('Count')
ax.set_ylabel('TRAV-TRAJ Combinations')
ax.set_title('Top 20 TRAV and TRAJ Combinations')
plt.show()

## V and J region (beta)

How many entries have both a V **and** a J region?

In [None]:
# An entry counts as paired when neither its TRBV nor its TRBJ value is missing.
has_beta_v = df["TRBV"].notna()
has_beta_j = df["TRBJ"].notna()
paired_VJ_count = (has_beta_v & has_beta_j).sum()
print(f"There are {paired_VJ_count} entries which have the V and J region")

visual representation of the distribution of V and J regions. 

In [None]:
# Define a function to categorize each entry
def categorize_row(row, v_col='TRBV', j_col='TRBJ'):
    """Classify one entry by which of its V and J gene annotations are present.

    Parameters
    ----------
    row : mapping-like
        Anything that supports ``row[v_col]`` and ``row[j_col]``, e.g. a
        DataFrame row as passed by ``df.apply(..., axis=1)``.
    v_col : str, optional
        Name of the V gene column. Defaults to ``'TRBV'`` (beta chain).
    j_col : str, optional
        Name of the J gene column. Defaults to ``'TRBJ'`` (beta chain).

    Returns
    -------
    str
        ``'Both V & J Regions'``, ``'Only V Region'``, ``'Only J Region'``
        or ``'Neither'``.
    """
    # pd.notnull treats None and NaN alike as missing.
    has_v = pd.notnull(row[v_col])
    has_j = pd.notnull(row[j_col])

    if has_v and has_j:
        return 'Both V & J Regions'
    if has_v:
        return 'Only V Region'
    if has_j:
        return 'Only J Region'
    return 'Neither'

# Label every entry with its V/J presence category
df['Category'] = df.apply(categorize_row, axis=1)

# Tally the entries per category
category_counts = df['Category'].value_counts()

# Pie chart of the category shares, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.pie(
    category_counts,
    labels=category_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=['skyblue', 'lightgreen', 'orange', 'lightcoral'],
)
ax.set_title('Distribution of Entries by V & J Region Presence')
plt.show()

Exploring how many unique combinations of TRBV and TRBJ genes there are could be informative. This might help in understanding the diversity of T-cell receptor beta chains represented in data.

In [None]:
# Number of entries for every observed (TRBV, TRBJ) pair
unique_combinations = (
    df.groupby(['TRBV', 'TRBJ'])
    .size()
    .reset_index(name='Count')
)

# The 20 most frequent pairs, most frequent first
top_combinations = unique_combinations.sort_values(by='Count', ascending=False).head(20)

# Horizontal bar chart labelled "<TRBV>-<TRBJ>"
combination_labels = top_combinations['TRBV'] + '-' + top_combinations['TRBJ']
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(combination_labels, top_combinations['Count'], color='purple')
ax.set_xlabel('Count')
ax.set_ylabel('TRBV-TRBJ Combinations')
ax.set_title('Top 20 TRBV and TRBJ Combinations')
plt.show()

### V Region
How many entries have a V region, and how many different V regions are there?

In [None]:
# Distinct, non-missing V genes per chain (alpha first, then beta)
for chain_label, v_column in (("alpha", "TRAV"), ("beta", "TRBV")):
    unique_V_count = df[v_column].nunique()
    print(f"There are {unique_V_count} unique V regions in {chain_label} chain")

In [None]:
# Number of entries with a non-missing V gene annotation, per chain.
# The messages name the V region: these counts come from TRAV/TRBV, not from
# the J gene columns.
V_count = df["TRAV"].notnull().sum()
print(f"There are {V_count} entries for V region in alpha chain")
V_count = df["TRBV"].notnull().sum()
print(f"There are {V_count} entries for V region in beta chain")

### J Region
How many entries have a J region, and how many different J regions are there?

In [None]:
# Distinct, non-missing J genes per chain (alpha first, then beta)
for chain_label, j_column in (("alpha", "TRAJ"), ("beta", "TRBJ")):
    unique_J_count = df[j_column].nunique()
    print(f"There are {unique_J_count} unique J regions in {chain_label} chain")

In [None]:
# Number of entries with a non-missing J gene annotation, per chain
# (Series.count() skips missing values).
J_count = df["TRAJ"].count()
print(f"There are {J_count} entries for J region in alpha chain")

J_count = df["TRBJ"].count()
print(f"There are {J_count} entries for J region in beta chain")

## MHC
How many entries have both an MHC A **and** an MHC B value?

In [None]:
# An entry counts as paired when neither its MHC A nor its MHC B value is missing.
has_mhc_a = df["MHC A"].notna()
has_mhc_b = df["MHC B"].notna()
paired_MHC_count = (has_mhc_a & has_mhc_b).sum()
print(f"There are {paired_MHC_count} entries which have the MHC A and MHC B region")

### MHC A

In [None]:
# Distinct MHC A values, with missing entries removed before counting
distinct_mhc_a_values = df["MHC A"].dropna().unique()
unique_MHCa_count = len(distinct_mhc_a_values)
print(f"There are {unique_MHCa_count} unique MHC A values")

In [None]:
# Entries with a non-missing MHC A value (Series.count() skips missing values)
MHCa_count = df["MHC A"].count()
print(f"There are {MHCa_count} entries MHC A")

Distribution of MHC A

In [None]:
# Count the occurrences of each MHC A value (missing values are not counted)
mhc_counts = df['MHC A'].value_counts()

# Keep only the MHC A values with more than `min_entries` entries.
# The threshold is named once so that the filter and the plot title cannot
# disagree (the title used to say 10 while the filter used 100).
min_entries = 100
filtered_mhc_counts = mhc_counts[mhc_counts > min_entries]

# Plotting
plt.figure(figsize=(10, 6))
filtered_mhc_counts.plot(kind='bar', color='teal')
plt.xlabel('MHC A')
plt.ylabel('Count')
plt.title(f'Distribution of MHC A with More Than {min_entries} Entries')
plt.xticks(rotation=45)
plt.show()

### MHC B

In [None]:
# Distinct MHC B values, with missing entries removed before counting
distinct_mhc_b_values = df["MHC B"].dropna().unique()
unique_MHCb_count = len(distinct_mhc_b_values)
print(f"There are {unique_MHCb_count} unique MHC B values")

In [None]:
# Entries with a non-missing MHC B value (Series.count() skips missing values)
MHCb_count = df["MHC B"].count()
print(f"There are {MHCb_count} entries for MHC B")

Distribution of MHC B

In [None]:
# Count the occurrences of each MHC B value (missing values are not counted)
mhc_counts = df['MHC B'].value_counts()

# Keep only the MHC B values with more than `min_entries` entries; the same
# variable feeds the plot title so the two cannot drift apart.
min_entries = 10
filtered_mhc_counts = mhc_counts[mhc_counts > min_entries]

# Plotting -- the axis label and title name MHC B, the column actually plotted
# (they previously said "MHC A").
plt.figure(figsize=(10, 6))
filtered_mhc_counts.plot(kind='bar', color='teal')
plt.xlabel('MHC B')
plt.ylabel('Count')
plt.title(f'Distribution of MHC B with More Than {min_entries} Entries')
plt.xticks(rotation=45)
plt.show()