# **PART 2:** Preparing data for epitope annotation with TCRex

In [1]:
import os
# Make the repository directory the working directory for the rest of the notebook.
# NOTE: this absolute path is specific to one machine; point it at your own checkout.
repo_dir = "/home/sebastiaan/PhD/Repositories/book_chapter/"
os.chdir(repo_dir)

Once again, we will need the `pandas` library for handling the data.

In [None]:
import pandas as pd

TCRex sets an upper limit of 50,000 on the number of sequences that a user may upload. Therefore, files containing more than 50K sequences should be split into smaller chunks that satisfy the upload limit of the TCRex software.

We will start out by writing a function that does just that.

In [9]:
# Function to split data in smaller files
def split_data(
    fn: str,
    folder: str,
    tcrex_limit: int = 50000,
    data: "pd.DataFrame | None" = None,
) -> list:
    """
    Split a table of TCR sequences into smaller tab-separated files.

    Every output file holds at most ``tcrex_limit`` rows and is written to
    ``folder`` as ``<fn>_<i>.tsv``, where ``i`` numbers the chunks from 1.

    Args:
        - fn: name of the file that is being split; it is used verbatim as
          the prefix of the output file names (an extension in fn is kept)
        - folder: path to the directory where the split files are stored
          (created if it does not exist yet)
        - tcrex_limit: maximum number of sequences in the split files
          (default = 50K)
        - data: the table to split. When omitted, the function falls back to
          a variable named ``data`` in the namespace in which this function
          was defined, which is how the function obtained the table before
          this parameter existed.

    Returns:
        A list with the paths of the files that were written, in chunk order.

    Raises:
        ValueError: if tcrex_limit is smaller than 1.
        NameError: if no data is passed and no ``data`` variable is available.
    """
    # A non-positive chunk size would either crash inside range() or silently
    # write nothing, so reject it up front with a clear message.
    if tcrex_limit < 1:
        raise ValueError(
            "tcrex_limit must be a positive integer, got {!r}".format(tcrex_limit)
        )

    if data is None:
        # Backward compatibility: older calls do not pass the table and rely
        # on a notebook-level variable called `data`.
        try:
            data = globals()["data"]
        except KeyError:
            raise NameError(
                "no data to split: pass the table via the 'data' argument"
            ) from None

    # Make sure the target directory exists before writing any chunk
    if folder:
        os.makedirs(folder, exist_ok=True)

    # Derive the row count from the table itself instead of an outside variable
    nr_rows = len(data)

    written = []
    for i, chunk_start in enumerate(range(0, nr_rows, tcrex_limit), start=1):
        # Select chunk (positional row slice; the last chunk may be shorter)
        data_chunk = data.iloc[chunk_start:chunk_start + tcrex_limit]
        # Save chunk to target directory
        target = os.path.join(folder, '_'.join([fn, str(i)]) + '.tsv')
        data_chunk.to_csv(target, sep='\t', index=False)
        written.append(target)
    return written

Next, we will check every file to see whether it exceeds the TCRex data limit. If so, we will split the file into smaller chunks (of size 50,000) using the function we defined previously. 

In [None]:
# Define the directory to collect all parsed (i.e. split) files and the
# directory that holds the original files
parsed_data = "../data/parsed_data"
origin_data = "../data/examples"

# Maximum number of TCRs that can be uploaded to TCRex in one file
tcrex_limit = 50000

# If the parsed_data folder does not exist, create it
os.makedirs(parsed_data, exist_ok=True)

# Collect the names of the files to process.
# NOTE(review): `files` was not defined before this cell; it is taken to be
# every regular, non-hidden file in origin_data -- confirm this is intended.
files = sorted(
    name for name in os.listdir(origin_data)
    if not name.startswith('.')
    and os.path.isfile(os.path.join(origin_data, name))
)

# Loop over files and split them if they contain more than 50K TCRs
for fn in files:
    # Read in every file with name fn
    data = pd.read_csv(os.path.join(origin_data, fn), sep = '\t')
    # Calculate the number of TCRs in the file
    nr_rows = data.shape[0]
    # If the number of TCRs exceeds the TCRex limit, split the file fn in smaller files
    if nr_rows > tcrex_limit:
        # The chunks of one file go into their own sub-folder, named after
        # the part of the file name before the first dot
        new_folder = os.path.join(parsed_data, fn.split('.')[0])
        os.makedirs(new_folder, exist_ok=True)
        # Use the split_data function we defined previously to split the file.
        # The table itself is not passed in the call: split_data picks it up
        # from the notebook namespace, so `data`/`nr_rows` must keep their names.
        split_data(
            fn = fn,
            tcrex_limit = tcrex_limit,
            folder = new_folder
        )
    # If the number of TCRs does not exceed the TCRex limit, move the file to the new folder
    else:
        # os.replace takes its paths positionally (src, dst); note that this
        # moves the file out of origin_data rather than copying it
        os.replace(
            os.path.join(origin_data, fn),
            os.path.join(parsed_data, fn)
        )