# **Download BBBC036 Dataset**
https://bbbc.broadinstitute.org/BBBC036
- 693 Mechanisms of Action (MoA) labels
- 5 channels (ERSyto, ERSytoBleed, Hoechst, Mito, Ph_golgi)
    - Per channel:
        -  384 wells (a01, a02, a03, ...)
        - 6 FoV (s1, s2, ..., s6)

### Step 1
- Run **_download_bbbc036.sh_**
    - Customizable parameters: "PARENT_DIR" | "PLATE_NUMBERS" | "-P 16"


### Step 2
- Unzip .zip files
    - Customizable parameters: "PARENT_DIR" | "ZIP_DIR" | "UNZIP_DIR"
- Example:
```plaintext
        csv_unzip
        ├── 24279-ERSyto
        │   ├── cdp2bioactives_a01_s1_*.tif
        │   ├── cdp2bioactives_a01_s2_*.tif
        │   ├── cdp2bioactives_a01_s3_*.tif
        │   └── ...
        ├── 24279-ERSytoBleed
        ├── 24279-Hoechst
        ├── 24279-Mito
        ├── 24279-Ph_golgi
        ├── ...
        ├── 24279-profiles.csv
        └── ...
```


In [None]:
import os
import subprocess
import shutil
from tqdm import tqdm

# Extract every downloaded ZIP archive into UNZIP_DIR, delete each archive once
# it has been extracted successfully, and copy the loose CSV files alongside
# the extracted folders.
PARENT_DIR = "/projectnb/btec-design3/novanetworks/kaggle_HPA/2021/data/bbbc036/raw/"
ZIP_DIR = "/projectnb/btec-design3/novanetworks/kaggle_HPA/2021/data/bbbc036/raw/csv_zip"

# Define the output directory where ZIP contents will be extracted
UNZIP_DIR = os.path.join(PARENT_DIR, "csv_unzip")
os.makedirs(UNZIP_DIR, exist_ok=True)

# Split the download directory listing into archives and plain CSV files
files = os.listdir(ZIP_DIR)
zip_files = [file for file in files if file.endswith(".zip")]
csv_files = [file for file in files if file.endswith(".csv")]

# Archives are grouped into fixed-size batches; the grouping only drives the
# progress bars and the per-batch status message below.
batch_size = 5
zip_file_batches = [zip_files[i:i + batch_size] for i in range(0, len(zip_files), batch_size)]

# Unzip each batch of ZIP files
for batch_index, zip_batch in enumerate(tqdm(zip_file_batches, desc="Processing batches"), start=1):
    for zip_file in tqdm(zip_batch, desc=f"Unzipping batch {batch_index}", leave=False):
        zip_file_path = os.path.join(ZIP_DIR, zip_file)

        try:
            subprocess.run(["unzip", zip_file_path, "-d", UNZIP_DIR], check=True)
        except subprocess.CalledProcessError as e:
            # Non-zero exit status from unzip: keep the archive and move on.
            print(f"Failed to extract {zip_file}: {str(e)}")
            continue
        except OSError as e:
            # e.g. the unzip executable could not be started.
            print(f"An error occurred: {str(e)}")
            continue

        # Delete the ZIP file only after successful extraction to free up space.
        try:
            os.remove(zip_file_path)
        except OSError as e:
            print(f"An error occurred: {str(e)}")

    if batch_index < len(zip_file_batches):
        print(f"Batch {batch_index} completed. Proceeding to the next batch.")

# Copy the CSV files exactly once, after all batches. Doing this outside the
# batch loop avoids re-copying the same files for every batch and also covers
# the case where ZIP_DIR contains CSV files but no ZIP archives.
for csv_file in tqdm(csv_files, desc="Copying CSV files", leave=False):
    src_csv_path = os.path.join(ZIP_DIR, csv_file)
    dst_csv_path = os.path.join(UNZIP_DIR, csv_file)
    shutil.copy(src_csv_path, dst_csv_path)


# **Process CSV Files**

### Step 1
- Use **_{plate}-profiles.csv_** file for each plate and **_BBBC036_v1_DatasetGroundTruth.csv_** to make a new csv file for model training 
    - Customizable parameters: "PROFILE_DIR" | "GT_DIR"
- Purpose:
    - Match images with their corresponding BROAD sample numbers and MoA labels
    - Convert ground truth labels into formats interpretable by the model
- Output:
    - **_train_bbbc.csv_** <font color="orange">&rarr; Place this file into "nova-networks/Nova-Classifier/dataloaders/split"</font>

### Step 2
- Use **_train_bbbc.csv_** and convert MoA labels to class numbers, such as class 0, 1, ..., 692
- Output:
    - **_bbbc_numclass.csv_**

### Step 3
- Use **_bbbc_numclass.csv_** and expand each row to contain the Metadata_Plate, Metadata_Well, and Metadata_broad_sample columns plus one column for each of the 693 classes
- If a label applies to the image, its class column is set to 1, otherwise 0
- Output:
    - **_transformed_file.csv_**
 
 ### Step 4
 - Use **_transformed_file.csv_** to check whether the cropped-image folder contains the images listed in the CSV
 - This also checks whether each image ID (Metadata_Plate + Metadata_Well) has 10 individual cropped cells
 - If an image ID meets both criteria, its row is kept; otherwise the row is removed from the CSV
 - Output:
    - **_bbbc2_rename_10cell.csv_** for model training


In [None]:
# Step 1
import pandas as pd
import os

# Path to "{plate}-profiles.csv" files
PROFILE_DIR = '/projectnb/btec-design3/novanetworks/kaggle_HPA/2021/data/bbbc036/raw/csv_unzip'

# Path to the "BBBC036_v1_DatasetGroundTruth.csv" file in the metadata folder
GT_DIR = '/projectnb/btec-design3/novanetworks/nova-networks/preprocessing/BBBC036/metadata/BBBC036_v1_DatasetGroundTruth.csv'


# Lookup table: BROAD sample id -> MoA label string from the ground-truth file
ground_truth = pd.read_csv(GT_DIR)
moa_mapping = ground_truth.set_index('Metadata_broad_sample')['Metadata_moa'].to_dict()


# Columns written to train_bbbc.csv, in output order
output_columns = ["Metadata_Plate", "Metadata_Well", "Metadata_broad_sample", "Metadata_moa"]

# Collect one filtered frame per plate and concatenate once at the end;
# concatenating inside the loop re-copies all accumulated rows every iteration.
plate_frames = []

# The listing is sorted so that the row order of the output does not depend on
# the arbitrary order returned by os.listdir.
for file in sorted(os.listdir(PROFILE_DIR)):
    if not file.endswith("-profiles.csv"):
        continue

    profile_data = pd.read_csv(os.path.join(PROFILE_DIR, file))

    # Attach the MoA label and drop wells whose sample has no ground-truth entry
    profile_data['Metadata_moa'] = profile_data['Metadata_broad_sample'].map(moa_mapping)
    profile_data = profile_data.dropna(subset=['Metadata_moa'])

    plate_frames.append(profile_data[output_columns])

if plate_frames:
    combined_data = pd.concat(plate_frames, ignore_index=True)
else:
    # No profile files found: still emit a header-only CSV.
    combined_data = pd.DataFrame(columns=output_columns)

combined_data.to_csv(os.path.join(PROFILE_DIR, "train_bbbc.csv"), index=False)



In [None]:
# Step 2
import pandas as pd

file_path = 'dataloaders/split/train_bbbc.csv'
df = pd.read_csv(file_path)

# Each cell of the fourth column holds one or more '|'-separated MoA labels.
label_lists = df.iloc[:, 3].str.split('|')

# Assign class numbers in order of first appearance across the whole column.
unique_classes = label_lists.explode().unique()
class_to_number = dict(zip(unique_classes, range(len(unique_classes))))

print(len(class_to_number)) #693 classes


def _encode(labels):
    """Return the '|'-joined class numbers for one row's list of MoA labels."""
    return '|'.join(str(class_to_number[name]) for name in labels)


# Overwrite the label column with its numeric encoding and save the result.
df.iloc[:, 3] = label_lists.apply(_encode)

df.to_csv('bbbc_numclass.csv', index=False)


In [None]:
# Step 3
import pandas as pd
import numpy as np

# Expand the '|'-separated class numbers in the fourth column of
# bbbc_numclass.csv into one 0/1 column per class and save the result as
# transformed_file.csv.

# Load the original CSV file
original_df = pd.read_csv('bbbc_numclass.csv')

# Parse the class numbers of every row once. str() keeps this working even if
# pandas parsed the column as integers (every row holding a single class).
label_lists = [
    [int(label) for label in str(cell).split('|')]
    for cell in original_df.iloc[:, 3]
]

# Sorted unique class numbers define the class columns and their order
sorted_classes = sorted({label for labels in label_lists for label in labels})
column_of_class = {cls: position for position, cls in enumerate(sorted_classes)}

# Multi-hot matrix: one row per input row, one column per class.
# Kept as float so the CSV values ("1.0"/"0.0") match the previous output.
class_matrix = np.zeros((len(original_df), len(sorted_classes)))
for row_position, labels in enumerate(label_lists):
    class_matrix[row_position, [column_of_class[label] for label in labels]] = 1

transformed_df = pd.DataFrame(class_matrix, columns=[str(cls) for cls in sorted_classes])

# Prepend the three metadata columns. They are inserted as whole columns with
# their own dtypes instead of being written cell by cell into a float frame,
# which depended on implicit dtype upcasting. The plate number is stored as a
# float to keep the same on-disk format as before.
transformed_df.insert(0, 'Metadata_Plate', original_df.iloc[:, 0].astype(int).astype(float).to_numpy())
transformed_df.insert(1, 'Metadata_Well', original_df.iloc[:, 1].to_numpy())
transformed_df.insert(2, 'Metadata_broad_sample', original_df.iloc[:, 2].to_numpy())

# Save the transformed DataFrame to a new CSV file
transformed_df.to_csv('transformed_file.csv', index=False)


In [None]:
# Step 4
import os
import pandas as pd
from skimage.io import imread
import numpy as np

csv_file_path = 'transformed_file.csv'

# Path to the folder containing the images
image_folder_path = '../preprocessing/train_bbbc_2/cell'

df = pd.read_csv(csv_file_path)

# Only the first field of view (s1) is processed, and at most 10 cell crops
# are looked up per plate/well.
fov = 1

for _, record in df.iterrows():
    # The plate number is stored as a float in the CSV; strip the ".0".
    plate_id = str(int(float(record["Metadata_Plate"])))
    well_id = record["Metadata_Well"]

    # Running count of crops found for this plate/well; used to renumber the
    # existing crops consecutively starting from 1.
    kept = 0
    for cell_number in range(1, 11):
        source_path = os.path.join(image_folder_path, f'{plate_id}_{well_id}_s{fov}_cell{cell_number}.png')
        if not os.path.exists(source_path):
            continue

        # NOTE(review): the decoded image is never used; presumably this is a
        # readability check that raises on a corrupt file — confirm.
        imread(source_path)
        kept += 1

        # Rename the file, dropping the field-of-view part of the name
        os.rename(source_path, os.path.join(image_folder_path, f'{plate_id}_{well_id}_cell{kept}.png'))
                

In [None]:
'''
    Extracts cells present in image folder and makes new csv
'''

import os
import pandas as pd

# Keep only the rows of transformed_file.csv whose plate/well has a tenth
# cell crop on disk, and write them to bbbc2_rename_10cell.csv.

# Path to the CSV file with image IDs
csv_file_path = 'transformed_file.csv'

# Path to the folder containing the images
image_folder_path = '../preprocessing/train_bbbc_2/cell'

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)


def _tenth_cell_exists(plate, well):
    """Return True if '<plate>_<well>_cell10.png' exists in image_folder_path."""
    # The plate number is stored as a float in the CSV; strip the ".0".
    plate_value = str(int(float(plate)))
    image_file_path = os.path.join(image_folder_path, f'{plate_value}_{well}_cell10.png')
    return os.path.exists(image_file_path)


# One boolean per row, aligned with df's index
keep_mask = pd.Series(
    [_tenth_cell_exists(plate, well) for plate, well in zip(df["Metadata_Plate"], df["Metadata_Well"])],
    index=df.index,
    dtype=bool,
)

# Filtering with a mask keeps the column header and dtypes even when no row
# qualifies; rebuilding a DataFrame from an empty list of rows would write a
# CSV without a header.
new_df = df.loc[keep_mask]

# Save the new DataFrame to a new CSV file
new_csv_file_path = 'bbbc2_rename_10cell.csv'
new_df.to_csv(new_csv_file_path, index=False)


### Step 2
- Split into train, test, val

# **Run Preprocessing**

- Run preproc_bbbc.sh