# Download the RSNA dataset
- RSNA Pneumonia Detection Challenge for predicting whether pneumonia exists in a given image.
- First, download the dataset with the Kaggle CLI by following the commands below:
```
cd ~/datasets/
mkdir RSNA_Pneumonia
cd RSNA_Pneumonia
kaggle competitions download -c rsna-pneumonia-detection-challenge
unzip rsna-pneumonia-detection-challenge.zip

```

## 1. Preprocess for classification.
- The original RSNA annotations have one row per bounding box; change them to one annotation per image.
- First, convert each bounding box from x,y,w,h to x1,y1,x2,y2.
- Aggregate multiple boxes into one patient.
- Group by patient.
- Make the label patient-level: if the patient has a bounding box, unhealthy: 1; otherwise healthy: 0.
- Number of annotations: 30227 -> 26684, because of the change to patient level.


In [42]:
import pandas as pd
# Folder holding the unzipped Kaggle download and the name of its label file.
dataset_folder = '/u/home/lj0/datasets/RSNA_Pneumonia'
annotations = 'stage_2_train_labels.csv'

# Load the raw annotation table, report its row count and preview it.
labels_csv = f"{dataset_folder}/{annotations}"
df = pd.read_csv(labels_csv)
print('original-df len:', len(df))
df.head()

original-df len: 30227


Unnamed: 0,patientId,x,y,width,height,Target
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,,,,,0
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,,,,,0
2,00322d4d-1c29-4943-afc9-b6754be640eb,,,,,0
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,,,,,0
4,00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,1


In [43]:
from sklearn.model_selection import train_test_split
# create bounding boxes change x,y,width,height to x1,y1,x2,y2
def create_bbox(row):
    """Convert one annotation row to corner-format box coordinates.

    Args:
        row: Mapping-like record with the keys "Target", "x", "y",
            "width" and "height".

    Returns:
        0 when ``row["Target"]`` is 0; otherwise the list
        ``[x1, y1, x2, y2]`` where ``(x1, y1)`` is ``(x, y)`` and
        ``(x2, y2)`` is ``(x + width, y + height)``.
    """
    # Rows without a finding carry no coordinates; 0 is the placeholder.
    if row["Target"] == 0:
        return 0

    left, top = row["x"], row["y"]
    return [left, top, left + row["width"], top + row["height"]]

# Attach a corner-format box to every annotation row; rows with Target == 0
# get the placeholder 0 instead of a box (see create_bbox).
df["bbox"] = df.apply(create_bbox, axis=1)

# aggregate multiple boxes into one patient
df = df[["patientId", "bbox"]]
df = df.groupby("patientId").agg(list)
df = df.reset_index()
# A patient whose aggregated list is just the placeholder [0] has no boxes.
df["bbox"] = df["bbox"].apply(lambda x: None if x == [0] else x)

# create new labels for entire image, healthy if no bounding box
# (identity check against None, consistent with the detection cell)
df["Target"] = df["bbox"].apply(lambda x: 0 if x is None else 1)
print('len of new-df:', len(df))


# split data
# test_fac is the fraction used for EACH of the validation and test sets:
# twice that fraction is held out first, then the hold-out is cut in half.
test_fac = 0.15
train_df, test_val_df = train_test_split(
    df, test_size=test_fac * 2, random_state=0)
test_df, valid_df = train_test_split(
    test_val_df, test_size=0.5, random_state=0)

# Report the size and class balance of every split.
print(f"Number of train samples: {len(train_df)}")
print(train_df["Target"].value_counts())
print(f"Number of valid samples: {len(valid_df)}")
print(valid_df["Target"].value_counts())
print(f"Number of test samples: {len(test_df)}")
print(test_df["Target"].value_counts())

# Destination files for the three patient-level splits.
RSNA_TRAIN_CSV = '/u/home/lj0/Code/VLP-Seminars/annotations/train.csv'
RSNA_VALID_CSV = '/u/home/lj0/Code/VLP-Seminars/annotations/val.csv'
RSNA_TEST_CSV = '/u/home/lj0/Code/VLP-Seminars/annotations/test.csv'

print('saving train csv file: ', RSNA_TRAIN_CSV)
print('saving valid csv file: ', RSNA_VALID_CSV)
print('saving test csv file: ', RSNA_TEST_CSV)
# index=False: the row index is not written to the CSV files.
train_df.to_csv(RSNA_TRAIN_CSV, index=False)
valid_df.to_csv(RSNA_VALID_CSV, index=False)
test_df.to_csv(RSNA_TEST_CSV, index=False)


len of new-df: 26684
Number of train samples: 18678
0    14516
1     4162
Name: Target, dtype: int64
Number of valid samples: 4003
0    3075
1     928
Name: Target, dtype: int64
Number of test samples: 4003
0    3081
1     922
Name: Target, dtype: int64
saving train csv file:  /u/home/lj0/Code/VLP-Seminars/annotations/train.csv
saving valid csv file:  /u/home/lj0/Code/VLP-Seminars/annotations/val.csv
saving test csv file:  /u/home/lj0/Code/VLP-Seminars/annotations/test.csv


In [44]:
# Preview the first rows of df (columns: patientId, bbox, Target).
df.head()

Unnamed: 0,patientId,bbox,Target
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,,0
1,000924cf-0f8d-42bd-9158-1af53881a557,,0
2,000db696-cf54-4385-b10b-6b16fbb3f985,"[[316.0, 318.0, 486.0, 796.0], [660.0, 375.0, ...",1
3,000fe35a-2649-43d4-b027-e67796d412e0,"[[570.0, 282.0, 839.0, 691.0], [83.0, 227.0, 3...",1
4,001031d9-f904-4a23-b3e5-2c088acd19c6,"[[66.0, 160.0, 439.0, 768.0], [552.0, 164.0, 9...",1


## 2. Preprocess for detection
- Similar to classification, except that the final outputs are saved to JSON files.

In [46]:
import pandas as pd
import numpy as np
import json

def prepare_detection_json(df, path):
    """Write the detection boxes of every row in *df* to a JSON file.

    Args:
        df: DataFrame with ``patientId``, ``bbox`` and ``Target`` columns.
        path: Destination of the JSON file; it is opened in write mode.

    Each output record has the form
    ``{"filename": "<patientId>.dcm", "bbox": ...}``. Rows with
    ``Target == 0`` get a single all-zero box; all other rows get their
    ``bbox`` value converted to nested Python lists.
    """
    def boxes_for(record):
        # numpy arrays are not JSON serialisable, hence the .tolist() calls.
        if record.Target == 0:
            return np.zeros((1, 4)).tolist()
        return np.array(record.bbox).tolist()

    records = [
        {"filename": record.patientId + ".dcm", "bbox": boxes_for(record)}
        for record in df.itertuples()
    ]

    # Dump all records at once as an indented JSON array.
    with open(path, 'w') as out_file:
        json.dump(records, out_file, indent=4)

    print(f"Data saved to {path}")

# Re-read the raw annotation table from the dataset folder.
dataset_folder = '/u/home/lj0/datasets/RSNA_Pneumonia'
annotations = 'stage_2_train_labels.csv'
df = pd.read_csv(f"{dataset_folder}/{annotations}")
print('original-df len:', len(df))

# Box conversion relies on `create_bbox`, which must already be defined
# in the session before this cell runs.
df["bbox"] = df.apply(create_bbox, axis=1)

# Collapse to one row per patient holding the list of that patient's boxes;
# a list that is just the placeholder [0] means "no boxes" and becomes None.
df = df[["patientId", "bbox"]].groupby("patientId").agg(list).reset_index()
df["bbox"] = df["bbox"].apply(lambda boxes: None if boxes == [0] else boxes)

# Patient-level label: 0 without any box, 1 otherwise.
df["Target"] = df["bbox"].apply(lambda boxes: 0 if boxes is None else 1)

# Split data
from sklearn.model_selection import train_test_split

# Hold out 2 * 5337 patients, then cut the hold-out in half (test / valid).
train_df, test_val_df = train_test_split(df, test_size=5337 * 2, random_state=0)
test_df, valid_df = train_test_split(test_val_df, test_size=0.5, random_state=0)

# Report the size and class balance of every split.
for split_name, split_df in (("train", train_df), ("valid", valid_df), ("test", test_df)):
    print(f"Number of {split_name} samples: {len(split_df)}")
    print(split_df["Target"].value_counts())

# Destination JSON files for the three detection splits.
RSNA_DETECTION_TRAIN_JSON = '/u/home/lj0/Code/VLP-Seminars/annotations/train_det.json'
RSNA_DETECTION_VALID_JSON = '/u/home/lj0/Code/VLP-Seminars/annotations/val_det.json'
RSNA_DETECTION_TEST_JSON = '/u/home/lj0/Code/VLP-Seminars/annotations/test_det.json'

for split_df, json_path in (
    (train_df, RSNA_DETECTION_TRAIN_JSON),
    (valid_df, RSNA_DETECTION_VALID_JSON),
    (test_df, RSNA_DETECTION_TEST_JSON),
):
    prepare_detection_json(split_df, json_path)


original-df len: 30227
Number of train samples: 16010
0    12431
1     3579
Name: Target, dtype: int64
Number of valid samples: 5337
0    4155
1    1182
Name: Target, dtype: int64
Number of test samples: 5337
0    4086
1    1251
Name: Target, dtype: int64
Data saved to /u/home/lj0/Code/VLP-Seminars/annotations/train_det.json
Data saved to /u/home/lj0/Code/VLP-Seminars/annotations/val_det.json
Data saved to /u/home/lj0/Code/VLP-Seminars/annotations/test_det.json
