# Data preparation

In [None]:
import os
import pandas as pd
import json

import wandb

# W&B project and entity that the runs and artifacts are logged to
PROJECT_NAME = 'lemon-test1'
ENTITY = 'wandb_course'

# Local dataset location; the annotations file sits inside the raw data folder
RAW_DATA_FOLDER = 'lemon-dataset/'
ANNOTATIONS_FILE = f'{RAW_DATA_FOLDER}annotations/instances_default.json'

# Shared prefix for artifact names
PREFIX = 'lemon_dataset'

# NOTE(review): this counts the top-level entries of RAW_DATA_FOLDER (which
# include the annotations/ folder), not individual image files -- confirm
# that this is the number wanted in the raw artifact's name.
TOTAL_IMAGES = len(os.listdir(RAW_DATA_FOLDER))

In [None]:
# Name of the raw-data artifact: <prefix>_raw_data_<entry count>
RAW_DATA_AT = f'{PREFIX}_raw_data_{TOTAL_IMAGES}'
RAW_DATA_AT

In [None]:
# Name of the artifact that will hold the split (processed) data
PROCESSED_DATA_AT = PREFIX + '_split_data'
PROCESSED_DATA_AT

## Register raw data as an artifact

In [None]:
# Upload the raw dataset in its own run. Using the run as a context manager
# finishes it on exit, including when one of the steps below raises, so a
# failed upload does not leave the run open.
with wandb.init(project=PROJECT_NAME, entity=ENTITY, job_type="upload") as run:
    # create an artifact for all the raw data
    raw_data_at = wandb.Artifact(RAW_DATA_AT, type="raw_data")

    # add the contents of the raw data folder to the artifact under 'images'
    # NOTE(review): RAW_DATA_FOLDER also contains the annotations/ folder (see
    # ANNOTATIONS_FILE), so it presumably ends up under 'images' too -- confirm
    # that this layout is intended.
    raw_data_at.add_dir(RAW_DATA_FOLDER, name='images')

    # add annotations file to the artifact
    raw_data_at.add_file(ANNOTATIONS_FILE, name='annotations/instances_default.json')

    # save artifact to W&B
    run.log_artifact(raw_data_at)

## Pre-process data for binary classification

In [None]:
# Start the run that produces the data split. It stays open across the
# following cells and is finished after the split artifact has been logged.
run = wandb.init(project=PROJECT_NAME, entity=ENTITY, job_type="data_split")

# find the most recent ("latest") version of the full raw data
raw_data_at = run.use_artifact(f'{RAW_DATA_AT}:latest')

# Download the dataset and load the annotations file
dataset_dir = raw_data_at.download()
annotations_path = os.path.join(dataset_dir, 'annotations/instances_default.json')

# Open via a context manager so the file handle is closed once parsing is
# done; JSON is read as UTF-8 rather than the platform's default encoding.
with open(annotations_path, encoding='utf-8') as annotations_fp:
    data = json.load(annotations_fp)

In [None]:
# Tabulate the 'annotations' and 'images' sections of the annotation file
annotations = pd.DataFrame(data['annotations'])
images = pd.DataFrame(data['images'])

In [None]:
# Preview the first rows of the annotations table
annotations.head()

In [None]:
# Preview the first rows of the images table
images.head()

In [None]:
# One row per image: the distinct category ids annotated on that image
category_ids_by_image = annotations.groupby('image_id')['category_id']
df = category_ids_by_image.apply(lambda ids: list(set(ids))).reset_index()
df.head()

In [None]:
# Category id used as the binary "mold" label.
# NOTE(review): presumably the id of the mold category in the annotation
# file's category list -- confirm against the dataset.
MOLD_CATEGORY_ID = 4

# True when the image carries at least one annotation of that category
df['mold'] = df['category_id'].map(lambda category_ids: MOLD_CATEGORY_ID in category_ids)
df['mold'].value_counts()

In [None]:
# Attach each image's file name; the inner join keeps only image ids present in both tables
image_files = images[['id', 'file_name']]
df = df.merge(image_files, how='inner', left_on='image_id', right_on='id')

In [None]:
# 'id' duplicates 'image_id' after the merge, so remove it in place
df.drop(columns='id', inplace=True)

In [None]:
def _fruit_id(file_name):
    """Return the fruit id encoded in an image path.

    Takes the second '/'-separated component of ``file_name`` and returns
    the text before its first underscore.
    """
    second_component = file_name.split('/')[1]
    return second_component.split('_')[0]


df['fruit_id'] = df['file_name'].map(_fruit_id)

In [None]:
# Preview the table after adding file_name and fruit_id
df.head()

In [None]:
# Placeholder fold id: -1 marks rows that have not been assigned to a fold yet
df['fold'] = -1

In [None]:
from sklearn.model_selection import StratifiedGroupKFold

# Positional sample ids, the label to stratify on, and the grouping key
X = df.index.values
y = df.mold.values
groups = df.fruit_id.values

# 10 shuffled folds, stratified on the mold label, with all rows sharing a
# fruit_id kept in the same fold; the fixed seed makes the split reproducible.
cv = StratifiedGroupKFold(n_splits=10, random_state=42, shuffle=True)

# Assign each row the index of the fold in which it is a test sample.
# The write goes through a single indexer on the frame: the chained form
# df['fold'].iloc[...] = i assigns into an intermediate Series, which pandas
# warns about and which leaves df unchanged when Copy-on-Write is enabled.
fold_col = df.columns.get_loc('fold')
for i, (_train_idxs, test_idxs) in enumerate(cv.split(X, y, groups)):
    df.iloc[test_idxs, fold_col] = i

In [None]:
# Fold 0 is held out as the test set and fold 1 as the validation set;
# every other fold is used for training.
_STAGE_BY_FOLD = {0: 'test', 1: 'valid'}
df['stage'] = df['fold'].apply(lambda fold: _STAGE_BY_FOLD.get(fold, 'train'))

In [None]:
# Write the split table to disk (without the index column) so it can be added to the artifact
df.to_csv('data_split.csv', index=False)
df.head()

In [None]:
# create an artifact for the split (processed) data
processed_data_at = wandb.Artifact(PROCESSED_DATA_AT, type="split_data")

# add data split file to the artifact
processed_data_at.add_file('data_split.csv')

# add the downloaded raw dataset directory (dataset_dir) to the artifact
processed_data_at.add_dir(dataset_dir)

# save artifact to W&B
run.log_artifact(processed_data_at)

# finalize the data_split run
run.finish()