In [1]:
import os, warnings
import wandb
import numpy as py
import pandas as pd
from fastai.vision.all import *
from sklearn.model_selection import StratifiedGroupKFold

import params
warnings.filterwarnings('ignore')

In this notebook, we split the data into train, validation, and test sets, using the raw-data artifact logged in the previous step.

In [2]:
# Start a W&B run; job_type labels it as the data-splitting step.
run = wandb.init(
    project=params.WANDB_PROJECT,
    entity=params.ENTITY,
    job_type="data_split",
)

[34m[1mwandb[0m: Currently logged in as: [33myihuanghz95[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# Declare the latest version of the raw-data artifact as an input of this run,
# then download its files and keep the local directory as a Path.
raw_data_at = run.use_artifact(f"{params.RAW_DATA_AT}:latest")
download_dir = raw_data_at.download()
path = Path(download_dir)

[34m[1mwandb[0m: Downloading large artifact bdd_simple_1k:latest, 846.57MB. 4007 files... 
[34m[1mwandb[0m:   4007 of 4007 files downloaded.  
Done. 0:0:13.3


File names, groups (the part of each file name before the first `-`), and the stratification target (`bicycle`)

In [6]:
# Every entry in the artifact's images/ folder, in whatever order the OS returns them.
fnames = os.listdir(path.joinpath('images'))

# Group key for each file: the part of its name before the first '-'
# (the whole name when it contains no '-').
groups = []
for fname in fnames:
    groups.append(fname.partition('-')[0])

In [7]:
# Fetch the object stored under the name "eda_table" in the raw-data artifact.
orig_eda_table = raw_data_at.get("eda_table")

[34m[1mwandb[0m: Downloading large artifact bdd_simple_1k:latest, 846.57MB. 4007 files... 
[34m[1mwandb[0m:   4007 of 4007 files downloaded.  
Done. 0:0:11.4


In [8]:
# Build the stratification target so that y[i] is the label of fnames[i].
#
# `get_column` returns values in table-row order, whereas `fnames` comes from
# `os.listdir`, whose order is arbitrary. Using the raw column directly would
# pair files with other files' labels whenever the two orders differ, so each
# label is looked up by the table's `File_Name` column instead (the same key
# the tables are joined on further down).
label_by_file = dict(zip(orig_eda_table.get_column('File_Name'),
                         orig_eda_table.get_column('bicycle')))

# Fail loudly rather than stratify on a partial or wrong target.
missing = [fname for fname in fnames if fname not in label_by_file]
if missing:
    raise ValueError(
        f"{len(missing)} image file(s) have no row in eda_table, e.g. {missing[:5]}"
    )

y = [label_by_file[fname] for fname in fnames]

In [9]:
# One row per image file; -1 marks "not yet assigned to a fold".
df = pd.DataFrame({'File_Name': fnames})
df['fold'] = -1

# 10 folds, stratified on `y`, with each group kept inside a single fold.
# Every sample lands in exactly one held-out set, which becomes its fold id.
cv = StratifiedGroupKFold(n_splits=10)
for fold_id, (rest_idxs, held_out_idxs) in enumerate(cv.split(fnames, y, groups)):
    df.loc[held_out_idxs, 'fold'] = fold_id

In [10]:
# Fold 0 becomes the test set, fold 1 the validation set, everything else train.
stage_by_fold = {0: 'test', 1: 'valid'}
df['Stage'] = df['fold'].map(stage_by_fold).fillna('train')

# The fold id is only an intermediate; drop it so the frame holds File_Name + Stage.
df.drop(columns=['fold'], inplace=True)

# Show how many files ended up in each stage.
df['Stage'].value_counts()

train    800
valid    100
test     100
Name: Stage, dtype: int64

Add data to artifact

In [11]:
# Create a new artifact of type "split_data" to hold the outputs of this notebook.
processed_data_at = wandb.Artifact(params.PROCESSED_DATA_AT, type="split_data")

In [12]:
# Write the split assignment to a CSV in the working directory, without the index column.
df.to_csv('data_split.csv', index=False)

In [13]:
# Attach the split CSV, plus everything under the downloaded raw-data directory,
# to the new artifact.
processed_data_at.add_file('data_split.csv')
processed_data_at.add_dir(path)

[34m[1mwandb[0m: Adding directory to artifact (./artifacts/bdd_simple_1k:latest)... Done. 3.5s


Join the split assignment back onto the EDA table

In [14]:
# Wrap the file-name -> stage assignment in a W&B table ...
split_columns = ['File_Name', 'Stage']
data_split_table = wandb.Table(dataframe=df.loc[:, split_columns])

# ... and join it to the original EDA table on the shared "File_Name" key.
join_table = wandb.JoinedTable(orig_eda_table, data_split_table, "File_Name")

In [15]:
# Store the joined table in the artifact under the name "eda_table_data_split".
processed_data_at.add(join_table, "eda_table_data_split")

<wandb.sdk.artifacts.artifact_manifest_entry.ArtifactManifestEntry at 0x7fe1f96624f0>

In [16]:
# Log the finished artifact to the run, then close the run.
run.log_artifact(processed_data_at)
run.finish()