# Kaggle Dataset Upload Lab

In [1]:
import pandas as pd
import os
from discover.assets.idgen import AssetIDGen
from discover.container import DiscoverContainer
from discover.core.flow import DataPrepStageDef, PhaseDef
from discover.infra.utils.file.io import IOService
from discover.infra.config.app import AppConfigReader
from discover.infra.utils.file.compress import ZipFileHandler

# Let pandas display up to 999 rows before it starts truncating output.
pd.set_option("display.max_rows", 999)

In [2]:
# Read the active environment name from the application configuration.
config_reader = AppConfigReader()
env = config_reader.get_environment()

# All files handled by this notebook share the data/stage/<env> prefix.
stage_parts = ("data", "stage", env)
filepath = os.path.join(*stage_parts, "reviews.csv")
zipfilepath = os.path.join(*stage_parts, "reviews.zip")
jsonfilepath = os.path.join(*stage_parts, "dataset-metadata.json")

# Folder that contains the archive; the Kaggle CLI cells pass it with -p.
dataset_dir = os.path.dirname(zipfilepath)

## Load Dataset

In [3]:
# Create the application container and initialise its resources.
container = DiscoverContainer()
container.init_resources()

# Modules the container is wired into.
wired_modules = [
    "discover.flow.data_prep.stage",
    "discover.app.base",
]
container.wire(modules=wired_modules)

In [None]:
# Build the asset id of the "review" dataset (phase DATAPREP, stage TQA).
idg = AssetIDGen()
asset_key = {
    "asset_type": "dataset",
    "phase": PhaseDef.DATAPREP,
    "stage": DataPrepStageDef.TQA,
    "name": "review",
}
asset_id = idg.get_asset_id(**asset_key)

# Obtain the dataset repository from the container, fetch the dataset by id
# (distributed=False) and expose its content as `df` for the cells below.
repo = container.repo.dataset_repo()
dataset = repo.get(asset_id, distributed=False)
df = dataset.content

## Stage Dataset

In [5]:
# Keep only the "id" and "content" columns and write them to the staging CSV.
reviews = df[["id", "content"]]
IOService.write(filepath=filepath, data=reviews)

# Compress the CSV into the archive that the upload cells send to Kaggle.
zipper = ZipFileHandler()
zipper.compress_file(file_path=filepath, zip_path=zipfilepath)

# Fail loudly when no archive is present instead of ending the cell silently:
# the upload cells depend on it, and the CSV is kept so the failure can be
# investigated.
if not os.path.exists(zipfilepath):
    raise FileNotFoundError(
        f"Compression did not produce {zipfilepath}; {filepath} was left in place."
    )

# The archive exists, so the uncompressed CSV is no longer needed.
os.remove(filepath)
print(f"Dataset is staged at {zipfilepath}")

Compressed data/stage/dev/reviews.csv into data/stage/dev/reviews.zip
Dataset is staged at data/stage/dev/reviews.zip


## Prepare Dataset for Upload

In [None]:
# Kaggle dataset id: the account name followed by the loaded dataset's name.
kaggle_dataset_id = f"johnjamesai/{dataset.name}"

# Metadata describing the dataset for the Kaggle upload.
dataset_metadata = dict(
    title="AppVoCAI Reviews Dataset",
    id=kaggle_dataset_id,
    licenses=[dict(name="CC0-1.0")],
    isPrivate=True,
)

# Write the metadata to dataset-metadata.json in the staging folder.
IOService.write(filepath=jsonfilepath, data=dataset_metadata)

## Upload Dataset

In [None]:
# Initial creation of the dataset on Kaggle from the staging folder.
# NOTE(review): left commented out, presumably so that re-running the notebook
# does not try to create the dataset a second time — confirm before enabling.
#!kaggle datasets create -p {dataset_dir}

Starting upload for file reviews.zip
100%|██████████████████████████████████████| 6.19M/6.19M [00:04<00:00, 1.30MB/s]
Upload successful: reviews.zip (6MB)
Your private Dataset is being created. Please check progress at https://www.kaggle.com/datasets/johnjamesai/review


In [11]:
!kaggle datasets version -p{dataset_dir} --m "Updating dataset visibility to private"

Starting upload for file reviews.zip
100%|██████████████████████████████████████| 6.19M/6.19M [00:05<00:00, 1.30MB/s]
Upload successful: reviews.zip (6MB)
Dataset version is being created. Please check progress at https://www.kaggle.com/johnjamesai/review
