# Kaggle Dataset Upload Lab

In [1]:
import pandas as pd
import os
from genailabslm.assets.idgen import DatasetIDGen
from genailabslm.container import GenAILabSLMContainer
from genailabslm.core.flow import StageDef, PhaseDef
from genailabslm.infra.utils.file.io import IOService
from genailabslm.infra.config.app import AppConfigReader
from genailabslm.infra.persistence.cloud.kaggle import KaggleService

# Allow up to 999 rows to be shown when a DataFrame is displayed.
pd.set_option("display.max_rows", 999)

In [2]:
# Read the active environment and the Kaggle account name from app config.
config_reader = AppConfigReader()
env = config_reader.get_environment()
username = config_reader.get_env_var("KAGGLE_USERNAME")

# Staging location of the CSV: data/stage/<env>/reviews.csv
stage_path_parts = ("data", "stage", env, "reviews.csv")
filepath = os.path.join(*stage_path_parts)

# Title for the upload, tagged with the upper-cased environment name.
title = f"AppVoCAI Reviews Dataset ({env.upper()})"

## Load Dataset

In [3]:
# Modules handed to container.wire() below.
wired_modules = [
    "genailabslm.flow.stage.base",
    "genailabslm.app.base",
]

# Build the application container, initialise its resources, then wire it.
container = GenAILabSLMContainer()
container.init_resources()
container.wire(modules=wired_modules)

In [4]:
# Build the asset ID of the preprocessed review dataset.
# Uses DatasetIDGen, the ID generator imported at the top of this notebook,
# so the cell runs on a fresh kernel (no other ID generator is imported).
# NOTE(review): assumes DatasetIDGen.get_asset_id accepts these keyword
# arguments - confirm against genailabslm.assets.idgen.
idg = DatasetIDGen()
asset_id = idg.get_asset_id(
    asset_type="dataset",
    phase=PhaseDef.DATAPREP,
    stage=StageDef.PREPROCESS,
    name="review",
)

# Obtain the repository from the container.
repo = container.persist.repo()
# Fetch the dataset by asset ID (requested with distributed=False) and
# keep its content for the staging step.
dataset = repo.get(asset_id, distributed=False)
df = dataset.content

In [5]:
# Display the loaded dataset's asset ID; it is reused as the Kaggle dataset name.
dataset.asset_id

'dataset-dev-dataprep-preprocess-review'

## Stage Dataset

In [6]:
# Only these two columns are staged for upload.
stage_columns = ["id", "content"]
reviews = df[stage_columns]

# Write the selected columns to the staging file path.
IOService.write(filepath=filepath, data=reviews)

## Instantiate Kaggle Service and Upload

In [7]:
# Create the Kaggle service for the configured account.
kags = KaggleService(username=username)

# Upload the staged file as a private dataset named after the asset ID.
upload_kwargs = {
    "filepath": filepath,
    "title": title,
    "dataset_name": dataset.asset_id,
    "private": True,
}
kags.upload(**upload_kwargs)

Compressed data/stage/dev/reviews.csv into data/stage/dev/reviews.zip
Starting upload for file reviews.zip


100%|██████████| 6.19M/6.19M [00:05<00:00, 1.30MB/s]


Upload successful: reviews.zip (6MB)
Your private Dataset is being created. Please check progress at https://www.kaggle.com/datasets/johnjames/dataset-dev-dataprep-preprocess-review
