# Kaggle Dataset Upload Lab

In [1]:
import pandas as pd
import os
from genailab.assets.idgen import DatasetIDGen
from genailab.container import GenAILabContainer
from genailab.core.flow import StageDef, PhaseDef
from genailab.infra.utils.file.io import IOService
from genailab.infra.config.app import AppConfigReader
from genailab.infra.persistence.cloud.kaggle import KaggleService

# Show up to 999 rows when a DataFrame is rendered in the notebook.
pd.set_option("display.max_rows", 999)

In [2]:
# Read the current environment name and the Kaggle account name from the
# application configuration. These names are reused by the cells below.
config_reader = AppConfigReader()
env = config_reader.get_environment()
username = config_reader.get_env_var("KAGGLE_USERNAME")
# Local staging path for the exported CSV; the environment name is one of the
# path components, so each environment gets its own staging directory.
filepath = os.path.join("data", "stage", env, "reviews.csv")
# Title passed to the Kaggle upload, tagged with the upper-cased environment.
title = f"AppVoCAI Reviews Dataset ({env.upper()})"

## Load Dataset

In [3]:
# Modules the application container is wired into.
wired_modules = [
    "genailab.flow.stage.base",
    "genailab.app.base",
]

# Create the application container, initialise its resources, then wire it
# into the modules listed above.
container = GenAILabContainer()
container.init_resources()
container.wire(modules=wired_modules)

In [4]:
# Build the asset id of the preprocessed review dataset.
# DatasetIDGen is the ID generator this notebook imports from
# genailab.assets.idgen; the previously referenced AssetIDGen is not imported
# anywhere in the notebook, so calling it raises NameError.
# NOTE(review): confirm that DatasetIDGen.get_asset_id accepts `asset_type`.
idg = DatasetIDGen()
asset_id = idg.get_asset_id(
    asset_type="dataset",
    phase=PhaseDef.DATAPREP,
    stage=StageDef.PREPROCESS,
    name="review",
)

# Instantiate the repository from the wired container.
repo = container.persist.repo()
# Load the dataset from the repository.
# NOTE(review): distributed=False presumably selects a non-distributed
# (pandas) payload; `df` is indexed with a column list further down — confirm.
dataset = repo.get(asset_id, distributed=False)
df = dataset.content

In [None]:
# Display the asset id of the loaded dataset; the upload cell passes this same
# value as the Kaggle `dataset_name`.
dataset.asset_id

## Stage Dataset

In [6]:
# Keep only the "id" and "content" columns and write them to the staging file.
staged_columns = ["id", "content"]
reviews = df[staged_columns]
IOService.write(filepath=filepath, data=reviews)

## Instantiate Kaggle Service and Upload

In [None]:
# Create the Kaggle service for the configured account.
kags = KaggleService(username=username)

# Upload the staged CSV as a private dataset named after the asset id.
upload_args = {
    "filepath": filepath,
    "title": title,
    "dataset_name": dataset.asset_id,
    "private": True,
}
kags.upload(**upload_args)

## Download Dataset from Kaggle

In [4]:
# Optional one-time setup: install the Kaggle CLI for the current user.
# !pip install --user kaggle
# Download a dataset with the Kaggle CLI (IPython shell escape).
# NOTE(review): the owner/slug below is hard-coded and does not use the
# `username` or `dataset.asset_id` passed to the upload cell; the recorded run
# returned "403 - Forbidden". Confirm the intended dataset slug and that the
# configured Kaggle credentials are allowed to read it.
!kaggle datasets download -d johnjames/sentiment-analysis

403 - Forbidden - Permission 'datasets.get' was denied
