# Dataset Staging Lab
Stages datasets for preprocessing to avoid repeatedly sampling from the 22M-row raw dataset. It also draws the test environment's data from the development set population, so that all test observations have model inference data, such as perplexities and sentiments.

In [1]:
import os
import numpy as np
import pandas as pd
from genailab.infra.utils.file.io import IOService
from tqdm import tqdm


# Widen the pandas display limits so frames are rendered without truncation.
for option_name, option_value in (
    ("display.max_columns", 999),
    ("display.max_colwidth", None),
    ("display.max_rows", 999),
):
    pd.set_option(option_name, option_value)

## Setup Configuration
Configurations for each file include the source and destination filepaths, the fraction of the source to sample, the `random_state` for reproducibility, and a `force` flag that re-creates the destination even if it already exists.

In [2]:
# One entry per staged dataset. Each entry names the source to sample from, the
# destination to write to, the sampling fraction, the sampling seed, and whether
# an existing destination should be overwritten.
configs = [
    # Production: the whole raw source (frac=1).
    dict(
        source="data/raw/reviews",
        dest="data/stage/prod/reviews",
        frac=1,
        random_state=65,
        force=False,
    ),
    # Development: a 1% sample of the raw source.
    dict(
        source="data/raw/reviews",
        dest="data/stage/dev/reviews",
        frac=0.01,
        random_state=51,
        force=False,
    ),
    # Test: a 10% sample of the staged development set; always regenerated.
    dict(
        source="data/stage/dev/reviews",
        dest="data/stage/test/reviews",
        frac=0.1,
        random_state=51,
        force=True,
    ),
]

In [3]:
class FilterTask:
    """Keep rows dated after a cutoff year, then draw a seeded random sample.

    Args:
        column: Name of the datetime column the year is read from.
        frac: Fraction of the filtered rows to return.
        date: Cutoff year; only rows whose year is strictly greater are kept.
        random_state: Seed handed to the sampler so results are repeatable.
    """

    def __init__(self, column: str, frac: float, date: int, random_state: int):
        self._column, self._date = column, date
        self._frac, self._random_state = frac, random_state

    def run(self, df: pd.DataFrame) -> pd.DataFrame:
        """Filter ``df`` by year and return a random fraction of what remains.

        Args:
            df: Frame containing the configured datetime column.

        Returns:
            A sample of the rows whose year exceeds the cutoff.
        """
        after_cutoff = df[self._column].dt.year > self._date
        eligible = df.loc[after_cutoff]
        return eligible.sample(frac=self._frac, random_state=self._random_state)

## Stage Files
Iterating through the configs, this cell stages the production, development, and test files using the same seeds as the pipelines, so that the staged data match the data in the workspace.

In [4]:
# Rows are filtered on this datetime column; only rows whose year is after
# `date` are kept (see FilterTask.run).
column = "date"
date = 2020
# Cache of the most recently loaded source, so consecutive configs that share
# a source do not re-read it.
filepath = None
df = None
for config in tqdm(configs):
    # Leave an existing destination alone unless the config forces a rebuild.
    if os.path.exists(config["dest"]) and not config["force"]:
        print(f"File {config['dest']} already exists. Skipping...")
        continue

    if filepath != config["source"]:
        print(f"Reading dataset from {config['source']}.")
        df = IOService.read(filepath=config["source"])
        filepath = config["source"]

    # Named `task` rather than `filter` so the builtin is not shadowed.
    task = FilterTask(
        column=column,
        frac=config["frac"],
        date=date,
        random_state=config["random_state"],
    )
    data = task.run(df=df)
    # Cast the same column that was filtered on (previously hard-coded as
    # 'date') to millisecond precision before persisting.
    # NOTE(review): presumably required by the storage format — confirm.
    data[column] = data[column].astype("datetime64[ms]")
    IOService.write(filepath=config["dest"], data=data)
    print(
        f"Created dataset of {data.shape[0]} rows and persisted to {config['dest']}"
    )

    # If the cached source was just overwritten, drop the cache so a later
    # config reading that path gets the new contents.
    if config["dest"] == filepath:
        filepath = None
        df = None

  0%|          | 0/3 [00:00<?, ?it/s]

File data/stage/prod/reviews already exists. Skipping...
File data/stage/dev/reviews already exists. Skipping...
Reading dataset from data/stage/dev/reviews.


100%|██████████| 3/3 [00:00<00:00,  6.95it/s]

Created dataset of 5904 rows and persisted to data/stage/test/reviews





## Validate Results
We compare the IDs in the staged development set with those in the development set in the workspace.

In [5]:
# Compare the staged dev set with the dev set in the workspace: both must
# contain exactly the same ids.
fp1 = "data/stage/dev/reviews"
fp2 = "workspace/dev/dataset/01_dataprep/appvocai_discover-01_dataprep-01_preprocess-review-dataset.parquet"

# Fail fast, before loading anything, if either dataset is missing.
for fp in (fp1, fp2):
    if not os.path.exists(fp):
        raise FileNotFoundError(
            f"Cannot validate the dev set: no dataset found at '{fp}'."
        )

df1 = IOService.read(fp1)
df2 = IOService.read(fp2)
# Sort so the comparison is independent of row order.
id1 = df1["id"].sort_values().values
id2 = df2["id"].sort_values().values
assert len(id1) == len(id2), (
    f"Row counts differ: staged dev set has {len(id1)} rows, "
    f"workspace dev set has {len(id2)} rows."
)
assert np.array_equal(id1, id2), (
    "Staged and workspace dev sets have the same size but different ids."
)

FileNotFoundError: [Errno 2] No such file or directory: 'workspace/dev/dataset/01_dataprep/appvocai_discover-01_dataprep-01_preprocess-review-dataset.parquet'

In [None]:
# Last five rows of the staged dev set when ordered by id.
df1.sort_values("id").tail(n=5)

In [None]:
# Last five rows of the workspace dev set when ordered by id.
df2.sort_values("id").tail(n=5)