# 00_Data_Prep
Prepare Orange Juice sales data to be passed into Azure's Many Models solution.

---

Import libraries

In [None]:
import os
import shutil
import numpy as np
import pandas as pd
from azureml.core.dataset import Dataset
from azureml.core.workspace import Workspace
from azureml.opendatasets import OjSalesSimulated

### 1.0 Download data locally
Choose how many store-brand time series to download, then save them locally.

In [None]:
# Number of store-brand files to fetch (0 or 11973 fetches every file)
dataset_maxfiles = 10

# Start from a reference to the complete simulated OJ sales file dataset
oj_sales_files = OjSalesSimulated.get_file_dataset()

# A non-zero limit keeps only the first `dataset_maxfiles` files; 0 keeps everything
oj_sales_files = oj_sales_files.take(dataset_maxfiles) if dataset_maxfiles else oj_sales_files

# Make sure the local destination folder exists
target_path = 'oj_sales_data'
os.makedirs(target_path, exist_ok=True)

# Fetch the selected files into the local folder
oj_sales_files.download(target_path, overwrite=True)

### 2.0 Data Prep
To create a realistic example, the "Revenue" and "Price" columns are removed, as leaving them in would introduce data leakage. The "Advert" column is also removed to create a purely autoregressive example. In a real solution, other available features that were found to be statistically significant and had known future values could be added to create a more accurate forecast.

In [None]:
# Create the folder that will hold the single combined CSV to be uploaded
train_data_path = os.path.join(target_path, "upload_data")
os.makedirs(train_data_path, exist_ok=True)

# Collect every downloaded file under `target_path`.
# Sub-directories are pruned in place (supported by os.walk when walking top-down) so the
# walk never descends into the upload folder (which would re-read a previously generated
# oj_sales.csv) or into notebook checkpoint folders (which would duplicate rows).
# Folders are compared by full path rather than by substring membership.
upload_dir = os.path.normpath(train_data_path)
files_list = []
for walk_dir, sub_dirs, file_names in os.walk(target_path):
    sub_dirs[:] = [
        d for d in sub_dirs
        if d != ".ipynb_checkpoints"
        and os.path.normpath(os.path.join(walk_dir, d)) != upload_dir
    ]
    files_list.extend(os.path.join(walk_dir, f) for f in file_names)

# os.walk order depends on the filesystem; sort so the combined CSV is reproducible
files_list.sort()

# Read each store-brand file and drop the columns that would leak the target.
# NOTE(review): the accompanying text says "Advert" is removed as well, but only
# "Revenue" and "Price" are dropped here -- confirm which one is intended.
frames = []
for file in files_list:
    # Match on the file name only, so directory names cannot trigger a false match
    if 'Store' in os.path.basename(file):
        df = pd.read_csv(file)
        df = df.drop(columns=['Revenue', 'Price'])
        frames.append(df)

# Concatenate once at the end instead of re-copying the growing frame on every iteration;
# fall back to an empty frame when no matching file was found
data = pd.concat(frames) if frames else pd.DataFrame()

data.to_csv(os.path.join(train_data_path, 'oj_sales.csv'), index=False)

### 3.0 Upload to Blob
Upload the dataframe to Blob storage where Azure ML can then reference it

In [None]:
# Connect to the workspace
ws = Workspace.from_config()

# Use the workspace's default datastore as the upload destination
datastore = ws.get_default_datastore()

# Delete Jupyter checkpoint folders so they are not uploaded with the data.
# Walking bottom-up means nested folders are handled before their parents.
for parent, subdirs, _ in os.walk(target_path, topdown=False):
    if ".ipynb_checkpoints" not in subdirs:
        continue
    folder_path = os.path.join(parent, ".ipynb_checkpoints")
    shutil.rmtree(folder_path)
    print(f"Folder '{folder_path}' removed.")

# Upload the prepared folder to the datastore
ds_train_path = f"{target_path}/data"
datastore.upload(src_dir=train_data_path, target_path=ds_train_path, overwrite=True)

### 4.0 Register as dataset in AzureML
Register the uploaded data as a dataset in AzureML to make it easily accessible from Azure ML services

In [None]:
# Create a tabular dataset that points at the uploaded CSV.
# The datastore-relative path is built with an explicit "/" (as `ds_train_path` already is)
# instead of os.path.join, which would insert backslashes when run on Windows.
ds = Dataset.Tabular.from_delimited_files([(datastore, f"{ds_train_path}/oj_sales.csv")])

# Register the dataset: the "_small" name is used when only a subset of the 11973 files
# was downloaded; the full set (limit of 0 or 11973) is registered as "oj_data"
dataset_name = 'oj_data_small' if 0 < dataset_maxfiles < 11973 else 'oj_data'
dataset_name = dataset_name + '_train'
ds.register(ws, dataset_name, create_new_version=True)

## Next Steps

Now that the data is available in AzureML, it's time to create the training and inference pipelines. Follow the steps in [02_Pipeline.ipynb](02_Pipeline.ipynb) for that.