# Data Modelling
This is a notebook to experiment with the data modelling of the sales quantity data.
This was done on a cloud instance so the file paths will be different if you are running this locally.
Note that the dataset is also proprietary, so it will not be included in this repository.

# 1. Imports and Constants
We will be using the following libraries:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Number of consecutive days of sales history used as the input sequence.
WINDOW = 20
# Number of examples per batch when the datasets are batched.
BATCH_SIZE = 1_000
# Buffer size handed to the tf.data shuffle calls.
BUFFER = 100_000

# Load data

The data is a csv extracted from an SQL database and cleaned. It contains the following columns:
- **date:** The date of the sale
- **item_code:** The code of the product sold
- **quantity:** The quantity of the product sold on that day

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# %pwd

In [None]:
# Absolute path of the CSV on the cloud instance.
# For a local run, point this at 'sales_quantity.csv' instead.
filepath = '/home/mariefloco/sales_quantity.csv'
# header=0 together with names= replaces the file's own header row
# with the explicit column names below.
data = pd.read_csv(
    filepath,
    header=0,
    names=['date', 'item_code', 'quantity'],
)
data.head()

# Transform data
We need to change the data so that it has the following format for training:
- **Input:** [*The tokenized item code, month, day, day of the week, day of the year, {A sequence of 20 days of sales data for a particular product}*]
- **Output:** The quantity sold for that product on the following day
>**Note:**
    - *The tokenized item code is the index of the item code in the tokenizer's word index.*
    - *The day component of the date is the day of the month.*
    - *The month component of the date is the month of the year.*
    - *The day of the week is a number between 0 and 6, where 0 is Monday and 6 is Sunday.*
    - *The day of the year is a number between 1 and 366, where January 1st is 1 and December 31st is 365 (366 in a leap year).*
    - *The sequence of 20 days of sales data is the window size we will use for training the model. The sales data will be normalized.*


In [None]:
# Parse the date column, then derive one calendar feature per new column
data['date'] = pd.to_datetime(data['date'])
date_parts = data['date'].dt
data['year'] = date_parts.year
data['month'] = date_parts.month
data['day'] = date_parts.day
data['day_of_week'] = date_parts.dayofweek  # 0 = Monday ... 6 = Sunday
data['day_of_year'] = date_parts.dayofyear



In [None]:
# Total the quantity per (item, date) and pivot so that every item code
# becomes its own column, with one row per date and its calendar features.
group_keys = ['item_code', 'date', 'year', 'month', 'day', 'day_of_week', 'day_of_year']
daily_totals = data.groupby(group_keys)['quantity'].sum()
# unstack(level=0) moves item_code out of the row index into the columns;
# date/item combinations without any sale come out as NaN and are set to 0.
item_sales = daily_totals.unstack(level=0).fillna(0).reset_index()
item_sales.head()

In [None]:
def create_tokenizer(item_code):
    """
    Build a Keras text tokenizer whose vocabulary is the given item codes.

    Args:
        item_code (list or Series): Item codes to fit the tokenizer on.

    Returns:
        tf.keras.preprocessing.text.Tokenizer: The tokenizer after fitting.
    """
    # filters='' disables character filtering; lower=False keeps the codes
    # case-sensitive.
    item_tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, filters='')
    item_tokenizer.fit_on_texts(item_code)
    return item_tokenizer

# The item codes are the column labels after the six date-related columns;
# spaces are removed from each code before fitting.
item_codes = item_sales.columns[6:].str.replace(' ', '')
tokenizer = create_tokenizer(item_codes)

# Vocabulary size = number of entries in the tokenizer's word index
tokenizer_word_count = len(tokenizer.word_index)

print(f'Tokenizer has {tokenizer_word_count} tokens')
tokenizer.word_index

In [None]:
# Extract date values from item_sales dataframe columns
dates = np.array(item_sales[['month', 'day', 'day_of_week', 'day_of_year']])

# Collapse every date component to a single value, sin(x) + cos(x).
# NOTE(review): x is the raw integer (not scaled by 2*pi / period) and the sine
# and cosine are summed into one number, so this is not a true cyclic encoding.
# Changing it would alter the feature values (and the model input width if the
# sine and cosine were kept separate), so it is left as-is here -- TODO confirm.
dates_cyclic = np.sin(dates) + np.cos(dates)

# Row j of a product's prefix array describes date row j + WINDOW, i.e. the
# rows from index WINDOW onwards. That slice is the same for every product,
# so it is taken once instead of being rebuilt row by row per product.
target_dates = dates_cyclic[WINDOW:]

# One (rows, 5) float64 array per product: [token, month, day, day_of_week, day_of_year]
prefix_features = [
    np.column_stack((
        # The product's tokenizer index, repeated on every row
        np.full(len(target_dates), tokenizer.word_index[product], dtype=np.float64),
        target_dates,
    ))
    for product in item_sales.columns[6:].str.replace(' ', '')
]

# Get the total number of prefix features arrays and the shape of the first array
prefix_features_count = len(prefix_features)
prefix_features_shape = prefix_features[0].shape

print(f"A total of {prefix_features_count} numpy arrays with each one having shape {prefix_features_shape}")

print(prefix_features[0])

In [None]:
# Per-item mean and standard deviation, computed over all rows of the
# item-code columns (everything after the six date-related columns).
# NOTE(review): these statistics include rows that later land in the
# validation and test partitions -- confirm that this is intended.
sales_columns = item_sales.columns[6:]
mean = item_sales[sales_columns].mean()
std = item_sales[sales_columns].std()

print(f"Mean: {mean}")
print(f"Standard deviation: {std}")

In [None]:
# Normalize the item sales data: z-score per item column.
# A column whose values never change has std == 0, and dividing by it would
# fill that column with NaN/inf and poison training. Such columns are divided
# by 1 instead, so they normalize to all zeros. `std` itself is left untouched.
safe_std = std.replace(0, 1)
normalized_sales = (item_sales[item_sales.columns[6:]] - mean) / safe_std
normalized_sales.head()

In [None]:
# Wrap every item's normalized sales column in its own tf.data.Dataset
sales_datasets = [
    tf.data.Dataset.from_tensor_slices(item_series)
    for _, item_series in normalized_sales.items()
]
sales_datasets[0].element_spec

In [None]:
def window_dataset(token_time_ds, sales_ds, window_size):
    """
    Build (features, target) examples from prefix rows and sliding sales windows.

    Args:
        token_time_ds: Iterable of per-window prefix rows (token and date values).
            It is materialized with list(); the caller in this file passes a
            NumPy array with one row per window.
        sales_ds (tf.data.Dataset): Dataset of scalar sales values, in order.
        window_size (int): Number of sales values kept as input per example.

    Returns:
        tf.data.Dataset: Pairs (x, y) where x is the prefix row followed by
        window_size sales values and y is the sales value right after them.
    """
    span = window_size + 1

    # Sliding windows of `span` values, one step apart; drop_remainder discards
    # the incomplete windows at the end. Each window is itself a dataset, so it
    # is batched into a single tensor of length `span`.
    windows = sales_ds.window(span, shift=1, drop_remainder=True)
    windows = windows.flat_map(lambda win: win.batch(span))

    # Materialize both sides and join them row by row along the last axis:
    # prefix values first, then the window of sales values.
    rows = tf.concat((list(token_time_ds), list(windows)), axis=-1)

    def split_features_target(row):
        # The last element is the value to predict; the rest is the input
        return row[:-1], row[-1]

    return tf.data.Dataset.from_tensor_slices(rows).map(
        split_features_target, num_parallel_calls=tf.data.AUTOTUNE
    )


In [None]:
# Build one windowed dataset per item from its prefix features and sales series
ds = [
    window_dataset(item_prefix, item_sales_ds, WINDOW)
    for item_prefix, item_sales_ds in zip(prefix_features, sales_datasets)
]
# Drop the intermediates that are not used after this point
del data, prefix_features, sales_datasets, item_sales

In [None]:
def get_dataset_partitions_tf(ds, ds_size, train_split=0.8, val_split=0.1, test_split=0.1, shuffle=False, shuffle_size=1000):
    """
    Splits a TensorFlow dataset into training, validation, and test partitions.

    The partitions are consecutive: the first elements form the training set,
    the next ones the validation set, and everything left over the test set.

    Args:
        ds (tf.data.Dataset): The input dataset.
        ds_size (int): The total size of the input dataset.
        train_split (float, optional): The fraction of data to allocate for training. Defaults to 0.8.
        val_split (float, optional): The fraction of data to allocate for validation. Defaults to 0.1.
        test_split (float, optional): The fraction of data to allocate for testing. Only used to
            check that the fractions sum to 1; the test partition is whatever remains after the
            training and validation partitions. Defaults to 0.1.
        shuffle (bool, optional): Whether to shuffle the training partition. Defaults to False.
        shuffle_size (int, optional): The buffer size used for shuffling. Ignored unless
            shuffle is True. Defaults to 1000.

    Returns:
        tuple: A tuple containing the training, validation, and test partitions of the dataset.

    Raises:
        ValueError: If train_split, val_split, and test_split do not sum to 1.
    """
    import math

    # Compare with a tolerance: sums of decimal fractions such as 0.7 + 0.2 + 0.1
    # are not always exactly 1.0 in floating point. Raising (instead of assert)
    # keeps the check active when Python runs with -O.
    if not math.isclose(train_split + val_split + test_split, 1.0):
        raise ValueError("The sum of train_split, val_split, and test_split must be 1.")

    # Sizes are truncated towards zero; the test partition absorbs the remainder
    train_size = int(train_split * ds_size)
    val_size = int(val_split * ds_size)

    train_ds = ds.take(train_size)
    if shuffle:
        # Specify seed to always have the same split distribution between runs
        train_ds = train_ds.shuffle(shuffle_size, seed=12)

    held_out = ds.skip(train_size)
    val_ds = held_out.take(val_size)
    test_ds = held_out.skip(val_size)

    return train_ds, val_ds, test_ds


In [None]:
# Split every per-item dataset into (train, val, test).
# The size is read from each dataset instead of being hard-coded, so the split
# stays correct when the number of dates in the CSV changes. This relies on the
# cardinality being statically known for each per-item dataset.
# shuffle_size only takes effect when shuffle=True is also passed; with the
# default shuffle=False the partitions keep their chronological order.
ds = [
    get_dataset_partitions_tf(items, int(items.cardinality().numpy()), shuffle_size=BUFFER)
    for items in ds
]

In [None]:
# Regroup the per-item (train, val, test) tuples into one list per partition
train_set = [train_part for train_part, _, _ in ds]
val_set = [val_part for _, val_part, _ in ds]
test_set = [test_part for _, _, test_part in ds]

In [None]:
# Chain the per-item datasets of each partition into one flat dataset:
# from_list yields the per-item datasets, flat_map with the identity function
# concatenates their elements in order.
train_set, val_set, test_set = (
    tf.data.experimental.from_list(per_item_parts).flat_map(lambda part: part)
    for per_item_parts in (train_set, val_set, test_set)
)

In [None]:
# Count the elements of each partition with a reduce that adds 1 per element
# (this iterates over every element of every partition).
train_len, val_len, test_len = (
    split.reduce(0, lambda count, _: count + 1).numpy()
    for split in (train_set, val_set, test_set)
)
# Hard-coded alternative kept from the original notebook (presumably counts
# from an earlier run -- verify before reusing):
# train_len = 4787104
# val_len = 598388
# test_len = 598388

print(f"Training set length: {train_len}")
print(f"Validation set length: {val_len}")
print(f"Test set length: {test_len}")


In [None]:
# Only the training partition is shuffled; all three are then batched and
# prefetched one batch ahead.
shuffled_train = train_set.shuffle(BUFFER)
train_set = shuffled_train.batch(BATCH_SIZE).prefetch(1)
val_set, test_set = (
    split.batch(BATCH_SIZE).prefetch(1) for split in (val_set, test_set)
)

In [None]:
# Write each batched dataset to disk under a relative path named after it
train_set.save('train_set')
val_set.save('val_set')
test_set.save('test_set')

# Modelling

In [None]:
# Build the Conv1D + stacked-LSTM regression model
def create_model(input_length=25):
    """
    Creates the (uncompiled) sequence model used to predict the next day's quantity.

    The network is a causal 1D convolution, two LSTM layers, a dense ReLU layer
    and a single linear output unit.

    Args:
        input_length (int, optional): Number of time steps per example. Defaults to 25,
            which matches the 5 prefix values (item token and four date values) plus a
            20-value sales window built earlier in this notebook.

    Returns:
        tf.keras.Sequential: The model; the caller compiles and fits it.
    """
    model = tf.keras.Sequential([
        # Causal padding keeps the sequence length and only looks at earlier steps
        tf.keras.layers.Conv1D(filters=64, kernel_size=3,
                               strides=1,
                               activation="relu", padding="causal",
                               input_shape=[input_length, 1]),
        # The first LSTM returns the full sequence so the second LSTM can consume it
        tf.keras.layers.LSTM(64, return_sequences=True),
        tf.keras.layers.LSTM(64),
        tf.keras.layers.Dense(64, activation='relu'),
        # Single unit without activation: the predicted quantity
        tf.keras.layers.Dense(1)
    ])

    return model


In [None]:
model = create_model()

# Work on roughly a tenth of the examples: undo the earlier batching, shuffle,
# keep the first len // 10 examples, then batch and prefetch again.
# NOTE(review): shuffle(BUFFER) runs before take(), so the sample can only be
# drawn from the leading part of the stream -- confirm that this is acceptable.
train_examples = train_set.unbatch().shuffle(BUFFER)
train_sample = train_examples.take(train_len // 10).batch(BATCH_SIZE).prefetch(1)

val_examples = val_set.unbatch().shuffle(BUFFER)
val_sample = val_examples.take(val_len // 10).batch(BATCH_SIZE).prefetch(1)

train_sample

In [None]:
# Learning-rate sweep: the rate starts at 1e-8 and is multiplied by 10 every
# 20 epochs, so the 100 epochs below cover several orders of magnitude.
def lr_for_epoch(epoch):
    return 1e-8 * 10**(epoch / 20)


lr_schedule = tf.keras.callbacks.LearningRateScheduler(lr_for_epoch)

# The optimizer starts at the same rate as the schedule's first epoch
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-8)

model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.Huber(),
    metrics=["mae"],
)

history = model.fit(
    train_sample,
    validation_data=val_sample,
    epochs=100,
    callbacks=[lr_schedule],
    verbose=2,
)
