In [0]:
# imports
from datetime import datetime

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [0]:
# Read the raw CSV files from the project volume into Spark DataFrames.
VOLUME_ROOT_PATH = "/Volumes/cscie103_catalog/final_project/data"
VOLUME_TARGET_DIR = f"{VOLUME_ROOT_PATH}/raw"

# Dataset key -> CSV file name inside VOLUME_TARGET_DIR (each file is "<key>.csv").
filenames = {
    dataset: f"{dataset}.csv"
    for dataset in (
        'holidays_events',
        'oil',
        'sample_submission',
        'stores',
        'test',
        'train',
        'transactions',
    )
}

# One read per dataset: the header row supplies the column names and Spark
# infers the column types. 'sample_submission' is listed in `filenames` but
# is not loaded here.
(
    holidays_events_df,
    oil_df,
    stores_df,
    transactions_df,
    train_df,
    test_df,
) = (
    spark.read.csv(f"{VOLUME_TARGET_DIR}/{filenames.get(dataset)}", header=True, inferSchema=True)
    for dataset in ('holidays_events', 'oil', 'stores', 'transactions', 'train', 'test')
)

In [0]:
# eda: train
def analyze_train(train_df):
    """
    Print a quick exploratory overview of the training data.

    Displays a 5-row preview, Spark's summary statistics and the column
    dtypes, then prints the distinct 'store_nbr' and 'family' values (with
    their counts) and the minimum / maximum of the 'date' column.

    Wrapped into a function for convenience, also to not litter the notebook
    environment with variables.

    Args:
        train_df: Spark DataFrame containing at least the columns
            'store_nbr', 'family' and 'date'.

    Returns:
        None. All output goes to display() and stdout.
    """
    display(train_df.limit(5))
    display(train_df.summary())
    display(train_df.dtypes)

    # store_nbr: collect the distinct values once and reuse the list for the
    # count instead of running the distinct job twice.
    store_nbr_all = [row[0] for row in train_df.select('store_nbr').distinct().collect()]
    print(f"Number of unique 'store_nbr': {len(store_nbr_all)}")
    print(f"Unique values in 'store_nbr' column: {store_nbr_all}")

    # family: same single-collect approach
    family_all = [row[0] for row in train_df.select('family').distinct().collect()]
    print(f"Number of unique values in 'family' column: {len(family_all)}")
    print(f"Unique values in 'family' column: {family_all}")

    # date: let Spark compute min/max instead of collecting every distinct
    # date to the driver. Spark's min/max skip nulls and return null for an
    # empty frame, whereas Python's min()/max() would raise in both cases.
    date_bounds = train_df.selectExpr(
        "min(`date`) AS min_date",
        "max(`date`) AS max_date",
    ).first()
    print(f"Range of 'date' column: {date_bounds[0]}, {date_bounds[1]}")

analyze_train(train_df)


In [0]:
# eda: stores
def analyze_stores(stores_df):
    """
    Show a quick exploratory overview of the stores table.

    Displays a 5-row preview, the summary statistics and the column dtypes,
    then prints the distinct values of 'city', 'state', 'type' and 'cluster'.
    Kept inside a function so no helper variables leak into the notebook.
    """
    preview = stores_df.limit(5)
    display(preview)
    stats = stores_df.summary()
    display(stats)
    column_types = stores_df.dtypes
    display(column_types)

    # Same report for each categorical column, in the original order.
    for column in ('city', 'state', 'type', 'cluster'):
        distinct_rows = stores_df.select(column).distinct().collect()
        values = [row[0] for row in distinct_rows]
        print(f"Unique values in '{column}' column: {values}")

analyze_stores(stores_df)

In [0]:
# eda: transactions
def analyze_transactions(transactions_df):
    """
    Show a quick exploratory overview of the transactions table: a 5-row
    preview, the summary statistics and the column dtypes.

    Kept inside a function so no helper variables leak into the notebook.
    """
    preview = transactions_df.limit(5)
    display(preview)

    stats = transactions_df.summary()
    display(stats)

    column_types = transactions_df.dtypes
    display(column_types)
analyze_transactions(transactions_df)

def prepare_transactions(transactions_df):
    """
    Convert the transactions table to a date-indexed pandas DataFrame.

    Transactions has 3 cols: date, store_nbr, transactions.

    The input is converted with toPandas(), 'date' is parsed to datetime and
    becomes the (sorted) index, and every row containing a missing value is
    dropped. A message is printed when any rows were dropped.

    Returns:
        pandas DataFrame indexed by 'date', sorted by that index, without
        rows that contain missing values.
    """
    pdf = transactions_df.toPandas()
    pdf['date'] = pd.to_datetime(pdf['date'])
    by_date = pdf.set_index('date').sort_index()

    # Drop incomplete rows and report how many were lost, if any.
    complete = by_date.dropna()
    rows_before, rows_after = by_date.shape[0], complete.shape[0]
    if rows_before != rows_after:
        print(f"REMOVED {rows_before - rows_after} ROWS WITH MISSING VALUES")

    return complete
prepped_transactions_df = prepare_transactions(transactions_df)

def plot_daily_transactions(prepped_transactions_df, stores_per_plot=6):
    """
    Plot daily transactions per store, several stores per figure.

    Stores are sorted by 'store_nbr' and split into consecutive groups of
    `stores_per_plot`; each group gets its own figure with one line per
    store. With the default of 6, 54 unique stores produce 9 figures.

    Args:
        prepped_transactions_df: pandas DataFrame whose index holds the dates
            and which has the columns 'store_nbr' and 'transactions'.
        stores_per_plot: number of stores drawn on each figure (default 6).

    Returns:
        None. Each figure is passed to display() and then closed.
    """
    # .tolist() yields plain Python scalars, so the title below reads
    # "[1, 2, ...]" instead of NumPy-2 style reprs like "[np.int64(1), ...]".
    store_nbrs = sorted(prepped_transactions_df['store_nbr'].unique().tolist())

    for start in range(0, len(store_nbrs), stores_per_plot):
        selected_stores = store_nbrs[start:start + stores_per_plot]

        fig, ax = plt.subplots(figsize=(14, 6))
        for store in selected_stores:
            store_df = prepped_transactions_df[prepped_transactions_df['store_nbr'] == store]
            ax.plot(store_df.index, store_df['transactions'], label=f'Store {store}', alpha=0.7)
        ax.set_title(f'Daily Transactions: Stores {selected_stores}')
        ax.set_xlabel('Date')
        ax.set_ylabel('Transactions')
        ax.legend()
        fig.tight_layout()
        display(fig)
        # Close explicitly so figures do not accumulate across iterations.
        plt.close(fig)

plot_daily_transactions(prepped_transactions_df)

In [0]:
# eda: oil
def analyze_oil(oil_df):
    """
    Show a quick exploratory overview of the oil table: a 5-row preview, the
    summary statistics and the column dtypes.

    Kept inside a function so no helper variables leak into the notebook.

    Returns:
        The same `oil_df` that was passed in, unchanged.
    """
    preview = oil_df.limit(5)
    display(preview)

    stats = oil_df.summary()
    display(stats)

    column_types = oil_df.dtypes
    display(column_types)

    return oil_df
analyze_oil(oil_df)

# prepare: oil
def prepare_oil(oil_df):
    """
    Build a gap-free daily oil price series.

    Oil has 2 cols: date, dcoilwtico.

    The input is converted with toPandas(), indexed by date, expanded to one
    row per calendar day between the first and last date, and missing values
    are forward-filled from the last valid observation.

    Args:
        oil_df: Spark DataFrame with the columns 'date' and 'dcoilwtico'.

    Returns:
        Spark DataFrame with a 'date' column followed by the remaining
        columns, one row per calendar day. Values that are missing before the
        first valid observation stay missing, because forward fill has
        nothing to propagate there.
    """
    pdf = oil_df.toPandas()

    # Keep a real datetime64 column (time part stripped) so the index is a
    # DatetimeIndex, which is what asfreq() is designed to work on.
    pdf['date'] = pd.to_datetime(pdf['date']).dt.normalize()
    daily = pdf.set_index('date').sort_index()

    daily = daily.asfreq(freq='D')  # insert rows for missing calendar days
    # Propagate the last valid observation forward. DataFrame.ffill() replaces
    # the deprecated fillna(method='ffill') spelling.
    daily = daily.ffill()

    return spark.createDataFrame(daily.reset_index())
prepared_oil_df = prepare_oil(oil_df)

def plot_daily_oil_prices(prepared_oil_df):
    """
    Draw the daily oil price ('dcoilwtico') against 'date' as a single line
    chart, pass the figure to display() and close it afterwards.

    `prepared_oil_df` is converted with toPandas() before plotting.
    """
    oil_pdf = prepared_oil_df.toPandas()

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(oil_pdf['date'], oil_pdf['dcoilwtico'], color='blue', alpha=0.7)
    ax.set_title('Daily Oil Prices')
    ax.set_xlabel('Date')
    ax.set_ylabel('dcoilwtico')
    fig.tight_layout()

    display(fig)
    plt.close(fig)
plot_daily_oil_prices(prepared_oil_df)


In [0]:
# data preparation for ML


In [0]:
# model

# v0: only on train + transactions data
# Sanity check: train_df and prepped_transactions_df should cover exactly the
# same set of dates, so that nothing is left out.

# Normalize both sides to pandas Timestamps before comparing. Depending on the
# type Spark inferred for 'date', toPandas() may hand back datetime.date
# objects, and those never compare equal to the Timestamps in the
# transactions index, which would make the intersection below empty.
train_dates = pd.to_datetime(train_df.select('date').distinct().toPandas()['date'])
transaction_dates = pd.to_datetime(prepped_transactions_df.index.unique())

common_dates = set(train_dates).intersection(set(transaction_dates))

# train dates that have no transactions rows - print warning
if len(common_dates) != len(train_dates):
    print(f"WARNING: {len(train_dates) - len(common_dates)} dates left out, out of {len(train_dates)} total dates")

# transaction dates that do not appear in train - print warning
if len(common_dates) != len(transaction_dates):
    print(f"WARNING: {len(transaction_dates) - len(common_dates)} dates left out")

# train_df = train_df.filter(train_df['date'].isin(common_dates))
# prepped_transactions_df = prepped_transactions_df[prepped_transactions_df.index.isin(common_dates)]

