In [0]:
# Root directory of the project data; the raw CSV files are read from its
# "raw" sub-directory.
VOLUME_ROOT_PATH = "/Volumes/cscie103_catalog/final_project/data"
VOLUME_TARGET_DIR = f"{VOLUME_ROOT_PATH}/raw"

# Logical dataset name -> CSV file name inside VOLUME_TARGET_DIR.
# 'sample_submission' is listed but not loaded in this cell.
filenames = {
    'holidays_events': 'holidays_events.csv',
    'oil': 'oil.csv',
    'sample_submission': 'sample_submission.csv',
    'stores': 'stores.csv',
    'test': 'test.csv',
    'train': 'train.csv',
    'transactions': 'transactions.csv'
}


def _read_raw_csv(dataset):
    """Load one raw CSV from VOLUME_TARGET_DIR as a Spark DataFrame.

    Args:
        dataset: key of ``filenames`` identifying the file to read.

    Returns:
        The DataFrame read with ``header=True`` and ``inferSchema=True``.

    Raises:
        KeyError: if ``dataset`` is not a key of ``filenames``. Indexing is
            used instead of ``.get`` so a typo fails here rather than as a
            confusing read of ".../None".
    """
    return spark.read.csv(f"{VOLUME_TARGET_DIR}/{filenames[dataset]}", header=True, inferSchema=True)


holidays_events_df = _read_raw_csv('holidays_events')
oil_df = _read_raw_csv('oil')
stores_df = _read_raw_csv('stores')
test_df = _read_raw_csv('test')
train_df = _read_raw_csv('train')
transactions_df = _read_raw_csv('transactions')





In [0]:
from pyspark.sql import functions as F

def print_small_uniques(dfs, max_unique=10):
    """Print the distinct values of every low-cardinality column.

    For each DataFrame in ``dfs`` a ``=== DataFrame <position> ===`` header is
    printed, followed by one ``column: [values]`` line for every column that
    has fewer than ``max_unique`` distinct values. Columns with more distinct
    values are skipped silently.

    Args:
        dfs: iterable of Spark DataFrames.
        max_unique: exclusive upper bound on the number of distinct values a
            column may have to be printed; must be non-negative. Defaults to
            10.

    Returns:
        None. Output goes to stdout.
    """
    for i, df in enumerate(dfs):
        print(f"\n=== DataFrame {i} ===")

        for col in df.columns:
            # Fetching at most ``max_unique`` distinct rows answers both "is
            # the column small?" and "what are its values?" with a single
            # action, instead of a count() followed by a second collect().
            rows = df.select(col).distinct().limit(max_unique).collect()

            if len(rows) < max_unique:
                values = [row[col] for row in rows]
                print(f"{col}: {values}")

def describe_columns(dfs, max_unique=15):
    """Print a one-line summary of every column of every DataFrame.

    For each DataFrame in ``dfs`` a ``=== DataFrame <position> ===`` header is
    printed. A column with fewer than ``max_unique`` distinct values is
    printed as ``column: [values]``; any other column is printed as
    ``column: min=..., max=...``.

    Args:
        dfs: iterable of Spark DataFrames.
        max_unique: exclusive upper bound on the number of distinct values
            for a column to be listed value by value; must be non-negative.
            Defaults to 15.

    Returns:
        None. Output goes to stdout.
    """
    for i, df in enumerate(dfs):
        print(f"\n=== DataFrame {i} ===")

        for col in df.columns:
            # One bounded collect replaces the original count() + collect()
            # pair: fewer than ``max_unique`` rows back means the column has
            # fewer than ``max_unique`` distinct values.
            rows = df.select(col).distinct().limit(max_unique).collect()

            if len(rows) < max_unique:
                values = [row[col] for row in rows]
                print(f"{col}: {values}")
            else:
                # High-cardinality column: report only its range.
                stats = df.agg(F.min(col).alias("min"), F.max(col).alias("max")).collect()[0]
                print(f"{col}: min={stats['min']}, max={stats['max']}")

from pyspark.sql import functions as F

def describe_named_dfs(named_dfs, max_unique=10):
    """Print a one-line summary of every column of every named DataFrame.

    For each entry a ``=== <name> ===`` header is printed. A column with
    fewer than ``max_unique`` distinct values is printed as
    ``column: [values]``; any other column is printed as
    ``column: min=..., max=...``.

    Args:
        named_dfs: dict like {"df1": df1, "df2": df2} mapping a label to a
            Spark DataFrame.
        max_unique: exclusive upper bound on the number of distinct values
            for a column to be listed value by value; must be non-negative.
            Defaults to 10.

    Returns:
        None. Output goes to stdout.
    """
    for name, df in named_dfs.items():
        print(f"\n=== {name} ===")

        for col in df.columns:
            # One bounded collect replaces the original count() + collect()
            # pair: fewer than ``max_unique`` rows back means the column has
            # fewer than ``max_unique`` distinct values.
            rows = df.select(col).distinct().limit(max_unique).collect()

            if len(rows) < max_unique:
                values = [row[col] for row in rows]
                print(f"{col}: {values}")
            else:
                # High-cardinality column: report only its range.
                stats = df.agg(F.min(col).alias("min"), F.max(col).alias("max")).collect()[0]
                print(f"{col}: min={stats['min']}, max={stats['max']}")



In [0]:
# Label each raw DataFrame so the summary below is printed under a readable
# name instead of a positional index.
dataframes_dict = dict(
    holidays=holidays_events_df,
    oil=oil_df,
    stores=stores_df,
    test=test_df,
    train=train_df,
    transactions=transactions_df,
)

describe_named_dfs(dataframes_dict)


In [0]:
# Show each labelled DataFrame: its label first, then the rendered table.
for name, df in dataframes_dict.items():
    print(name)
    display(df)

In [0]:
# Load the gold_daily_store_family table and render it.
gold_reader = spark.read
gold = gold_reader.table("cscie103_catalog.final_project.gold_daily_store_family")
display(gold)


In [0]:
# Installs the missingno package that later cells import as `msno`.
# NOTE(review): the install is unpinned and runs on every execution — consider pinning a version.
!pip install missingno


In [0]:
import missingno as msno

In [0]:
# Deduplicate in Spark first so only the distinct onpromotion values, not the
# whole training table, are copied to the driver. The result is still a numpy
# array of the unique values (their order is not guaranteed).
train_df.select('onpromotion').distinct().toPandas()['onpromotion'].unique()

In [0]:
# Deduplicate in Spark first so only the distinct onpromotion values, not the
# whole gold table, are copied to the driver. The result is still a numpy
# array of the unique values (their order is not guaranteed).
gold.select('onpromotion').distinct().toPandas()['onpromotion'].unique()

In [0]:
import numpy as np

# Pull the gold table into pandas for the pivot below.
gold_df = gold.toPandas()

# One row per (family, store_nbr, onpromotion), one column per date, holding
# the summed sales.
gold_dfx = (
    gold_df
    .pivot_table(index=['family', 'store_nbr', 'onpromotion'], columns='date', values='sales', aggfunc='sum')
    # Turn zero sales into NaN *before* reset_index, while the keys are still
    # in the index. Replacing afterwards would also blank out any key column
    # whose value is 0 (e.g. an onpromotion of 0), not just the sales cells.
    .replace(0, np.nan)
    .reset_index()
)
gold_dfx

In [0]:
# Display the pivoted gold frame again (the same object the previous cell ends with).
gold_dfx

In [0]:
# Fold the per-date columns back into rows: the three keys go into the index,
# stack() moves the date columns into a row level, and reset_index() flattens
# the result into ordinary columns.
(
    gold_dfx
    .set_index(['family', 'store_nbr', 'onpromotion'])
    .stack()
    .reset_index()
)


In [0]:
import numpy as np
import pandas as pd

# Pull the training data into pandas and derive calendar bucket labels.
df = train_df.toPandas()
df['date'] = pd.to_datetime(df['date'])
iso = df['date'].dt.isocalendar()
# ISO year followed by the zero-padded ISO week number, e.g. "201301".
df['year_week'] = iso.year.astype(str) + iso.week.astype(str).str.zfill(2)
# Calendar year and zero-padded month, e.g. "2013-01".
df['year_month'] = df['date'].dt.year.astype(str) + "-"+ df['date'].dt.month.astype(str).str.zfill(2)

# One row per (family, store_nbr), one column per year_month, holding the
# summed sales.
dfx = (
    df
    .pivot_table(index=['family', 'store_nbr'], columns='year_month', values='sales', aggfunc='sum')
    # Turn zero sales into NaN *before* reset_index, while the keys are still
    # in the index, so only sales cells are touched and a key value of 0 can
    # never be blanked out.
    .replace(0, np.nan)
    .reset_index()
)
dfx

# dfx = pivot_to_yearweek(df)
# dfx
# dfx = train.pivot_table(index=['family', 'store_nbr'], columns='year_week', values='sales', aggfunc='sum').reset_index()
# dfx = dfx.replace(0, np.nan)
# dfx

In [0]:
import matplotlib.pyplot as plt
import missingno as msno

# One missingness matrix per product family, with every dfx column named on
# the x-axis.
for family in dfx['family'].unique():
    is_family = dfx['family'] == family
    sub = dfx[is_family]

    fig, ax = plt.subplots(figsize=(20,4))
    msno.matrix(sub, ax=ax)
    ax.set_title(f"Missingness Matrix by store x year_month — {family}")

    # Put one tick per column and label it with the column name.
    column_labels = sub.columns
    tick_positions = range(len(column_labels))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(column_labels, rotation=90, fontsize=8)

    plt.tight_layout()
    plt.show()


In [0]:
import matplotlib.pyplot as plt
import missingno as msno

# One missingness matrix per product family.
for family in dfx['family'].unique():
    fig, ax = plt.subplots(figsize=(16,4))   # optional size
    msno.matrix(dfx[dfx['family'] == family], ax=ax)
    # dfx is pivoted on year_month, so the title names that axis (it used to
    # say year_week, which did not match the plotted data).
    ax.set_title(f"Missingness Matrix by store x year_month — {family}")
    plt.show()
    # Release the figure so a loop over many families does not keep them all
    # open.
    plt.close(fig)


In [0]:
# Read worldcities.csv with a header row; no schema inference is requested.
cities_path = '/Volumes/cscie103_catalog/final_project/data/raw/worldcities.csv'
cities = spark.read.csv(cities_path, header=True)
display(cities)

In [0]:
# Left join keeps every gold row and adds the matching cities columns; the
# duplicate join key coming from `cities` is dropped afterwards.
# NOTE(review): the match is on city name alone — if worldcities.csv lists the
# same name more than once, gold rows will be duplicated. TODO confirm the
# names are unique or narrow the lookup before joining.
same_city = gold.city == cities.city
gold_cities = gold.join(cities, same_city, "left").drop(cities.city)
display(gold_cities)


In [0]:
# Rows of the joined frame where both lat and lng are null.
lat_missing = gold_cities.lat.isNull()
lng_missing = gold_cities.lng.isNull()
cities_no_geo = gold_cities.filter(lat_missing & lng_missing)
display(cities_no_geo)

In [0]:
# List the joined frame's column names in alphabetical order.
sorted(gold_cities.columns)

In [0]:
# Persist the joined frame as a table, replacing any previous contents.
gold_cities_writer = gold_cities.write.mode("overwrite")
gold_cities_writer.saveAsTable("cscie103_catalog.final_project.gold_cities")

# format('delta').save('/Volumes/cscie103_catalog/final_project/data/processed/gold_cities')

In [0]:
%sql
-- Fully qualified because the `use cscie103_catalog.final_project` statement only runs in a later cell.
SELECT * FROM cscie103_catalog.final_project.gold_cities

In [0]:
%sql
-- Make cscie103_catalog.final_project the default for the unqualified table names below.
USE cscie103_catalog.final_project

In [0]:
%sql
-- List the tables in the current schema.
SHOW TABLES;

In [0]:
%sql
-- Total sales recorded in bronze_train for 2017-08-05.
SELECT sum(sales)
FROM bronze_train
WHERE date = '2017-08-05'

In [0]:
%sql
-- Total sales recorded in silver_train for 2017-08-05.
SELECT sum(sales)
FROM silver_train
WHERE date = '2017-08-05'

In [0]:
%sql
-- Total sales recorded in silver_training for 2017-08-05.
SELECT sum(sales)
FROM silver_training
WHERE date = '2017-08-05'