In [0]:
# Root data directory of the project volume; the raw CSVs live under raw/.
VOLUME_ROOT_PATH = "/Volumes/cscie103_catalog/final_project/data"
VOLUME_TARGET_DIR = f"{VOLUME_ROOT_PATH}/raw"

# Logical dataset name -> CSV file name inside VOLUME_TARGET_DIR.
# 'sample_submission' and 'test' are listed but not loaded in this cell.
filenames = {
    'holidays_events': 'holidays_events.csv',
    'oil': 'oil.csv',
    'sample_submission': 'sample_submission.csv',
    'stores': 'stores.csv',
    'test': 'test.csv',
    'train': 'train.csv',
    'transactions': 'transactions.csv'
}


def read_raw_csv(dataset):
    """Read one raw CSV listed in ``filenames`` into a Spark DataFrame.

    Args:
        dataset: Key of ``filenames`` identifying the file to load.

    Returns:
        The DataFrame read with the first line as header and inferred
        column types.

    Raises:
        KeyError: If ``dataset`` is not a key of ``filenames``.
    """
    # Index instead of .get(): a mistyped key fails right here rather than
    # silently turning into a read of ".../None".
    return spark.read.csv(
        f"{VOLUME_TARGET_DIR}/{filenames[dataset]}",
        header=True,
        inferSchema=True,
    )


holidays_events_df = read_raw_csv('holidays_events')
oil_df = read_raw_csv('oil')
stores_df = read_raw_csv('stores')
transactions_df = read_raw_csv('transactions')
train_df = read_raw_csv('train')


In [0]:
from pyspark.sql import functions as F

def print_small_uniques(dfs, max_unique=10):
    """Print the distinct values of every low-cardinality column.

    Args:
        dfs: Iterable of Spark DataFrames; each one is labelled in the output
            by its position in the iterable.
        max_unique: A column is printed only when it has strictly fewer than
            this many distinct values (default 10).
    """
    for i, df in enumerate(dfs):
        print(f"\n=== DataFrame {i} ===")

        for column in df.columns:
            # One capped collect replaces the count()-then-collect() pair, so
            # the distinct scan runs once per column instead of twice. Getting
            # max_unique rows back means the column is at or over the limit.
            rows = df.select(column).distinct().limit(max_unique).collect()

            if len(rows) < max_unique:
                values = [row[column] for row in rows]
                print(f"{column}: {values}")

def describe_columns(dfs, max_unique=15):
    """Summarise every column of each DataFrame.

    A column with fewer than ``max_unique`` distinct values has those values
    listed; every other column is reported as its minimum and maximum.

    Args:
        dfs: Iterable of Spark DataFrames; each one is labelled in the output
            by its position in the iterable.
        max_unique: Exclusive upper bound on the distinct-value count for a
            column to be listed value by value (default 15).
    """
    for i, df in enumerate(dfs):
        print(f"\n=== DataFrame {i} ===")

        for column in df.columns:
            # One capped collect replaces the count()-then-collect() pair, so
            # the distinct scan runs once per column instead of twice. Getting
            # max_unique rows back means the column is at or over the limit.
            rows = df.select(column).distinct().limit(max_unique).collect()

            if len(rows) < max_unique:
                values = [row[column] for row in rows]
                print(f"{column}: {values}")
            else:
                stats = df.agg(
                    F.min(column).alias("min"),
                    F.max(column).alias("max"),
                ).collect()[0]
                print(f"{column}: min={stats['min']}, max={stats['max']}")

from pyspark.sql import functions as F

def describe_named_dfs(named_dfs, max_unique=10):
    """Summarise every column of each DataFrame under a caller-chosen label.

    A column with fewer than ``max_unique`` distinct values has those values
    listed; every other column is reported as its minimum and maximum.

    Args:
        named_dfs: Mapping of label -> Spark DataFrame, e.g.
            ``{"df1": df1, "df2": df2}``. The label is printed as the
            section header for that frame.
        max_unique: Exclusive upper bound on the distinct-value count for a
            column to be listed value by value (default 10).
    """
    for name, df in named_dfs.items():
        print(f"\n=== {name} ===")

        for column in df.columns:
            # One capped collect replaces the count()-then-collect() pair, so
            # the distinct scan runs once per column instead of twice. Getting
            # max_unique rows back means the column is at or over the limit.
            rows = df.select(column).distinct().limit(max_unique).collect()

            if len(rows) < max_unique:
                values = [row[column] for row in rows]
                print(f"{column}: {values}")
            else:
                stats = df.agg(
                    F.min(column).alias("min"),
                    F.max(column).alias("max"),
                ).collect()[0]
                print(f"{column}: min={stats['min']}, max={stats['max']}")



In [0]:
# print_small_uniques labels frames by list position, so the order here
# determines which "DataFrame <i>" header each frame appears under.
dfs = [
    holidays_events_df,
    oil_df,
    stores_df,
    transactions_df,
    train_df,
]
print_small_uniques(dfs)


In [0]:
# Same five frames, in the same order, as the print_small_uniques cell;
# describe_columns labels each one by its index in this list.
dfs = [
    holidays_events_df,
    oil_df,
    stores_df,
    transactions_df,
    train_df,
]
describe_columns(dfs)

In [0]:
# Label every frame with its own variable name so each printed section header
# identifies the data actually being described. "train_df" must map to
# train_df; mapping it to transactions_df would describe transactions twice
# and never describe the training data.
describe_named_dfs({
    "holidays_events_df": holidays_events_df,
    "oil_df": oil_df,
    "stores_df": stores_df,
    "transactions_df": transactions_df,
    "train_df": train_df,
})



In [0]:
# Show every loaded frame, one display() call each, in the order they were read.
for loaded_frame in (holidays_events_df, oil_df, stores_df, transactions_df, train_df):
    loaded_frame.display()

In [0]:
# Shows holidays_events_df again on its own (the previous cell also displays it).
display(holidays_events_df)

In [0]:
# Load the gold_daily_store_family table from the project schema and show it.
gold_table_name = "cscie103_catalog.final_project.gold_daily_store_family"
gold = spark.read.table(gold_table_name)
display(gold)


In [0]:
# worldcities.csv sits in the same raw directory as the other CSVs, so build
# its path from VOLUME_TARGET_DIR instead of repeating the absolute path.
# No inferSchema is requested here, so every column is read as a string.
cities = spark.read.csv(f"{VOLUME_TARGET_DIR}/worldcities.csv", header=True)
display(cities)

In [0]:
# Join on the column *name* rather than `gold.city == cities.city`: the
# expression form keeps both `city` columns in the result, which makes any
# later reference to `city` ambiguous; the name form yields a single `city`.
# NOTE(review): a left join multiplies gold rows whenever a city name occurs
# more than once in `cities` — confirm the names are unique there, or
# pre-filter `cities` before joining.
gold_cities = gold.join(cities, on="city", how="left")
display(gold_cities)


In [0]:
# Keep only the joined rows where both coordinates are null.
lat_missing = gold_cities["lat"].isNull()
lng_missing = gold_cities["lng"].isNull()
cities_no_geo = gold_cities.where(lat_missing & lng_missing)
display(cities_no_geo)