# Bike Stores Relational Database 

In [0]:
pip install kagglehub

In [0]:
# Restart the Python process so packages installed earlier in the notebook
# can be imported by the cells that follow.
dbutils.library.restartPython()

In [0]:
import kagglehub

# Kaggle handle of the bike store sample database; no version is pinned,
# so the latest published version is requested.
dataset_handle = "dillonmyrick/bike-store-sample-database"

# Fetch the dataset and remember the location kagglehub reports for it.
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

# Download the data from Kaggle and store it in a Databricks volume

In [0]:
import kagglehub
import shutil
import os

# Download the dataset from Kaggle (no version is pinned in the handle).
path = kagglehub.dataset_download("dillonmyrick/bike-store-sample-database")

# Target directory inside the volume; created if missing, reused otherwise.
dest_dir = "/Volumes/learning/data/raw/bike-store-sample-database/"
os.makedirs(dest_dir, exist_ok=True)

# Walk the whole download tree and copy every file straight into dest_dir.
# The sub-directory structure is not kept, so files that share a name but live
# in different folders overwrite each other in the destination.
for current_dir, _subdirs, file_names in os.walk(path):
    for file_name in file_names:
        shutil.copy2(
            os.path.join(current_dir, file_name),
            os.path.join(dest_dir, file_name),
        )

# Show the regular files that are now present in the destination directory.
copied_files = []
for entry in os.listdir(dest_dir):
    if os.path.isfile(os.path.join(dest_dir, entry)):
        copied_files.append(entry)
display(copied_files)

# Convert and save files into Databricks tables


In [0]:
import os

catalog = "learning"
schema = "bike_store"
dataset_path = "/Volumes/learning/data/raw/bike-store-sample-database/"

# Regular files sitting directly in the dataset directory (no recursion).
files = [
    entry
    for entry in os.listdir(dataset_path)
    if os.path.isfile(os.path.join(dataset_path, entry))
]

for file in files:
    file_path = os.path.join(dataset_path, file)
    # Table name: file name without its extension, spaces turned into
    # underscores, lower-cased.
    table_name = os.path.splitext(file)[0].replace(" ", "_").lower()
    full_table_name = f"{catalog}.{schema}.{table_name}"

    # Choose the loader by extension; files that are neither CSV nor Parquet
    # are skipped.
    if file.endswith(".csv"):
        # First line is the header; column types are inferred from the data.
        df = spark.read.options(header="true", inferSchema="true").csv(file_path)
    elif file.endswith(".parquet"):
        df = spark.read.parquet(file_path)
    else:
        continue

    # Overwrite mode replaces the table contents on every run.
    df.write.mode("overwrite").saveAsTable(full_table_name)

# Exploratory Data Analysis of all tables in the Bike Stores dataset


In [0]:
# Fully qualified schema that holds the bike store tables.
bike_store_schema = "learning.bike_store"

# SHOW TABLES also returns session temporary views; those cannot be resolved
# under the schema name, so only persistent tables are profiled.
tables = [
    row.tableName
    for row in spark.sql(f"SHOW TABLES IN {bike_store_schema}").collect()
    if not row.isTemporary
]

for table in tables:
    qualified_name = f"{bike_store_schema}.{table}"
    print(f"--- {table} ---")
    df = spark.table(qualified_name)
    print("Schema:")
    df.printSchema()
    print("Row count:")
    # Count rows of the very table being profiled: the query must target the
    # bike_store schema, the same one `df` is read from.
    display(spark.sql(f"SELECT COUNT(*) AS row_count FROM {qualified_name}"))
    print("Sample rows:")
    display(df.limit(10))
    print("Summary statistics (numeric columns):")
    # describe() is called without arguments, i.e. over all columns it supports.
    display(df.describe())
    print("\n")