In [0]:
%pip install kaggle
# Installs the `kaggle` package into the notebook's Python environment; the
# download cell further down invokes its `kaggle datasets download` command.


[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import os

# Kaggle credentials are secrets and must never be written into a notebook cell:
# anything typed here ends up in version control, exports and shared workspaces.
# NOTE(review): an API key used to be hard-coded at this spot. Treat that key as
# compromised and regenerate it from the Kaggle account settings.
#
# Values already present in the environment are reused; anything missing is asked
# for interactively without echoing it to the cell output.
# NOTE(review): on Databricks a secret scope (`dbutils.secrets.get`) is the usual
# non-interactive alternative - confirm which mechanism this workspace uses.
from getpass import getpass

for _credential_name, _prompt_text in (
    ("KAGGLE_USERNAME", "Kaggle username: "),
    ("KAGGLE_KEY", "Kaggle API key: "),
):
    if not os.environ.get(_credential_name):
        _entered_value = getpass(_prompt_text).strip()
        if not _entered_value:
            # Fail loudly instead of reporting success with a blank credential.
            raise ValueError(f"{_credential_name} must not be empty")
        os.environ[_credential_name] = _entered_value

# Verify setup
print("Kaggle credentials configured!")


Kaggle credentials configured!


In [0]:
%sh
# Stop at the first failing command: without this a failed `cd` would let the
# download land in the wrong directory, and a failed `unzip` would still be
# followed by deleting the archive.
set -e

cd /Volumes/workspace/ecommerce/ecommerce_data

# Download unless BOTH monthly files are already present (a partially extracted
# archive can leave only one of them behind).
if [ ! -f "2019-Oct.csv" ] || [ ! -f "2019-Nov.csv" ]; then
    echo "Downloading dataset..."
    kaggle datasets download -d mkechinov/ecommerce-behavior-data-from-multi-category-store
    unzip -o ecommerce-behavior-data-from-multi-category-store.zip
    rm -f ecommerce-behavior-data-from-multi-category-store.zip
else
    echo "Files already exist, skipping download."
fi

ls -lh


Files already exist, skipping download.
total 14G
-rwxrwxrwx 1 nobody nogroup 8.4G Jan  7 23:04 2019-Nov.csv
-rwxrwxrwx 1 nobody nogroup 5.3G Jan  7 23:06 2019-Oct.csv


In [0]:
from pyspark.sql import functions as F

# Load October data
# NOTE: inferSchema=True makes Spark take an extra pass over the file to work out
# the column types, so each of these reads is already a job on a multi-GB CSV.
oct_events = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv",
    header=True,
    inferSchema=True
)

# Optional: Load November data
nov_events = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv",
    header=True,
    inferSchema=True
)

# Combine both months if needed
all_events = oct_events.union(nov_events)

# Quick verification
# Nothing is cached, so every count() re-reads its CSV from storage. Count each
# month exactly once and reuse the numbers.
oct_count = oct_events.count()
nov_count = nov_events.count()
print(f"October events: {oct_count:,}")
print(f"November events: {nov_count:,}")
# union() keeps every row of both inputs (it does not de-duplicate), so the
# combined total is the sum of the two counts; counting all_events would scan
# both files a second time for the same number.
print(f"Total combined events: {oct_count + nov_count:,}")

oct_events.show(5)


October events: 42,448,764
November events: 67,501,979
Total combined events: 109,950,743
+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand|  price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|2019-10-01 00:00:00|      view|  44600062|2103807459595387724|                NULL|shiseido|  35.79|541312140|72d76fde-8bb3-4e0...|
|2019-10-01 00:00:00|      view|   3900821|2053013552326770905|appliances.enviro...|    aqua|   33.2|554748717|9333dfbd-b87a-470...|
|2019-10-01 00:00:01|      view|  17200506|2053013559792632471|furniture.living_...|    NULL|  543.1|519107250|566511c2-e2e3-422...|
|2019-10-01 00:00:01|      view|   1307067|2053013558920217191|  computers.notebook|  lenovo| 251.74|550050854|7

In [0]:
from pyspark.sql import functions as F

# Number of distinct full rows: rows identical in every column collapse to one
unique_event_total = oct_events.distinct().count()
print("Unique events in Oct:", unique_event_total)

# Number of distinct product_id values (optional)
unique_product_total = oct_events.select("product_id").dropDuplicates().count()
print("Unique product IDs:", unique_product_total)


Unique events in Oct: 42418544
Unique product IDs: 166794


In [0]:
from pyspark.sql import DataFrame

# Unbind every global name that currently refers to a Spark DataFrame.
# The names are collected first so the namespace is not modified while it is
# being scanned. This removes the Python bindings only.
stale_frame_names = [
    key for key, obj in list(globals().items()) if isinstance(obj, DataFrame)
]
for key in stale_frame_names:
    del globals()[key]

print("✅ Deleted all Spark DataFrames from memory")


✅ Deleted all Spark DataFrames from memory


In [0]:
from pyspark.sql import SparkSession

# Define paths
OCT_PATH = "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv"
NOV_PATH = "/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv"

def load_ecommerce_data(month="Oct", sample_fraction=None):
    """
    Load e-commerce events for the requested month(s) from the workspace paths.

    Both arguments are validated before Spark is touched, so a typo fails
    immediately instead of after a schema-inference pass over a multi-GB CSV.

    Parameters:
    -----------
    month : str
        "Oct", "Nov" or "both" ("both" is October followed by November,
        combined with ``union``).
    sample_fraction : float, optional
        Fraction of rows to keep, 0.0 < fraction <= 1.0. Sampling uses a fixed
        seed (42). ``None`` (default) keeps every row.

    Returns:
    --------
    Spark DataFrame with events

    Raises:
    -------
    ValueError
        If ``month`` is not one of the accepted values, or ``sample_fraction``
        is outside (0.0, 1.0].

    Notes:
    ------
    The final "Loaded ..." message calls ``count()``, which runs a Spark job
    over the (possibly sampled) data every time this function is called.
    """
    # Fail fast: reading with inferSchema=True already scans the file, so bad
    # arguments must be rejected before the first read.
    if month not in ("Oct", "Nov", "both"):
        raise ValueError("Month must be 'Oct', 'Nov', or 'both'")
    if sample_fraction is not None and not 0.0 < sample_fraction <= 1.0:
        raise ValueError("sample_fraction must be between 0.0 and 1.0")

    def _read_month(path):
        # Single place for the CSV options so every month is read identically.
        return spark.read.csv(path, header=True, inferSchema=True)

    if month == "Oct":
        df = _read_month(OCT_PATH)
    elif month == "Nov":
        df = _read_month(NOV_PATH)
    else:  # month == "both"
        df = _read_month(OCT_PATH).union(_read_month(NOV_PATH))

    # Apply sampling if requested
    if sample_fraction is not None:
        df = df.sample(fraction=sample_fraction, seed=42)
        print(f"✅ Sampled {sample_fraction*100:.1f}% of data")

    print(f"✅ Loaded {df.count():,} events for {month}")
    return df


In [0]:
# Load the October events through the shared loader
events = load_ecommerce_data(month="Oct")

# Sanity check: report the row count, then preview the first rows
loaded_event_total = events.count()
print(f"✅ Ready to go! Loaded {loaded_event_total:,} events")
events.show(n=5)

# Your Day 1 challenges start here...


✅ Loaded 42,448,764 events for Oct
✅ Ready to go! Loaded 42,448,764 events
+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand|  price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|2019-10-01 00:00:00|      view|  44600062|2103807459595387724|                NULL|shiseido|  35.79|541312140|72d76fde-8bb3-4e0...|
|2019-10-01 00:00:00|      view|   3900821|2053013552326770905|appliances.enviro...|    aqua|   33.2|554748717|9333dfbd-b87a-470...|
|2019-10-01 00:00:01|      view|  17200506|2053013559792632471|furniture.living_...|    NULL|  543.1|519107250|566511c2-e2e3-422...|
|2019-10-01 00:00:01|      view|   1307067|2053013558920217191|  computers.notebook|  lenovo| 251.74|550050854|7c90fc70-0e80-45

In [0]:
# Import Spark SQL module
from pyspark.sql import DataFrame

# Remove every global variable that is bound to a Spark DataFrame.
# Matching names are gathered up front, then popped from the namespace one by
# one. Only the Python name bindings are removed.
module_scope = globals()
bound_frame_names = [
    key for key in list(module_scope) if isinstance(module_scope[key], DataFrame)
]
while bound_frame_names:
    module_scope.pop(bound_frame_names.pop())

print("✅ All Spark DataFrame variables deleted from memory.")


✅ All Spark DataFrame variables deleted from memory.


In [0]:
# -----------------------------
# ONE-CELL DATA LOADER FOR E-COMMERCE
# -----------------------------

from pyspark.sql import SparkSession

# Bind `spark` to the active session. getOrCreate() returns the session that
# already exists and only builds a new one when there is none, so re-running
# this cell is harmless.
spark = SparkSession.builder.getOrCreate()

def load_ecommerce_data(month="Oct", sample_fraction=None):
    """
    Load e-commerce dataset for serverless Databricks.

    Both arguments are validated before Spark is touched, so a bad value fails
    immediately instead of after a schema-inference pass over a multi-GB CSV.

    Parameters:
    -----------
    month : str
        "Oct", "Nov", or "both" ("both" is October followed by November,
        combined with ``union``).
    sample_fraction : float, optional
        Fraction of data to sample (0 < fraction <= 1). Sampling uses a fixed
        seed (42). ``None`` (default) keeps every row.

    Returns:
    --------
    Spark DataFrame

    Raises:
    -------
    ValueError
        If ``month`` is not one of the accepted values, or ``sample_fraction``
        is outside (0, 1].

    Notes:
    ------
    The final "Loaded ..." message calls ``count()``, which runs a Spark job
    over the (possibly sampled) data every time this function is called.
    """
    # Fail fast: reading with inferSchema=True already scans the file, so bad
    # arguments must be rejected before the first read.
    if month not in ("Oct", "Nov", "both"):
        raise ValueError("Month must be 'Oct', 'Nov', or 'both'")
    if sample_fraction is not None and not 0.0 < sample_fraction <= 1.0:
        raise ValueError("sample_fraction must be between 0.0 and 1.0")

    path_base = "/Volumes/workspace/ecommerce/ecommerce_data/"

    def _read_month(file_name):
        # Single place for the CSV options so every month is read identically.
        return spark.read.csv(f"{path_base}{file_name}", header=True, inferSchema=True)

    if month == "Oct":
        df = _read_month("2019-Oct.csv")
    elif month == "Nov":
        df = _read_month("2019-Nov.csv")
    else:  # month == "both"
        df = _read_month("2019-Oct.csv").union(_read_month("2019-Nov.csv"))

    if sample_fraction is not None:
        df = df.sample(fraction=sample_fraction, seed=42)
        # One decimal place: the raw product shows float noise
        # (0.07 * 100 == 7.000000000000001).
        print(f"✅ Sampled {sample_fraction*100:.1f}% of data")

    print(f"✅ Loaded {df.count():,} events for {month}")
    return df

# -----------------------------
# Build the working DataFrame
# -----------------------------
events = load_ecommerce_data(month="Oct")  # pass a different month / sample_fraction as needed

# Sanity check: preview the first rows, then print the column types
events.show(n=5)
events.printSchema()
