In [1]:
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# --- WINDOWS ENVIRONMENT SETUP ---
# VS Code might not pick up environment variables set in the terminal immediately.
# We force set them here to ensure Spark works.
# This cell is safe to re-run: PATH is rebuilt rather than blindly extended,
# so repeated executions do not keep prepending the same JDK directory.


def merge_path_entries(current_path, java_bin, hadoop_bin, sep=os.pathsep):
    """
    Build a PATH value with ``java_bin`` first and ``hadoop_bin`` present.

    Args:
        current_path: Existing PATH string (may be empty).
        java_bin: Directory that must be the first PATH entry, exactly once.
        hadoop_bin: Directory that must appear somewhere in PATH; appended
            at the end only when no equivalent entry exists.
        sep: Separator between PATH entries (defaults to ``os.pathsep``).

    Returns:
        The rebuilt PATH string. Existing entries keep their relative order;
        empty entries are dropped.
    """
    def norm(entry):
        # normcase makes the comparison case-insensitive on Windows.
        return os.path.normcase(os.path.normpath(entry))

    java_key = norm(java_bin)
    hadoop_key = norm(hadoop_bin)

    # Drop empty entries and any earlier copy of the JDK directory.
    entries = [p for p in current_path.split(sep) if p and norm(p) != java_key]
    if all(norm(p) != hadoop_key for p in entries):
        entries.append(hadoop_bin)
    return sep.join([java_bin] + entries)


# 1. Set HADOOP_HOME if missing
os.environ.setdefault('HADOOP_HOME', r"C:\hadoop")

# 2. FORCE SET JAVA_HOME to JDK 17 (Required for Spark 4.0)
# Replace this path if your JDK 17 is installed elsewhere
os.environ['JAVA_HOME'] = r"C:\Program Files\Java\jdk-17"

# 3. Put <JAVA_HOME>\bin first on PATH and make sure <HADOOP_HOME>\bin is on it.
# The check compares against the configured HADOOP_HOME instead of looking
# for the literal text "hadoop\bin" anywhere in PATH.
os.environ['PATH'] = merge_path_entries(
    os.environ.get('PATH', ''),
    java_bin=os.path.join(os.environ['JAVA_HOME'], 'bin'),
    hadoop_bin=os.path.join(os.environ['HADOOP_HOME'], 'bin'),
)

print(f"JAVA_HOME: {os.environ.get('JAVA_HOME')}")
print(f"HADOOP_HOME: {os.environ.get('HADOOP_HOME')}")

# --- INITIALIZE SPARK ---
print("Initializing SparkSession... (This may take a few seconds)")

# Every setting is passed explicitly; this helps avoid Windows-specific hangs.
# The warehouse directory is resolved to an absolute path up front.
warehouse_dir = os.path.abspath("spark-warehouse")

spark = (
    SparkSession.builder
    .appName("LocalPipeline")
    .master("local[*]")
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.sql.warehouse.dir", warehouse_dir)
    .getOrCreate()
)

print("SparkSession created successfully!")

# COMMAND ----------
# 1. Configuration: input is read from the local 'test_data' folder.
# Note: this runs locally on your machine, not on the Databricks cluster.

FILE_FORMAT = "json"
LOCAL_FOLDER_NAME = "test_data"

# The folder is resolved against the current working directory of the kernel
# (this matches the notebook's own directory only if the kernel was started there).
current_dir = os.getcwd()
local_path = os.path.join(current_dir, LOCAL_FOLDER_NAME)

print(f"Attempting to read data from local path: {local_path}")

JAVA_HOME: C:\Program Files\Java\jdk-17
HADOOP_HOME: C:\hadoop
Initializing SparkSession... (This may take a few seconds)
SparkSession created successfully!
Attempting to read data from local path: d:\study\databricks\data_cleaning\local_environment\test_data


In [2]:
def get_optimal_partition_col(df, candidates, target_partitions=20, max_cardinality=100):
    """
    Select the candidate column whose cardinality is closest to a target.

    For 1M rows, we aim for ~20 partitions to avoid small files.

    Each candidate that exists in ``df.columns`` is measured with
    ``df.select(col).distinct().count()``, i.e. one count per candidate.
    A column is eligible only if its distinct count is greater than 1 and
    no larger than ``max_cardinality``. Among eligible columns the one with
    the smallest ``abs(cardinality - target_partitions)`` wins; on a tie the
    candidate listed first is kept.

    Args:
        df: DataFrame-like object exposing ``columns`` and
            ``select(name).distinct().count()``.
        candidates: Column names to consider, in priority order. Names not
            present in ``df`` and repeated names are skipped. ``None`` or an
            empty sequence yields ``None``.
        target_partitions: Desired number of distinct partition values.
        max_cardinality: Inclusive upper bound on the distinct count for a
            column to be eligible (defaults to the previous fixed limit, 100).

    Returns:
        The name of the selected column, or ``None`` if no candidate is
        eligible.
    """
    best_col = None
    best_diff = float('inf')
    available = set(df.columns)
    already_measured = set()

    for col in candidates or ():
        # Skip unknown columns, and repeats so each column is counted once.
        if col not in available or col in already_measured:
            continue
        already_measured.add(col)

        cardinality = df.select(col).distinct().count()
        if 1 < cardinality <= max_cardinality:
            diff = abs(cardinality - target_partitions)
            # Strict '<' keeps the earliest candidate when distances tie.
            if diff < best_diff:
                best_diff = diff
                best_col = col

    return best_col

In [3]:
# COMMAND ----------
# 2. Reading Data
# Load everything under `local_path` in the configured format, with the
# "multiline" reader option switched on, then list the top-level columns.
raw_reader = spark.read.format(FILE_FORMAT).option("multiline", "true")
df_raw = raw_reader.load(local_path)
df_raw.columns

['address',
 'cart',
 'category',
 'order',
 'order_item',
 'payment',
 'product',
 'products_sku',
 'subcategory',
 'user',
 'wishlist']

In [4]:

# Expand the nested `user` column into one column per field and preview it.
# NOTE(review): the preview prints the email, password and phone_number
# columns in full - clear this cell's output before sharing the notebook.
user_table = df_raw.select("user.*")
user_table.show(n=5, truncate=False)

+---+-------------+-----------------------------+--------------------------+--------------------------+-----------------------+--------------------------------------------+-----------------------------------------------------+----------+---------------------+-----------------+---+--------------+
|age|birth_of_date|company                      |create_time               |delete_time               |email                  |id                                          |job                                                  |password  |phone_number         |real_name        |sex|username      |
+---+-------------+-----------------------------+--------------------------+--------------------------+-----------------------+--------------------------------------------+-----------------------------------------------------+----------+---------------------+-----------------+---+--------------+
|67 |1958-11-27   |Mcknight-Guzman              |2025-11-29 11:57:14.220683|2025-11-29 11:57:14.862795|katiep

In [5]:
# --- Normalise -----------------------------------------------------------
# Lower-case and trim the free-text attributes so values that differ only in
# case or surrounding whitespace compare equal.
user_table = df_raw.select("user.*")
for text_col in ("real_name", "company", "job"):
    user_table = user_table.withColumn(text_col, F.trim(F.lower(F.col(text_col))))

# --- Pick one canonical row per real_name --------------------------------
# Ordering a window by its own partition key makes every row in the group a
# tie, so row_number() == 1 would select an arbitrary row that can change
# between runs. Order by create_time, then id, so the choice is reproducible.
# NOTE(review): "earliest create_time wins" is an assumption - flip to
# .desc() if the most recent record should be the canonical one.
window_spec = Window.partitionBy("real_name").orderBy(
    F.col("create_time").asc(),
    F.col("id").asc(),
)
df_with_row_number = user_table.withColumn("row_number", F.row_number().over(window_spec))

canonical_users = df_with_row_number.filter(F.col("row_number") == 1).select(
    "real_name",
    "age",
    "birth_of_date",
    "company",
    "job",
    "phone_number",
    "sex",
)

# --- Broadcast the canonical attributes back onto every raw row ----------
updated_df = user_table.alias("raw").join(
    canonical_users.alias("cleaned"),
    on=F.col("raw.real_name") == F.col("cleaned.real_name"),
    how="left",
)
cleaned_columns = canonical_users.columns
raw_columns = user_table.columns
raw_only_columns = [name for name in raw_columns if name not in cleaned_columns]

# A raw row with a NULL real_name never satisfies the equality join, so it has
# no "cleaned" side. Taking cleaned.* unconditionally would blank out its age,
# company, job, etc.; fall back to the row's own values in that case.
has_canonical_row = F.col("cleaned.real_name").isNotNull()
df_cleaned = updated_df.select(
    *[
        F.when(has_canonical_row, F.col(f"cleaned.{name}"))
        .otherwise(F.col(f"raw.{name}"))
        .alias(name)
        for name in cleaned_columns
    ],
    *[F.col(f"raw.{name}").alias(name) for name in raw_only_columns],
)

# --- Choose a partition column -------------------------------------------
# Candidates that are not columns of df_cleaned are skipped by the helper.
candidates = ["sex", "job", "company", "age", "country_code"]
selected_partition_col = get_optimal_partition_col(df_cleaned, candidates, target_partitions=20)

In [6]:
# Display the partition column chosen by get_optimal_partition_col in the previous cell.
selected_partition_col

'sex'