In [1]:
from pyspark.sql import SparkSession
import os

# Create the Spark session. The S3A connector jars are placed on both the
# driver and executor classpaths so that s3a:// paths can be resolved.
S3A_CLASSPATH = (
    "/opt/spark/jars/hadoop-aws-3.3.4.jar:"
    "/opt/spark/jars/aws-java-sdk-bundle-1.12.262.jar"
)
spark = (
    SparkSession.builder
    .appName("JupyterS3")
    .config("spark.driver.extraClassPath", S3A_CLASSPATH)
    .config("spark.executor.extraClassPath", S3A_CLASSPATH)
    .getOrCreate()
)

# Configure S3. os.getenv returns None for an unset variable, so check the
# credentials up front and report the problem by name rather than letting a
# None value reach the JVM-side configuration call.
aws_access_key = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
if not aws_access_key or not aws_secret_key:
    raise EnvironmentError(
        "AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY must both be set "
        "before configuring S3A access"
    )

hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", aws_access_key)
hadoop_conf.set("fs.s3a.secret.key", aws_secret_key)
hadoop_conf.set("fs.s3a.endpoint", "s3.amazonaws.com")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# Read one raw campaign extract from S3 and preview it.
# NOTE(review): the preview includes sender e-mail addresses; clear the cell
# output before sharing the notebook.
df = spark.read.parquet("s3a://mailshake-analytics/raw/client_id=client_1/entity=campaign/date=2025-12-25/campaign.parquet")
df.show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/28 00:47:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/28 00:47:44 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

+--------+-------+--------------------+--------------------+----------+--------+------------+--------------------+--------------------+-------------+-------------------+--------------------+--------------------+--------------------+
|  object|     id|               title|             created|isArchived|isPaused|wizardStatus|            messages|                 url|sender.object|          sender.id| sender.emailAddress|     sender.fromName|      sender.created|
+--------+-------+--------------------+--------------------+----------+--------+------------+--------------------+--------------------+-------------+-------------------+--------------------+--------------------+--------------------+
|campaign|1488098|2025_12_Sanghamit...|2025-12-19T14:32:...|     false|   false|  InProgress|[{4065551, false,...|https://mailshake...|       sender|164486-178439-false|palsanghamitra22@...|     Sanghamitra Pal|2025-12-19T14:23:...|
|campaign|1488093|2025_12_Vansh_Gup...|2025-12-19T14:12:...|     fal

In [3]:
# Load the modified campaign extract for client_1 / 2025-12-25 and print its
# column tree. Note that this rebinds `df`.
campaign_modified_path = (
    "s3a://mailshake-analytics/raw/client_id=client_1/entity=campaign/"
    "date=2025-12-25/campaign_modified.parquet"
)
df = spark.read.parquet(campaign_modified_path)
df.printSchema()

root
 |-- object: string (nullable = true)
 |-- id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- created: string (nullable = true)
 |-- isArchived: boolean (nullable = true)
 |-- isPaused: boolean (nullable = true)
 |-- wizardStatus: string (nullable = true)
 |-- messages: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- isPaused: boolean (nullable = true)
 |    |    |-- object: string (nullable = true)
 |    |    |-- replyToID: long (nullable = true)
 |    |    |-- subject: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- url: string (nullable = true)
 |-- sender.object: string (nullable = true)
 |-- sender.id: string (nullable = true)
 |-- sender.emailAddress: string (nullable = true)
 |-- sender.fromName: string (nullable = true)
 |-- sender.created: string (nullable = true)
 |-- processed_timestamp: timestamp (nullable = true)



In [None]:
# ============================================================================
# IMPORTS
# ============================================================================
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import os
from datetime import datetime, timedelta

# ============================================================================
# SPARK SESSION
# ============================================================================
# Both the driver and the executors need the S3A connector jars on their
# classpath, so the same jar list is used for both settings.
S3A_CLASSPATH = (
    "/opt/spark/jars/hadoop-aws-3.3.4.jar:"
    "/opt/spark/jars/aws-java-sdk-bundle-1.12.262.jar"
)
spark = (
    SparkSession.builder
    .appName("CampaignMessagesCurationIncremental")
    .config("spark.driver.extraClassPath", S3A_CLASSPATH)
    .config("spark.executor.extraClassPath", S3A_CLASSPATH)
    .getOrCreate()
)

# ============================================================================
# S3 CONFIG
# ============================================================================
# os.getenv returns None for an unset variable; validate each credential first
# so a missing one is reported by name instead of failing inside the JVM call.
hadoop_conf = spark._jsc.hadoopConfiguration()
for env_name, conf_key in (
    ("AWS_ACCESS_KEY_ID", "fs.s3a.access.key"),
    ("AWS_SECRET_ACCESS_KEY", "fs.s3a.secret.key"),
):
    env_value = os.getenv(env_name)
    if not env_value:
        raise EnvironmentError(f"{env_name} is not set; cannot configure S3A credentials")
    hadoop_conf.set(conf_key, env_value)
hadoop_conf.set("fs.s3a.endpoint", "s3.amazonaws.com")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# ============================================================================
# CONFIG
# ============================================================================
# Load mode is read from the environment; any value other than "incremental"
# takes the explicit START_DATE / END_DATE branch below.
LOAD_TYPE = os.getenv("LOAD_TYPE", "incremental")  # incremental | full

BASE_INPUT_PATH = "s3a://mailshake-analytics/raw"
BASE_OUTPUT_PATH = "s3a://mailshake-analytics/curated"
CURATED_PATH = BASE_OUTPUT_PATH + "/entity=campaign_messages/campaign_messages_curated.parquet"

CLIENT_IDS = ["client_1", "client_2", "client_3"]

# ============================================================================
# DETERMINE DATES TO PROCESS (TRUE INCREMENTAL)
# ============================================================================
dates_to_process = []

if LOAD_TYPE != "incremental":
    # Explicit window: both bounds come from the environment, with defaults.
    START_DATE = os.getenv("START_DATE", "2025-12-20")
    END_DATE = os.getenv("END_DATE", datetime.today().strftime("%Y-%m-%d"))

    start_date = datetime.strptime(START_DATE, "%Y-%m-%d")
    end_date = datetime.strptime(END_DATE, "%Y-%m-%d")
else:
    # Resume from the day after the newest source_date already curated; any
    # failure while looking that up falls back to the fixed bootstrap date.
    try:
        existing_df = spark.read.parquet(CURATED_PATH)
        max_date_row = existing_df.select(max("source_date").alias("max_date")).collect()[0]
        last_processed_date = max_date_row["max_date"]

        print(f"üìå Last processed date: {last_processed_date}")
        start_date = datetime.strptime(str(last_processed_date), "%Y-%m-%d") + timedelta(days=1)

    except Exception:
        print("‚ö†Ô∏è No curated data found. Bootstrapping full load.")
        start_date = datetime.strptime("2025-12-20", "%Y-%m-%d")

    end_date = datetime.today()

# Both modes expand the inclusive [start_date, end_date] window into
# YYYY-MM-DD strings; a negative span yields an empty list.
window_days = (end_date - start_date).days + 1
dates_to_process = [
    (start_date + timedelta(days=offset)).strftime("%Y-%m-%d")
    for offset in range(window_days)
]

if len(dates_to_process) == 0:
    print("‚úÖ No new dates to process. Exiting.")
    spark.stop()
    exit(0)

print(f"üìÖ Dates to process: {dates_to_process}")

# ============================================================================
# HELPER: FLATTEN STRUCT COLUMNS
# ============================================================================
def flatten_struct_columns(df):
    """Return `df` with every top-level struct column expanded into scalar columns.

    Each sub-field of a struct column `parent` becomes a column named
    `parent_child`. Non-struct columns are selected unchanged. Only one level
    of nesting is expanded.

    Column names are wrapped in backticks so that a name containing a literal
    dot (for example `sender.object`) is treated as a single column name
    rather than as struct-field access.
    """
    flat_cols = []
    for field in df.schema.fields:
        quoted_name = f"`{field.name}`"
        if isinstance(field.dataType, StructType):
            for sub in field.dataType.fields:
                flat_cols.append(
                    col(f"{quoted_name}.`{sub.name}`").alias(f"{field.name}_{sub.name}")
                )
        else:
            flat_cols.append(col(quoted_name))
    return df.select(flat_cols)

# ============================================================================
# READ RAW DATA
# ============================================================================
# Collect one DataFrame per (date, client) raw extract; any extract that
# fails to load is logged and skipped.
all_dfs = []

for process_date in dates_to_process:
    for client_id in CLIENT_IDS:
        input_path = (
            f"{BASE_INPUT_PATH}/client_id={client_id}/entity=campaign/"
            f"date={process_date}/campaign.parquet"
        )
        try:
            print(f"üìÇ Reading {client_id} | {process_date}")
            df = spark.read.parquet(input_path)

            # Tag each row with the client and source date it was read from.
            df = df.withColumn("client_id", lit(client_id)).withColumn("source_date", lit(process_date))

            all_dfs.append(df)

        except Exception as err:
            print(f"‚ö†Ô∏è Skipping {client_id} {process_date}: {err}")

if len(all_dfs) == 0:
    print("‚ùå No data loaded.")
    spark.stop()
    exit(1)

# Fold every loaded frame into one, matching columns by name and filling
# columns that are missing from some extracts.
pending_frames = iter(all_dfs)
combined_df = next(pending_frames)
for df in pending_frames:
    combined_df = combined_df.unionByName(df, allowMissingColumns=True)

# ============================================================================
# EXPLODE MESSAGES
# ============================================================================
# One output row per element of `messages`; explode_outer is used instead of
# explode, and the original array column is dropped afterwards.
message_column = explode_outer(col("messages"))
exploded_df = combined_df.withColumn("message", message_column).drop("messages")

# ============================================================================
# FLATTEN STRUCTS
# ============================================================================
flattened_df = flatten_struct_columns(exploded_df)

# ============================================================================
# METADATA
# ============================================================================
curated_df = flattened_df.withColumn("processing_timestamp", current_timestamp())
curated_df = curated_df.withColumn("load_type", lit(LOAD_TYPE))
curated_df = curated_df.withColumn("processing_date", current_date())

# ============================================================================
# DEDUPLICATION (CRITICAL)
# ============================================================================
dedup_keys = ["client_id", "id", "message_id", "source_date"]
curated_df = curated_df.dropDuplicates(dedup_keys)

# ============================================================================
# WRITE (DYNAMIC PARTITION OVERWRITE)
# ============================================================================
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

(
    curated_df.write
    .mode("overwrite")
    .partitionBy("client_id", "source_date")
    .parquet(CURATED_PATH)
)

print("‚úÖ Write complete")

# ============================================================================
# SUMMARY
# ============================================================================
print("üìä Records written:", curated_df.count())
per_partition_counts = curated_df.groupBy("client_id", "source_date").count()
per_partition_counts.show()

spark.stop()
print("üéâ Job completed successfully")


In [1]:
# ============================================================================
# IMPORTS
# ============================================================================
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime, timedelta
import os

# ============================================================================
# SPARK SESSION
# ============================================================================
# Both the driver and the executors need the S3A connector jars on their
# classpath, so the same jar list is used for both settings.
S3A_CLASSPATH = (
    "/opt/spark/jars/hadoop-aws-3.3.4.jar:"
    "/opt/spark/jars/aws-java-sdk-bundle-1.12.262.jar"
)
spark = (
    SparkSession.builder
    .appName("CampaignMessagesCuration")
    .config("spark.driver.extraClassPath", S3A_CLASSPATH)
    .config("spark.executor.extraClassPath", S3A_CLASSPATH)
    .getOrCreate()
)

# ============================================================================
# S3 CONFIG
# ============================================================================
# os.getenv returns None for an unset variable; validate each credential first
# so a missing one is reported by name instead of failing inside the JVM call.
hadoop_conf = spark._jsc.hadoopConfiguration()
for env_name, conf_key in (
    ("AWS_ACCESS_KEY_ID", "fs.s3a.access.key"),
    ("AWS_SECRET_ACCESS_KEY", "fs.s3a.secret.key"),
):
    env_value = os.getenv(env_name)
    if not env_value:
        raise EnvironmentError(f"{env_name} is not set; cannot configure S3A credentials")
    hadoop_conf.set(conf_key, env_value)
hadoop_conf.set("fs.s3a.endpoint", "s3.amazonaws.com")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# ============================================================================
# CONFIG
# ============================================================================
BASE_INPUT_PATH = "s3a://mailshake-analytics/raw"
BASE_OUTPUT_PATH = "s3a://mailshake-analytics/curated"
OUTPUT_PATH = BASE_OUTPUT_PATH + "/entity=campaign/"

CLIENT_IDS = ["client_1", "client_2", "client_3"]

# Single-date override for backfill / fix.
# A falsy value (e.g. None) switches the cell to incremental mode.
PROCESS_DATE = "2025-12-25"  # change as needed; set None for incremental

# ============================================================================
# DETERMINE DATES TO PROCESS
# ============================================================================
dates_to_process = []

if not PROCESS_DATE:
    print("üîÑ Incremental mode enabled")
    # Resume from the day after the newest curated source_date; any failure
    # while looking that up falls back to the fixed initial date.
    try:
        existing_df = spark.read.parquet(OUTPUT_PATH)
        max_date_row = existing_df.select(max("source_date").alias("max_date")).collect()[0]
        last_processed_date = max_date_row["max_date"]
        print(f"üìå Last processed date: {last_processed_date}")
        start_date = datetime.strptime(str(last_processed_date), "%Y-%m-%d") + timedelta(days=1)
    except Exception:
        print("‚ö†Ô∏è No curated data found. Bootstrapping from initial date.")
        start_date = datetime.strptime("2025-12-20", "%Y-%m-%d")

    end_date = datetime.today()
    # Inclusive day-by-day window; a negative span yields an empty list.
    window_days = (end_date - start_date).days + 1
    dates_to_process = [
        (start_date + timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(window_days)
    ]
else:
    print(f"üìÖ Single-date mode enabled: {PROCESS_DATE}")
    dates_to_process = [PROCESS_DATE]

if len(dates_to_process) == 0:
    print("‚úÖ No dates to process. Exiting cleanly.")
    spark.stop()
    exit(0)

print(f"üìÖ Dates to process: {dates_to_process}")

# ============================================================================
# HELPER: FLATTEN STRUCT COLUMNS (SAFE WITH BACKTICKS)
# ============================================================================
def flatten_struct_columns(df):
    """Expand each top-level struct column of `df` into `parent_child` columns.

    Non-struct columns are selected as-is. Every name is backtick-quoted so a
    literal dot inside a column name is not read as struct-field access. Only
    one level of nesting is expanded.
    """
    selected = []
    for field in df.schema.fields:
        parent = field.name
        if not isinstance(field.dataType, StructType):
            selected.append(col(f"`{parent}`"))
            continue
        # Struct column: emit one aliased column per direct sub-field.
        selected.extend(
            col(f"`{parent}`.`{child.name}`").alias(f"{parent}_{child.name}")
            for child in field.dataType.fields
        )
    return df.select(selected)

# ============================================================================
# READ RAW DATA
# ============================================================================
# Collect one DataFrame per (date, client) raw extract; any extract that
# fails to load is logged and skipped.
dfs = []

for process_date in dates_to_process:
    for client_id in CLIENT_IDS:
        input_path = (
            f"{BASE_INPUT_PATH}/client_id={client_id}/entity=campaign/"
            f"date={process_date}/campaign.parquet"
        )
        try:
            print(f"üìÇ Reading {client_id} | {process_date}")
            df = spark.read.parquet(input_path)
            # Tag each row with the client and source date it was read from.
            df = (
                df.withColumn("client_id", lit(client_id))
                  .withColumn("source_date", lit(process_date))
            )
            dfs.append(df)
        except Exception as err:
            print(f"‚ö†Ô∏è Skipping {client_id} {process_date}: {err}")

if len(dfs) == 0:
    print("‚ùå No raw data loaded.")
    spark.stop()
    exit(1)

# Fold every loaded frame into one, matching columns by name and filling
# columns that are missing from some extracts.
pending_frames = iter(dfs)
combined_df = next(pending_frames)
for df in pending_frames:
    combined_df = combined_df.unionByName(df, allowMissingColumns=True)

# ============================================================================
# EXPLODE MESSAGES
# ============================================================================
message_column = explode_outer(col("messages"))
exploded_df = combined_df.withColumn("message", message_column).drop("messages")

# ============================================================================
# FLATTEN STRUCTS
# ============================================================================
flattened_df = flatten_struct_columns(exploded_df)

# ============================================================================
# ADD METADATA
# ============================================================================
load_type_label = "single_date" if PROCESS_DATE else "incremental"
curated_df = (
    flattened_df
    .withColumn("processing_timestamp", current_timestamp())
    .withColumn("load_type", lit(load_type_label))
    .withColumn("processing_date", current_date())
)

# ============================================================================
# DEDUPLICATE (RERUN SAFE)
# ============================================================================
dedup_keys = ["client_id", "id", "message_id", "source_date"]
curated_df = curated_df.dropDuplicates(dedup_keys)

# ============================================================================
# WRITE (DYNAMIC PARTITION OVERWRITE)
# ============================================================================
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
(
    curated_df.write
    .mode("overwrite")
    .partitionBy("client_id", "source_date")
    .parquet(OUTPUT_PATH)
)

# ============================================================================
# SUMMARY
# ============================================================================
print("‚úÖ Write complete")
print("üìä Records written:", curated_df.count())
per_partition_counts = curated_df.groupBy("client_id", "source_date").count()
per_partition_counts.show()

spark.stop()
print("üéâ Job completed successfully")


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/30 15:05:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


üìÖ Single-date mode enabled: 2025-12-25
üìÖ Dates to process: ['2025-12-25']
üìÇ Reading client_1 | 2025-12-25


25/12/30 15:05:39 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

üìÇ Reading client_2 | 2025-12-25
üìÇ Reading client_3 | 2025-12-25


25/12/30 15:05:52 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

‚úÖ Write complete


                                                                                

üìä Records written: 75
+---------+-----------+-----+
|client_id|source_date|count|
+---------+-----------+-----+
| client_1| 2025-12-25|   40|
| client_3| 2025-12-25|   12|
| client_2| 2025-12-25|   23|
+---------+-----------+-----+

üéâ Job completed successfully


In [7]:
# ============================================================================
# IMPORTS
# ============================================================================
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime, timedelta
import os

# ============================================================================
# SPARK SESSION
# ============================================================================
# Both the driver and the executors need the S3A connector jars on their
# classpath, so the same jar list is used for both settings.
S3A_CLASSPATH = (
    "/opt/spark/jars/hadoop-aws-3.3.4.jar:"
    "/opt/spark/jars/aws-java-sdk-bundle-1.12.262.jar"
)
spark = (
    SparkSession.builder
    .appName("CampaignMessagesCuration")
    .config("spark.driver.extraClassPath", S3A_CLASSPATH)
    .config("spark.executor.extraClassPath", S3A_CLASSPATH)
    .getOrCreate()
)

# ============================================================================
# S3 CONFIG
# ============================================================================
# os.getenv returns None for an unset variable; validate each credential first
# so a missing one is reported by name instead of failing inside the JVM call.
hadoop_conf = spark._jsc.hadoopConfiguration()
for env_name, conf_key in (
    ("AWS_ACCESS_KEY_ID", "fs.s3a.access.key"),
    ("AWS_SECRET_ACCESS_KEY", "fs.s3a.secret.key"),
):
    env_value = os.getenv(env_name)
    if not env_value:
        raise EnvironmentError(f"{env_name} is not set; cannot configure S3A credentials")
    hadoop_conf.set(conf_key, env_value)
hadoop_conf.set("fs.s3a.endpoint", "s3.amazonaws.com")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# ============================================================================
# CONFIG
# ============================================================================
BASE_INPUT_PATH = "s3a://mailshake-analytics/raw"
BASE_OUTPUT_PATH = "s3a://mailshake-analytics/curated"

# Load client_1's raw activity_open extract for 2025-12-25.
df_activity_open = spark.read.parquet(
    f"{BASE_INPUT_PATH}/client_id=client_1/entity=activity_open/"
    "date=2025-12-25/activity_open_20251226T002548.parquet"
)

In [9]:
# Load client_1's raw activity_reply extract for 2025-12-25.
df_activity_reply = spark.read.parquet(
    "s3a://mailshake-analytics/raw/client_id=client_1/entity=activity_reply/"
    "date=2025-12-25/activity_reply_20251226T002549.parquet"
)

In [12]:
# Load client_1's raw activity_sent extract for 2025-12-25.
df_activity_sent = spark.read.parquet(
    "s3a://mailshake-analytics/raw/client_id=client_1/entity=activity_sent/"
    "date=2025-12-25/activity_sent_20251226T002549.parquet"
)

In [14]:
# Load client_1's raw created_leads extract for 2025-12-25.
df_created_leads = spark.read.parquet(
    "s3a://mailshake-analytics/raw/client_id=client_1/entity=created_leads/"
    "date=2025-12-25/created_leads_20251226T002549.parquet"
)

In [6]:
# Print the column tree of the activity_open frame.
# Requires df_activity_open to have been defined by a previously run cell.
df_activity_open.printSchema()

root
 |-- object: string (nullable = true)
 |-- id: long (nullable = true)
 |-- actionDate: string (nullable = true)
 |-- isDuplicate: boolean (nullable = true)
 |-- recipient.object: string (nullable = true)
 |-- recipient.id: long (nullable = true)
 |-- recipient.emailAddress: string (nullable = true)
 |-- recipient.fullName: string (nullable = true)
 |-- recipient.created: string (nullable = true)
 |-- recipient.isPaused: boolean (nullable = true)
 |-- recipient.contactID: long (nullable = true)
 |-- recipient.first: string (nullable = true)
 |-- recipient.last: string (nullable = true)
 |-- recipient.fields.link: string (nullable = true)
 |-- recipient.fields.position: string (nullable = true)
 |-- recipient.fields.date applied: string (nullable = true)
 |-- recipient.fields.account: string (nullable = true)
 |-- recipient.fields.phoneNumber: string (nullable = true)
 |-- recipient.fields.facebookUrl: string (nullable = true)
 |-- recipient.fields.instagramID: string (nullable = tr

In [10]:
# Print the column tree of the activity_reply frame.
# Requires df_activity_reply to have been defined by a previously run cell.
df_activity_reply.printSchema()

root
 |-- object: string (nullable = true)
 |-- id: long (nullable = true)
 |-- actionDate: string (nullable = true)
 |-- type: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- externalID: string (nullable = true)
 |-- externalRawMessageID: string (nullable = true)
 |-- externalConversationID: string (nullable = true)
 |-- rawBody: string (nullable = true)
 |-- body: string (nullable = true)
 |-- plainTextBody: string (nullable = true)
 |-- recipient.object: string (nullable = true)
 |-- recipient.id: long (nullable = true)
 |-- recipient.emailAddress: string (nullable = true)
 |-- recipient.fullName: string (nullable = true)
 |-- recipient.created: string (nullable = true)
 |-- recipient.isPaused: boolean (nullable = true)
 |-- recipient.contactID: long (nullable = true)
 |-- recipient.first: string (nullable = true)
 |-- recipient.last: string (nullable = true)
 |-- recipient.fields.account: string (nullable = true)
 |-- recipient.fields.phoneNumber: string (nulla

In [None]:
# Bare expression: the cell output is the DataFrame object's own repr.
# Requires df_activity_sent to have been defined by a previously run cell.
df_activity_sent

In [13]:
# Print the column tree of the activity_sent frame.
# Requires df_activity_sent to have been defined by a previously run cell.
df_activity_sent.printSchema()

root
 |-- object: string (nullable = true)
 |-- id: long (nullable = true)
 |-- actionDate: string (nullable = true)
 |-- type: string (nullable = true)
 |-- excludeBody: boolean (nullable = true)
 |-- to: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- address: string (nullable = true)
 |    |    |-- first: string (nullable = true)
 |    |    |-- fullName: integer (nullable = true)
 |    |    |-- last: string (nullable = true)
 |    |    |-- object: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- externalID: string (nullable = true)
 |-- externalRawMessageID: string (nullable = true)
 |-- externalConversationID: string (nullable = true)
 |-- rawBody: string (nullable = true)
 |-- body: string (nullable = true)
 |-- plainTextBody: string (nullable = true)
 |-- recipient.object: string (nullable = true)
 |-- recipient.id: long (nullable = true)
 |-- recipient.emailAddress: string (nullable = true)
 |-- recipient.fullName: strin

In [16]:
# Preview the created_leads frame as a text table.
# NOTE(review): the rendered output includes recipient columns such as
# recipient.emailAddress and recipient.fullName; clear the cell output before
# sharing the notebook.
df_created_leads.show()

+------+--------+--------------------+--------------------+--------------------+----------+------+----------------+------------+----------------------+------------------+--------------------+------------------+-------------------+---------------+--------------+---------------------+-------------------------+-----------------------------+------------------------+----------------------------+----------------------------+----------------------------+----------------------------+--------------------------+---------------+-----------+--------------------+---------------------+
|object|      id|             created|          openedDate|lastStatusChangeDate|annotation|status|recipient.object|recipient.id|recipient.emailAddress|recipient.fullName|   recipient.created|recipient.isPaused|recipient.contactID|recipient.first|recipient.last|recipient.fields.link|recipient.fields.position|recipient.fields.date applied|recipient.fields.account|recipient.fields.phoneNumber|recipient.fields.facebookUrl

In [34]:
# ============================================================================
# IMPORTS
# ============================================================================
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max, lit, current_timestamp, current_date, explode_outer
from pyspark.sql.types import StructType
from datetime import datetime, timedelta
import os

# ============================================================================
# SPARK SESSION
# ============================================================================
# Both the driver and the executors need the S3A connector jars on their
# classpath, so the same jar list is used for both settings.
S3A_CLASSPATH = (
    "/opt/spark/jars/hadoop-aws-3.3.4.jar:"
    "/opt/spark/jars/aws-java-sdk-bundle-1.12.262.jar"
)
spark = (
    SparkSession.builder
    .appName("MailshakeCampaignCurations")
    .config("spark.driver.extraClassPath", S3A_CLASSPATH)
    .config("spark.executor.extraClassPath", S3A_CLASSPATH)
    .getOrCreate()
)

# ============================================================================
# S3 CONFIG
# ============================================================================
# os.getenv returns None for an unset variable; validate each credential first
# so a missing one is reported by name instead of failing inside the JVM call.
hadoop_conf = spark._jsc.hadoopConfiguration()
for env_name, conf_key in (
    ("AWS_ACCESS_KEY_ID", "fs.s3a.access.key"),
    ("AWS_SECRET_ACCESS_KEY", "fs.s3a.secret.key"),
):
    env_value = os.getenv(env_name)
    if not env_value:
        raise EnvironmentError(f"{env_name} is not set; cannot configure S3A credentials")
    hadoop_conf.set(conf_key, env_value)
hadoop_conf.set("fs.s3a.endpoint", "s3.amazonaws.com")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# ============================================================================
# CONFIG
# ============================================================================
# Root S3 prefixes for raw input and curated output, and the clients to process.
RAW_PATH = "s3a://mailshake-analytics/raw"
CURATED_PATH = "s3a://mailshake-analytics/curated"
CLIENT_IDS = ["client_1", "client_2", "client_3"]

# SINGLE DATE MODE: set this to a "YYYY-MM-DD" string to process just that one date.
# For incremental mode, set it to None.
# Currently set to a fixed date, so single-date mode is selected.
SINGLE_DATE = "2025-12-25"

# Start date used in incremental mode for a client with no curated data yet.
BOOTSTRAP_DATE = "2025-12-20"

# ============================================================================
# HELPER: Flatten struct columns
# ============================================================================
def flatten_struct_columns(df):
    """Return `df` with each top-level struct column replaced by its sub-fields.

    A struct column `parent` with sub-field `child` yields a column aliased
    `parent_child`; every other column is selected unchanged. Names are
    backtick-quoted so that dots inside a column name are taken literally.
    Only one level of nesting is expanded.
    """
    def _columns_for(field):
        # Build the output column(s) contributed by a single schema field.
        quoted_parent = f"`{field.name}`"
        if isinstance(field.dataType, StructType):
            return [
                col(f"{quoted_parent}.`{child.name}`").alias(f"{field.name}_{child.name}")
                for child in field.dataType.fields
            ]
        return [col(quoted_parent)]

    projected = []
    for field in df.schema.fields:
        projected += _columns_for(field)
    return df.select(projected)

# ============================================================================
# HELPER: Get dates to process per client
# ============================================================================
def get_dates_to_process(curated_path, client_ids, single_date=None, bootstrap_date="2025-12-20"):
    """Work out which source dates each client needs processed.

    Args:
        curated_path: Parquet location read (via the module-level `spark`
            session) to find each client's newest `source_date`.
        client_ids: Client identifiers to build a date list for.
        single_date: When truthy, every client gets exactly this one date and
            the curated data is not read.
        bootstrap_date: "YYYY-MM-DD" start date for a client with no usable
            last date in the curated data.

    Returns:
        dict mapping each client id to a list of "YYYY-MM-DD" strings running
        from that client's start date through today (inclusive). The list is
        empty when the start date lies after today.

    Any failure while reading the curated data is logged together with its
    cause and treated as "no curated data", so every client is bootstrapped.
    """
    print("Single date:", single_date)
    print(client_ids)
    if single_date:
        print(f"üìÖ Single-date mode enabled: {single_date}")
        return {client: [single_date] for client in client_ids}

    # Incremental mode
    try:
        existing_df = spark.read.parquet(curated_path)
        last_dates = existing_df.groupBy("client_id").agg(max("source_date").alias("last_date")).collect()
        last_date_dict = {row["client_id"]: row["last_date"] for row in last_dates}
    except Exception as exc:
        print("‚ö†Ô∏è No curated data found. Bootstrapping all clients.")
        # Surface the cause so that, for example, a credentials or network
        # failure is not mistaken for a genuinely empty curated area.
        print(f"   Reason: {type(exc).__name__}: {exc}")
        last_date_dict = {}

    today = datetime.today()
    dates_to_process = {}
    for client in client_ids:
        last_date = last_date_dict.get(client)
        if last_date is not None:
            # Resume on the day after the newest date already curated.
            start = datetime.strptime(str(last_date), "%Y-%m-%d") + timedelta(days=1)
        else:
            # Unknown client, or a null max date: start from the bootstrap date
            # rather than trying to parse the string "None".
            start = datetime.strptime(bootstrap_date, "%Y-%m-%d")
        span_days = (today - start).days + 1
        dates_to_process[client] = [
            (start + timedelta(days=offset)).strftime("%Y-%m-%d")
            for offset in range(span_days)
        ]
    return dates_to_process

# ============================================================================
# GENERALIZED DATASET PROCESSOR
# ============================================================================
def process_dataset(
    raw_base_path,
    curated_base_path,
    client_ids,
    dataset_name,
    unique_keys,
    explode_col=None,
    dates_per_client=None
):
    """
    Curate one raw dataset into the entity-first curated layout.

    For each client and each date scheduled for it, the raw Parquet data at
    ``{raw_base_path}/client_id=<id>/entity=<dataset>/date=<date>/`` is read,
    tagged with load metadata, optionally exploded, de-duplicated, flattened
    and written to ``{curated_base_path}/entity=<dataset>`` partitioned by
    ``client_id`` and ``source_date``.

    Args:
        raw_base_path: Root of the raw zone.
        curated_base_path: Root of the curated zone.
        client_ids: Iterable of client identifiers to process.
        dataset_name: Entity name used in both the raw and curated paths.
        unique_keys: Column names that, together with ``client_id`` and
            ``source_date``, identify a record for de-duplication.
        explode_col: Optional column passed through ``explode_outer``.
        dates_per_client: Mapping of client id to the dates to process.
            ``None`` is treated as an empty schedule.

    Returns:
        None. Progress, per-partition counts and skipped partitions are
        printed. A failing partition is reported and does not abort the run.

    Note:
        ``load_type`` is derived from the module-level ``SINGLE_DATE``.
    """
    # Entity-first curated path
    entity_curated_path = f"{curated_base_path}/entity={dataset_name}"

    # Required for safe incremental overwrites: only the partitions present in
    # the written DataFrame are replaced.
    spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

    # A missing schedule means "nothing to do" rather than an AttributeError
    # on None.get(...).
    dates_per_client = dates_per_client or {}

    # Loop-invariant de-duplication key, built once.
    dedup_keys = list(unique_keys) + ["client_id", "source_date"]

    records_written = 0

    for client_id in client_ids:
        for process_date in dates_per_client.get(client_id, []):
            input_path = (
                f"{raw_base_path}/client_id={client_id}/entity={dataset_name}/date={process_date}/"
            )

            try:
                print(f"üìÇ {dataset_name} | {client_id} | {process_date}")

                df = spark.read.parquet(input_path)

                # Metadata
                df = (
                    df.withColumn("client_id", lit(client_id))
                      .withColumn("source_date", lit(process_date))
                      .withColumn("processing_timestamp", current_timestamp())
                      .withColumn("processing_date", current_date())
                      .withColumn(
                          "load_type",
                          lit("single_date" if SINGLE_DATE else "incremental")
                      )
                )

                # Explode if required
                if explode_col:
                    df = df.withColumn(explode_col, explode_outer(col(explode_col)))

                # Deduplicate BEFORE flattening
                df = df.dropDuplicates(dedup_keys)

                # Flatten nested structs
                df = flatten_struct_columns(df)

                # Write only this client+date partition
                df.write \
                  .mode("overwrite") \
                  .partitionBy("client_id", "source_date") \
                  .parquet(entity_curated_path)

                # NOTE: df is not cached, so this count re-runs the read and
                # transformations that the write above already executed.
                count = df.count()
                records_written += count

                print(f"‚úÖ Written {count} records")

            except Exception as e:
                # Best effort: report the failing partition and continue.
                print(f"‚ö†Ô∏è Skipped {dataset_name} | {client_id} | {process_date}: {e}")

    if records_written == 0:
        print(f"‚ùå No data written for {dataset_name}")
    else:
        print(f"üéØ {dataset_name} total records written: {records_written}")

# ============================================================================
# GET DATES PER CLIENT
# ============================================================================
dates_per_client = get_dates_to_process(
    CURATED_PATH,
    CLIENT_IDS,
    single_date=SINGLE_DATE,
    bootstrap_date=BOOTSTRAP_DATE,
)
print(dates_per_client)

# ============================================================================
# PROCESS ALL DATASETS
# ============================================================================
# (dataset name, column to explode); only activity_sent explodes its 'to' array.
for _dataset, _explode in (
    ("activity_open", None),
    ("activity_reply", None),
    ("activity_sent", "to"),
    ("created_leads", None),
):
    process_dataset(
        RAW_PATH,
        CURATED_PATH,
        CLIENT_IDS,
        _dataset,
        unique_keys=["id", "recipient.id", "campaign.id"],
        explode_col=_explode,
        dates_per_client=dates_per_client,
    )

# ============================================================================
# JOB COMPLETE
# ============================================================================
spark.stop()
print("üéâ All datasets processed successfully!")


Single date: 2025-12-25
['client_1', 'client_2', 'client_3']
üìÖ Single-date mode enabled: 2025-12-25
{'client_1': ['2025-12-25'], 'client_2': ['2025-12-25'], 'client_3': ['2025-12-25']}
üìÇ activity_open | client_1 | 2025-12-25


                                                                                

‚úÖ Written 100 records
üìÇ activity_open | client_2 | 2025-12-25


                                                                                

‚úÖ Written 100 records
üìÇ activity_open | client_3 | 2025-12-25


                                                                                

‚úÖ Written 100 records
üéØ activity_open total records written: 300
üìÇ activity_reply | client_1 | 2025-12-25


                                                                                

‚úÖ Written 25 records
üìÇ activity_reply | client_2 | 2025-12-25


                                                                                

‚úÖ Written 25 records
üìÇ activity_reply | client_3 | 2025-12-25


                                                                                

‚úÖ Written 25 records
üéØ activity_reply total records written: 75
üìÇ activity_sent | client_1 | 2025-12-25


                                                                                

‚úÖ Written 25 records
üìÇ activity_sent | client_2 | 2025-12-25


                                                                                

‚úÖ Written 25 records
üìÇ activity_sent | client_3 | 2025-12-25


                                                                                

‚úÖ Written 25 records
üéØ activity_sent total records written: 75
üìÇ created_leads | client_1 | 2025-12-25


                                                                                

‚úÖ Written 100 records
üìÇ created_leads | client_2 | 2025-12-25


                                                                                

‚úÖ Written 43 records
üìÇ created_leads | client_3 | 2025-12-25


                                                                                

‚úÖ Written 16 records
üéØ created_leads total records written: 159
üéâ All datasets processed successfully!


In [24]:
# ============================================================================
# IMPORTS
# ============================================================================
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, max, lit, current_timestamp, current_date, explode_outer
)
from pyspark.sql.types import NullType, StringType
from datetime import datetime, timedelta
import os

# ============================================================================
# SPARK SESSION
# ============================================================================
# Classpath handed to both the driver and the executors (hadoop-aws + AWS SDK).
S3A_CLASSPATH = "/opt/spark/jars/hadoop-aws-3.3.4.jar:/opt/spark/jars/aws-java-sdk-bundle-1.12.262.jar"

spark = SparkSession.builder \
    .appName("MailshakeCampaignCurations") \
    .config("spark.driver.extraClassPath", S3A_CLASSPATH) \
    .config("spark.executor.extraClassPath", S3A_CLASSPATH) \
    .getOrCreate()

# ============================================================================
# S3 CONFIG
# ============================================================================
# Credentials come from the environment; a missing variable yields None here.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", os.environ.get("AWS_ACCESS_KEY_ID"))
hadoop_conf.set("fs.s3a.secret.key", os.environ.get("AWS_SECRET_ACCESS_KEY"))
hadoop_conf.set("fs.s3a.endpoint", "s3.amazonaws.com")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# ============================================================================
# CONFIG
# ============================================================================
RAW_PATH = "s3a://mailshake-analytics/raw"
CURATED_PATH = "s3a://mailshake-analytics/curated"
CLIENT_IDS = ["client_1", "client_2", "client_3"]

# A truthy SINGLE_DATE selects single-date mode in get_dates_to_process.
SINGLE_DATE = "2025-12-25"
BOOTSTRAP_DATE = "2025-12-20"

# Overwrite only the partitions present in each written DataFrame.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

# ============================================================================
# HELPERS
# ============================================================================

def sanitize_column_names(df):
    """
    Return ``df`` with every dot in a column name replaced by an underscore.

    Columns without a dot are left untouched. Renaming is done one column at
    a time with ``withColumnRenamed``.
    """
    dotted_names = [name for name in df.columns if "." in name]

    renamed = df
    for name in dotted_names:
        renamed = renamed.withColumnRenamed(name, name.replace(".", "_"))
    return renamed


def fix_void_columns(df):
    """
    Cast every NullType (VOID) column of ``df`` to string.

    Each affected column is announced on stdout before it is cast. Columns of
    any other type are returned unchanged.
    """
    void_names = [
        field.name
        for field in df.schema.fields
        if isinstance(field.dataType, NullType)
    ]

    result = df
    for name in void_names:
        print(f"‚ö†Ô∏è Casting VOID column to string: {name}")
        result = result.withColumn(name, col(name).cast(StringType()))
    return result


def get_dates_to_process(curated_path, client_ids, single_date=None, bootstrap_date="2025-12-20"):
    """
    Decide which ``YYYY-MM-DD`` dates each client still needs processed.

    With a truthy ``single_date`` every client is scheduled for that date
    only. Otherwise the curated Parquet data at ``curated_path`` is read to
    find each client's latest ``source_date``. Processing resumes the day
    after it, or at ``bootstrap_date`` for clients that are not present or
    when the curated data cannot be read, and runs through today.

    Returns:
        dict mapping client id to a list of date strings.
    """
    if single_date:
        print(f"üìÖ Single-date mode: {single_date}")
        return {client_id: [single_date] for client_id in client_ids}

    try:
        curated = spark.read.parquet(curated_path)
        newest_per_client = curated.groupBy("client_id").agg(
            max("source_date").alias("last_date")
        )
        last_loaded = {}
        for row in newest_per_client.collect():
            last_loaded[row["client_id"]] = row["last_date"]
    except Exception:
        # Unreadable curated data: treat every client as never processed.
        last_loaded = {}

    now = datetime.today()
    one_day = timedelta(days=1)
    plan = {}
    for client_id in client_ids:
        if client_id in last_loaded:
            first_day = datetime.strptime(str(last_loaded[client_id]), "%Y-%m-%d") + one_day
        else:
            first_day = datetime.strptime(bootstrap_date, "%Y-%m-%d")

        pending = []
        for offset in range((now - first_day).days + 1):
            pending.append((first_day + offset * one_day).strftime("%Y-%m-%d"))
        plan[client_id] = pending
    return plan

# ============================================================================
# DATASET PROCESSOR
# ============================================================================
def process_dataset(
    raw_base_path,
    curated_base_path,
    client_ids,
    dataset_name,
    unique_keys,
    explode_col=None,
    dates_per_client=None
):
    """
    Curate one raw dataset, one client/date partition at a time.

    Each partition is read from
    ``{raw_base_path}/client_id=<id>/entity=<dataset>/date=<date>/``, has the
    dots in its column names replaced by underscores, is tagged with load
    metadata, optionally exploded, de-duplicated, has NullType columns cast
    to string, and is written to ``{curated_base_path}/entity=<dataset>``
    partitioned by ``client_id`` and ``source_date``.

    Args:
        raw_base_path: Root of the raw zone.
        curated_base_path: Root of the curated zone.
        client_ids: Iterable of client identifiers to process.
        dataset_name: Entity name used in both the raw and curated paths.
        unique_keys: De-duplication columns. Dots are replaced by underscores
            to match the sanitized column names.
        explode_col: Optional column to ``explode_outer``. It is sanitized
            the same way and silently ignored when absent from the data.
        dates_per_client: Mapping of client id to the dates to process.
            ``None`` is treated as an empty schedule.

    Returns:
        None. Progress and skipped partitions are printed. A failing
        partition is reported and does not abort the run.

    Note:
        ``load_type`` is derived from the module-level ``SINGLE_DATE``.
    """
    entity_path = f"{curated_base_path}/entity={dataset_name}"

    # A missing schedule means "nothing to do" rather than an AttributeError
    # on None.get(...).
    dates_per_client = dates_per_client or {}

    # Loop-invariant names in their sanitized (dot-free) form. Computed once
    # so the explode_col parameter is no longer reassigned inside the loop.
    safe_explode_col = explode_col.replace(".", "_") if explode_col else None
    dedup_keys = [k.replace(".", "_") for k in unique_keys] + ["client_id", "source_date"]

    for client_id in client_ids:
        for process_date in dates_per_client.get(client_id, []):

            input_path = (
                f"{raw_base_path}/client_id={client_id}/entity={dataset_name}/date={process_date}/"
            )

            try:
                print(f"üìÇ {dataset_name} | {client_id} | {process_date}")

                df = spark.read.parquet(input_path)

                # IMPORTANT: sanitize dotted columns FIRST
                df = sanitize_column_names(df)

                # Metadata
                df = (
                    df.withColumn("client_id", lit(client_id))
                      .withColumn("source_date", lit(process_date))
                      .withColumn("processing_timestamp", current_timestamp())
                      .withColumn("processing_date", current_date())
                      .withColumn(
                          "load_type",
                          lit("single_date" if SINGLE_DATE else "incremental")
                      )
                )

                # Explode if needed (use sanitized name)
                if safe_explode_col and safe_explode_col in df.columns:
                    df = df.withColumn(safe_explode_col, explode_outer(col(safe_explode_col)))

                # Deduplicate using sanitized keys
                df = df.dropDuplicates(dedup_keys)

                # Fix VOID columns
                df = fix_void_columns(df)

                # Write
                df.write \
                    .mode("overwrite") \
                    .partitionBy("client_id", "source_date") \
                    .parquet(entity_path)

                # NOTE: df is not cached, so this count re-runs the read and
                # transformations that the write above already executed.
                print(f"‚úÖ Written {df.count()} records")

            except Exception as e:
                # Best effort: report the failing partition and continue.
                print(f"‚ö†Ô∏è Skipped {dataset_name} | {client_id} | {process_date}: {e}")

# ============================================================================
# RUN
# ============================================================================
dates_per_client = get_dates_to_process(
    CURATED_PATH, CLIENT_IDS, single_date=SINGLE_DATE, bootstrap_date=BOOTSTRAP_DATE
)

# Dataset name -> column to explode (None when the dataset needs no explode).
# Insertion order is the processing order.
_DATASET_EXPLODE_COLS = {
    "activity_open": None,
    "activity_reply": None,
    "activity_sent": "to",
    "created_leads": None,
}

for _dataset, _explode in _DATASET_EXPLODE_COLS.items():
    process_dataset(
        RAW_PATH, CURATED_PATH, CLIENT_IDS,
        _dataset,
        unique_keys=["id", "recipient.id", "campaign.id"],
        explode_col=_explode,
        dates_per_client=dates_per_client
    )

spark.stop()
print("üéâ All datasets processed successfully!")


üìÖ Single-date mode: 2025-12-25
üìÇ activity_open | client_1 | 2025-12-25


                                                                                

‚úÖ Written 100 records
üìÇ activity_open | client_2 | 2025-12-25


                                                                                

‚úÖ Written 100 records
üìÇ activity_open | client_3 | 2025-12-25


                                                                                

‚úÖ Written 100 records
üìÇ activity_reply | client_1 | 2025-12-25


                                                                                

‚úÖ Written 25 records
üìÇ activity_reply | client_2 | 2025-12-25


                                                                                

‚úÖ Written 25 records
üìÇ activity_reply | client_3 | 2025-12-25


                                                                                

‚úÖ Written 25 records
üìÇ activity_sent | client_1 | 2025-12-25


                                                                                

‚úÖ Written 25 records
üìÇ activity_sent | client_2 | 2025-12-25


                                                                                

‚úÖ Written 25 records
üìÇ activity_sent | client_3 | 2025-12-25


                                                                                

‚úÖ Written 25 records
üìÇ created_leads | client_1 | 2025-12-25


                                                                                

‚úÖ Written 100 records
üìÇ created_leads | client_2 | 2025-12-25


                                                                                

‚úÖ Written 43 records
üìÇ created_leads | client_3 | 2025-12-25


                                                                                

‚úÖ Written 16 records
üéâ All datasets processed successfully!


In [8]:
# ============================================================================
# IMPORTS
# ============================================================================
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, max, lit, current_timestamp, current_date, explode_outer
)
from pyspark.sql.types import NullType, StringType, StructType, DoubleType
from datetime import datetime, timedelta
import os

# ============================================================================
# SPARK SESSION
# ============================================================================
# Same JAR list for driver and executors.
_EXTRA_CLASSPATH = "/opt/spark/jars/hadoop-aws-3.3.4.jar:/opt/spark/jars/aws-java-sdk-bundle-1.12.262.jar"

spark = (
    SparkSession.builder.appName("MailshakeCampaignCurations")
    .config("spark.driver.extraClassPath", _EXTRA_CLASSPATH)
    .config("spark.executor.extraClassPath", _EXTRA_CLASSPATH)
    .getOrCreate()
)

# Overwrite only the partitions present in each written DataFrame.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

# ============================================================================
# S3 CONFIG
# ============================================================================
hadoop_conf = spark._jsc.hadoopConfiguration()
for _s3a_key, _s3a_value in (
    ("fs.s3a.access.key", os.getenv("AWS_ACCESS_KEY_ID")),
    ("fs.s3a.secret.key", os.getenv("AWS_SECRET_ACCESS_KEY")),
    ("fs.s3a.endpoint", "s3.amazonaws.com"),
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
):
    hadoop_conf.set(_s3a_key, _s3a_value)

# ============================================================================
# CONFIG
# ============================================================================
RAW_PATH = "s3a://mailshake-analytics/raw"
CURATED_PATH = "s3a://mailshake-analytics/curated"
CLIENT_IDS = ["client_1", "client_2", "client_3"]

SINGLE_DATE = "2025-12-25"   # set None for incremental
BOOTSTRAP_DATE = "2025-12-20"

# ============================================================================
# HELPERS
# ============================================================================

def sanitize_columns(df):
    """
    Return ``df`` with column names normalised to ``[a-z0-9_]``.

    Every character outside ``a-zA-Z0-9_`` becomes an underscore, runs of
    underscores collapse to one, and the result is lower-cased. Columns that
    are already clean are not renamed.
    """
    # Imported locally because this cell's import block does not import `re`;
    # without it the function only works when a previous cell left `re` in the
    # kernel namespace.
    import re

    for col_name in df.columns:
        clean = re.sub(r'[^a-zA-Z0-9_]', '_', col_name)
        clean = re.sub(r'_+', '_', clean).lower()
        if clean != col_name:
            df = df.withColumnRenamed(col_name, clean)
    return df


# process_dataset in this cell calls the helper under this name, so expose it
# here instead of relying on a definition left over from another cell.
sanitize_column_names = sanitize_columns


def fix_void_columns(df):
    """Return ``df`` with every NullType column cast to string so Parquet can write it."""
    for field in df.schema.fields:
        if not isinstance(field.dataType, NullType):
            continue
        as_string = col(field.name).cast(StringType())
        df = df.withColumn(field.name, as_string)
    return df


def add_explicit_missing_columns(df, required_columns):
    """
    Make sure every column named in ``required_columns`` exists on ``df``.

    Args:
        df: Input DataFrame.
        required_columns: Mapping of ``{column_name: DataType}``.

    Returns:
        ``df`` with each missing column added as an all-null column cast to
        its declared type. Each addition is announced on stdout. Existing
        columns are left as they are.
    """
    already_present = set(df.columns)

    for name, dtype in required_columns.items():
        if name in already_present:
            continue
        print(f"‚ö†Ô∏è Adding missing column: {name}")
        df = df.withColumn(name, lit(None).cast(dtype))
    return df

def flatten_struct_columns(df):
    """
    Replace every StructType column with one column per nested field.

    Each nested field ``parent.child`` becomes a top-level column named
    ``parent_child`` and the original struct column is dropped. The pass is
    repeated until no struct columns remain, so structs nested inside structs
    are flattened too.

    Parent and child names are backtick-quoted when referenced, so a struct
    column or field whose name contains a dot is resolved as one identifier
    instead of being parsed as a nested path.
    """
    def _quote(name):
        # Spark escapes a backtick inside a quoted identifier by doubling it.
        return "`" + name.replace("`", "``") + "`"

    while True:
        struct_cols = [
            field.name
            for field in df.schema.fields
            if isinstance(field.dataType, StructType)
        ]

        if not struct_cols:
            break

        for col_name in struct_cols:
            for nested in df.schema[col_name].dataType.fields:
                df = df.withColumn(
                    f"{col_name}_{nested.name}",
                    col(f"{_quote(col_name)}.{_quote(nested.name)}")
                )
            df = df.drop(col_name)

    return df


def get_dates_to_process(curated_path, client_ids, single_date=None, bootstrap_date="2025-12-20"):
    """
    Work out the ``YYYY-MM-DD`` dates to process for each client.

    Args:
        curated_path: Parquet location used to find the last processed
            ``source_date`` per ``client_id``.
        client_ids: Iterable of client identifiers.
        single_date: When truthy, short-circuits to this single date for
            every client.
        bootstrap_date: Start date for clients with no curated history.

    Returns:
        dict mapping client id to a list of date strings ending at today.
    """
    if single_date:
        print(f"üìÖ Single-date mode enabled: {single_date}")
        return {client_id: [single_date] for client_id in client_ids}

    try:
        rows = (
            spark.read.parquet(curated_path)
            .groupBy("client_id")
            .agg(max("source_date").alias("last_date"))
            .collect()
        )
        last_seen = dict((row["client_id"], row["last_date"]) for row in rows)
    except Exception:
        # No readable curated data: every client starts at bootstrap_date.
        last_seen = {}

    today = datetime.today()

    def _first_pending_day(client_id):
        # Resume the day after the last curated date, else bootstrap.
        if client_id in last_seen:
            return datetime.strptime(str(last_seen[client_id]), "%Y-%m-%d") + timedelta(days=1)
        return datetime.strptime(bootstrap_date, "%Y-%m-%d")

    result = {}
    for client_id in client_ids:
        first_day = _first_pending_day(client_id)
        day_count = (today - first_day).days + 1
        result[client_id] = [
            (first_day + timedelta(days=n)).strftime("%Y-%m-%d") for n in range(day_count)
        ]
    return result

# ============================================================================
# DATASET PROCESSOR
# ============================================================================
def process_dataset(
    raw_base_path,
    curated_base_path,
    client_ids,
    dataset_name,
    unique_keys,
    explode_col=None,
    dates_per_client=None,
    required_columns=None
):
    """
    Curate one raw dataset, one client/date partition at a time.

    Each partition is read from
    ``{raw_base_path}/client_id=<id>/entity=<dataset>/date=<date>/``,
    flattened, optionally exploded, given sanitized column names, tagged with
    load metadata, padded with any explicitly required columns,
    de-duplicated, has NullType columns cast to string, and is written to
    ``{curated_base_path}/entity=<dataset>`` partitioned by ``client_id`` and
    ``source_date``.

    Args:
        raw_base_path: Root of the raw zone.
        curated_base_path: Root of the curated zone.
        client_ids: Iterable of client identifiers to process.
        dataset_name: Entity name used in both the raw and curated paths.
        unique_keys: De-duplication columns. Dots are replaced by underscores.
        explode_col: Optional column to ``explode_outer``. Dots are replaced
            by underscores and the column is ignored when absent.
        dates_per_client: Mapping of client id to the dates to process.
            ``None`` is treated as an empty schedule.
        required_columns: Optional ``{column_name: DataType}`` of columns to
            add as nulls when the data lacks them.

    Returns:
        None. Progress and skipped partitions are printed. A failing
        partition is reported and does not abort the run.

    Note:
        ``load_type`` is derived from the module-level ``SINGLE_DATE``.
    """
    entity_path = f"{curated_base_path}/entity={dataset_name}"

    # A missing schedule means "nothing to do" rather than an AttributeError
    # on None.get(...).
    dates_per_client = dates_per_client or {}

    # Loop-invariant names, computed once so the explode_col parameter is no
    # longer reassigned inside the loop.
    safe_explode_col = explode_col.replace(".", "_") if explode_col else None
    dedup_keys = [k.replace(".", "_") for k in unique_keys] + ["client_id", "source_date"]

    for client_id in client_ids:
        for process_date in dates_per_client.get(client_id, []):

            input_path = (
                f"{raw_base_path}/client_id={client_id}/entity={dataset_name}/date={process_date}/"
            )

            try:
                print(f"üìÇ {dataset_name} | {client_id} | {process_date}")

                df = spark.read.parquet(input_path)

                df = flatten_struct_columns(df)   # flatten first

                # Explode only when the column is actually present
                if safe_explode_col and safe_explode_col in df.columns:
                    df = df.withColumn(safe_explode_col, explode_outer(col(safe_explode_col)))

                # Sanitize again after flatten, using the helper this cell
                # defines (sanitize_columns).
                df = sanitize_columns(df)

                # Metadata
                df = (
                    df.withColumn("client_id", lit(client_id))
                      .withColumn("source_date", lit(process_date))
                      .withColumn("client_id_col", lit(client_id))        # physical copy
                      .withColumn("source_date_col", lit(process_date))   # physical copy
                      .withColumn("processing_timestamp", current_timestamp())
                      .withColumn("processing_date", current_date())
                      .withColumn(
                          "load_type",
                          lit("single_date" if SINGLE_DATE else "incremental")
                      )
                )

                # Explicit missing columns
                if required_columns:
                    df = add_explicit_missing_columns(df, required_columns)

                # Deduplicate
                df = df.dropDuplicates(dedup_keys)

                # Fix VOID columns
                df = fix_void_columns(df)

                # Write
                df.write \
                    .mode("overwrite") \
                    .partitionBy("client_id", "source_date") \
                    .parquet(entity_path)

                # NOTE: df is not cached, so this count re-runs the read and
                # transformations that the write above already executed.
                print(f"‚úÖ Written {df.count()} records")

            except Exception as e:
                # Best effort: report the failing partition and continue.
                print(f"‚ö†Ô∏è Skipped {dataset_name} | {client_id} | {process_date}: {e}")

# ============================================================================
# RUN
# ============================================================================
dates_per_client = get_dates_to_process(
    CURATED_PATH, CLIENT_IDS, single_date=SINGLE_DATE, bootstrap_date=BOOTSTRAP_DATE
)

# Every dataset uses the same keys and the same explicitly required column;
# none is exploded in this run. Disabled datasets stay listed as comments.
for _dataset in (
    # "activity_open",
    # "activity_reply",
    "activity_sent",
    "created_leads",
):
    process_dataset(
        RAW_PATH,
        CURATED_PATH,
        CLIENT_IDS,
        _dataset,
        unique_keys=["id", "recipient.id", "campaign.id"],
        explode_col=None,
        dates_per_client=dates_per_client,
        required_columns={"recipient_fields_status": StringType()},
    )

spark.stop()
print("üéâ All datasets processed successfully!")


üìÖ Single-date mode enabled: 2025-12-25
üìÇ activity_sent | client_1 | 2025-12-25
‚ö†Ô∏è Adding missing column: recipient_fields_status


25/12/30 23:58:58 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

‚úÖ Written 25 records
üìÇ activity_sent | client_2 | 2025-12-25


                                                                                

‚úÖ Written 25 records
üìÇ activity_sent | client_3 | 2025-12-25
‚ö†Ô∏è Adding missing column: recipient_fields_status


                                                                                

‚úÖ Written 25 records
üìÇ created_leads | client_1 | 2025-12-25
‚ö†Ô∏è Adding missing column: recipient_fields_status


                                                                                

‚úÖ Written 100 records
üìÇ created_leads | client_2 | 2025-12-25


                                                                                

‚úÖ Written 43 records
üìÇ created_leads | client_3 | 2025-12-25
‚ö†Ô∏è Adding missing column: recipient_fields_status


                                                                                

‚úÖ Written 16 records
üéâ All datasets processed successfully!


In [9]:
# ============================================================================
# IMPORTS
# ============================================================================
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import (
    col, max, lit, current_timestamp, current_date, explode_outer
)
from pyspark.sql.types import NullType, StringType, StructType, ArrayType
from datetime import datetime, timedelta
import os
import re

# ============================================================================
# SPARK SESSION
# ============================================================================
# Driver and executors share one classpath made of these two JARs.
_S3A_JARS = ":".join([
    "/opt/spark/jars/hadoop-aws-3.3.4.jar",
    "/opt/spark/jars/aws-java-sdk-bundle-1.12.262.jar",
])

spark = (
    SparkSession.builder
    .appName("MailshakeCampaignCurations")
    .config("spark.driver.extraClassPath", _S3A_JARS)
    .config("spark.executor.extraClassPath", _S3A_JARS)
    .getOrCreate()
)

# Overwrite only the partitions present in each written DataFrame.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

# ============================================================================
# S3 CONFIG
# ============================================================================
hadoop_conf = spark._jsc.hadoopConfiguration()
# Filesystem wiring first, then credentials from the environment.
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("fs.s3a.endpoint", "s3.amazonaws.com")
hadoop_conf.set("fs.s3a.access.key", os.environ.get("AWS_ACCESS_KEY_ID"))
hadoop_conf.set("fs.s3a.secret.key", os.environ.get("AWS_SECRET_ACCESS_KEY"))

# ============================================================================
# CONFIG
# ============================================================================
RAW_PATH = "s3a://mailshake-analytics/raw"
CURATED_PATH = "s3a://mailshake-analytics/curated"
CLIENT_IDS = ["client_1", "client_2", "client_3"]

SINGLE_DATE = "2025-12-25"   # set None for incremental
BOOTSTRAP_DATE = "2025-12-20"

# ============================================================================
# HELPERS
# ============================================================================

def sanitize_column_names(df):
    """
    Normalise column names to lower-case ``[a-z0-9_]``.

    Characters outside ``a-zA-Z0-9_`` (dots included) become underscores and
    consecutive underscores collapse into one. Only columns whose name
    actually changes are renamed.
    """
    invalid_chars = re.compile(r'[^a-zA-Z0-9_]')
    underscore_runs = re.compile(r'_+')

    for original in df.columns:
        cleaned = underscore_runs.sub('_', invalid_chars.sub('_', original)).lower()
        if cleaned == original:
            continue
        df = df.withColumnRenamed(original, cleaned)
    return df

def fix_void_columns(df):
    """Cast each NullType column of ``df`` to string so the Parquet write succeeds."""
    null_typed = [
        field.name
        for field in df.schema.fields
        if isinstance(field.dataType, NullType)
    ]
    for name in null_typed:
        df = df.withColumn(name, col(name).cast(StringType()))
    return df



def ensure_columns_and_reorder(df: "DataFrame", column_order: list, column_types: dict = None) -> "DataFrame":
    """
    Ensure all requested columns exist and return them in the requested order.

    Args:
        df (DataFrame): Input DataFrame.
        column_order (list): Column names in the desired output order. Columns
            of ``df`` not listed here are dropped by the final select. When
            ``None``, ``df`` is returned unchanged.
        column_types (dict, optional): ``{column_name: DataType}`` used to
            cast columns that have to be added. A column without an entry is
            added as an untyped null literal.

    Returns:
        DataFrame containing exactly ``column_order``. Missing columns are
        filled with nulls and announced on stdout.
    """
    if column_order is None:
        # The caller forwards its own default of None; iterating over it
        # would raise TypeError, so treat it as "no layout requested".
        return df

    column_types = column_types or {}

    # Read the column list once and keep it current locally.
    present = set(df.columns)

    for col_name in column_order:
        if col_name in present:
            continue
        dtype = column_types.get(col_name)
        if dtype:
            df = df.withColumn(col_name, lit(None).cast(dtype))
        else:
            df = df.withColumn(col_name, lit(None))
        present.add(col_name)
        print(f"‚ö†Ô∏è Adding missing column: {col_name}")

    # Select columns in the desired order
    return df.select([col(c) for c in column_order])


def flatten_struct_columns(df):
    """
    Flatten all StructType columns and explode arrays of structs.

    Struct columns are replaced by one ``parent_child`` column per nested
    field, repeated until no struct remains. Then every ``array<struct>``
    column is exploded with ``explode_outer`` (one output row per element,
    null-preserving) and its element struct is expanded the same way.

    Both steps repeat until neither kind of column is left, so structs or
    arrays of structs nested inside exploded elements are flattened as well.
    Input that needs only a single round gives the same columns in the same
    order as a single struct pass followed by a single array pass.

    Column and field names are backtick-quoted when referenced, so names
    containing dots resolve as single identifiers.
    """
    def _quote(name):
        # Spark escapes a backtick inside a quoted identifier by doubling it.
        return "`" + name.replace("`", "``") + "`"

    def _expand(frame, col_name):
        # Promote each field of struct column `col_name`, then drop it.
        for nested in frame.schema[col_name].dataType.fields:
            frame = frame.withColumn(
                f"{col_name}_{nested.name}",
                col(f"{_quote(col_name)}.{_quote(nested.name)}")
            )
        return frame.drop(col_name)

    while True:
        # Flatten plain structs until none remain.
        while True:
            struct_cols = [
                field.name
                for field in df.schema.fields
                if isinstance(field.dataType, StructType)
            ]
            if not struct_cols:
                break
            for col_name in struct_cols:
                df = _expand(df, col_name)

        # Explode array<struct> columns
        array_struct_cols = [
            field.name
            for field in df.schema.fields
            if isinstance(field.dataType, ArrayType) and isinstance(field.dataType.elementType, StructType)
        ]
        if not array_struct_cols:
            return df

        for col_name in array_struct_cols:
            # After the explode the column holds a single struct per row.
            df = df.withColumn(col_name, explode_outer(col(_quote(col_name))))
            df = _expand(df, col_name)
        # Loop again: expanded elements may themselves contain structs/arrays.

def get_dates_to_process(curated_path, client_ids, single_date=None, bootstrap_date="2025-12-20"):
    """
    Return, per client, the ``YYYY-MM-DD`` dates that should be processed.

    Single-date mode (truthy ``single_date``) schedules that one date for
    every client. Incremental mode reads the curated Parquet data at
    ``curated_path`` for the latest ``source_date`` per ``client_id`` and
    schedules every day from the following one through today. Clients without
    history, or all clients when the read fails, start at ``bootstrap_date``.
    """
    if single_date:
        print(f"üìÖ Single-date mode enabled: {single_date}")
        return {client_id: [single_date] for client_id in client_ids}

    latest = {}
    try:
        aggregated = (
            spark.read.parquet(curated_path)
            .groupBy("client_id")
            .agg(max("source_date").alias("last_date"))
        )
        latest = {row["client_id"]: row["last_date"] for row in aggregated.collect()}
    except Exception:
        # Fall back to bootstrapping every client.
        latest = {}

    reference = datetime.today()
    date_format = "%Y-%m-%d"

    schedule = {}
    for client_id in client_ids:
        if client_id in latest:
            begin = datetime.strptime(str(latest[client_id]), date_format) + timedelta(days=1)
        else:
            begin = datetime.strptime(bootstrap_date, date_format)

        total_days = (reference - begin).days + 1
        schedule[client_id] = [
            (begin + timedelta(days=step)).strftime(date_format)
            for step in range(total_days)
        ]
    return schedule

# ============================================================================
# DATASET PROCESSOR
# ============================================================================
def process_dataset(
    raw_base_path,
    curated_base_path,
    client_ids,
    dataset_name,
    unique_keys,
    explode_col=None,
    dates_per_client=None,
    desired_columns =None,
    column_types=None
):
    """Read, flatten, normalise and write one dataset for every client/date.

    For each client in ``client_ids`` and each date listed for that client in
    ``dates_per_client``, the raw parquet under
    ``{raw_base_path}/client_id=.../entity={dataset_name}/date=.../`` is read,
    transformed, de-duplicated and written under
    ``{curated_base_path}/entity={dataset_name}`` partitioned by ``client_id``
    and ``source_date``. A failure for one client/date is printed and the loop
    continues with the next one.

    Args:
        raw_base_path: Base location of the raw data.
        curated_base_path: Base location of the curated output.
        client_ids: Iterable of client identifiers to process.
        dataset_name: Entity name used in both input and output paths.
        unique_keys: Column names used for de-duplication; dots are replaced
            with underscores to match the flattened column names.
        explode_col: Optional column to explode after flattening (dots are
            replaced with underscores); ignored if the column is absent.
        dates_per_client: Mapping of client id to the dates to process.
            ``None`` is treated as an empty mapping (nothing to do).
        desired_columns: Passed to ``ensure_columns_and_reorder``.
        column_types: Passed to ``ensure_columns_and_reorder``.

    Returns:
        None.
    """
    entity_path = f"{curated_base_path}/entity={dataset_name}"

    # The default of None used to fail on `.get`; treat it as "no dates".
    dates_per_client = dates_per_client or {}

    for client_id in client_ids:
        for process_date in dates_per_client.get(client_id, []):
            input_path = (
                f"{raw_base_path}/client_id={client_id}/entity={dataset_name}/date={process_date}/"
            )
            try:
                print(f"üìÇ {dataset_name} | {client_id} | {process_date}")

                df = spark.read.parquet(input_path)

                df = flatten_struct_columns(df)   # flatten structs and arrays first

                if explode_col:
                    # Use a local name instead of rebinding the parameter on
                    # every iteration.
                    flat_explode_col = explode_col.replace(".", "_")
                    if flat_explode_col in df.columns:
                        df = df.withColumn(flat_explode_col, explode_outer(col(flat_explode_col)))

                # Sanitize column names
                df = sanitize_column_names(df)

                # Ensure all columns exist and reorder
                df = ensure_columns_and_reorder(df, desired_columns,column_types)

                # Add metadata. client_id / source_date become partition
                # columns in the write below; the *_col variants carry the
                # same values as ordinary columns.
                df = (
                    df.withColumn("client_id", lit(client_id))
                      .withColumn("source_date", lit(process_date))
                      .withColumn("client_id_col", lit(client_id))
                      .withColumn("source_date_col", lit(process_date))
                      .withColumn("processing_timestamp", current_timestamp())
                      .withColumn("processing_date", current_date())
                      .withColumn("load_type", lit("single_date" if SINGLE_DATE else "incremental"))
                )

                # Deduplicate
                safe_keys = [k.replace(".", "_") for k in unique_keys]
                df = df.dropDuplicates(safe_keys + ["client_id", "source_date"])

                df = fix_void_columns(df)

                # Write. Dynamic partition overwrite is requested on the writer
                # itself so that only this client_id/source_date partition is
                # replaced; with static overwrite every other partition under
                # entity_path would be deleted by each write.
                (
                    df.write.mode("overwrite")
                    .option("partitionOverwriteMode", "dynamic")
                    .partitionBy("client_id", "source_date")
                    .parquet(entity_path)
                )
                # NOTE: count() is a separate Spark action executed after the write.
                print(f"‚úÖ Written {df.count()} records")

            except Exception as e:
                print(f"‚ö†Ô∏è Skipped {dataset_name} | {client_id} | {process_date}: {e}")

# ============================================================================
# RUN
# ============================================================================
# Resolve, per client, the source dates this run has to process.
dates_per_client = get_dates_to_process(
    curated_path=CURATED_PATH,
    client_ids=CLIENT_IDS,
    single_date=SINGLE_DATE,
    bootstrap_date=BOOTSTRAP_DATE,
)

# # -------------------- activity_open --------------------
# process_dataset(
#     RAW_PATH, CURATED_PATH, CLIENT_IDS,
#     "activity_open",
#     unique_keys=["id", "recipient.id", "campaign.id"],
#     dates_per_client=dates_per_client,
#     desired_columns = [
#             "object", "id", "actiondate", "isduplicate", "recipient_object",
#             "recipient_id", "recipient_emailaddress", "recipient_fullname",
#             "recipient_created", "recipient_ispaused", "recipient_contactid",
#             "recipient_first", "recipient_last", "recipient_fields_link",
#             "recipient_fields_status", "recipient_fields_first",
#             "recipient_fields_position", "recipient_fields_date_applied",
#             "recipient_fields_account", "recipient_fields_phonenumber",
#             "recipient_fields_facebookurl", "recipient_fields_instagramid",
#             "recipient_fields_linkedinurl", "recipient_fields_twitterid",
#             "campaign_object", "campaign_id", "campaign_title", "campaign_wizardstatus",
#             "parent_object", "parent_id", "parent_type", "parent_message_object",
#             "parent_message_id", "parent_message_type", "parent_message_subject",
#             "parent_message_replytoid"
#         ],
#             # Optional: specify data types for missing columns
#         column_types = {
#             "recipient_fields_status": StringType(),
#             "recipient_fields_first": StringType()
#         }
# )

# # -------------------- activity_reply --------------------
# process_dataset(
#     RAW_PATH, CURATED_PATH, CLIENT_IDS,
#     "activity_reply",
#     unique_keys=["id", "recipient.id", "campaign.id"],
#     dates_per_client=dates_per_client,
#     desired_columns = [
#             "object", "id", "actiondate",
#             "type", "subject", "externalid",
#             "externalrawmessageid", "externalconversationid", "rawbody",
#             "body", "plaintextbody", "recipient_object",
#             "recipient_id", "recipient_emailaddress", "recipient_fullname",
#             "recipient_created", "recipient_ispaused", "recipient_contactid",
#             "recipient_first", "recipient_last", "recipient_fields_link",
#             "recipient_fields_status",  "recipient_fields_first", "recipient_fields_position", "recipient_fields_date_applied",
#             "recipient_fields_account", "recipient_fields_phonenumber", "recipient_fields_facebookurl",
#             "recipient_fields_instagramid", "recipient_fields_linkedinurl", "recipient_fields_twitterid",
#             "campaign_object", "campaign_id", "campaign_title",
#             "campaign_wizardstatus", "parent_object", "parent_id",
#             "parent_type", "parent_message_object", "parent_message_id",
#             "parent_message_type", "parent_message_subject", "parent_message_replytoid",
#             "from_object", "from_address", "from_fullname",
#             "from_first", "from_last"
#             ],
#     column_types ={"recipient_fields_status": StringType(),
#                      "recipient_fields_first": StringType()}
# )




# -------------------- activity_sent --------------------
process_dataset(
    raw_base_path=RAW_PATH,
    curated_base_path=CURATED_PATH,
    client_ids=CLIENT_IDS,
    dataset_name="activity_sent",
    unique_keys=["id", "recipient.id", "campaign.id"],
    explode_col="to",
    dates_per_client=dates_per_client,
    # Output column order for the curated table, grouped by origin.
    desired_columns=[
        # Core
        "object", "id", "actiondate", "type", "excludebody",
        # To (exploded)
        "to_address", "to_first", "to_fullname", "to_last", "to_object",
        # Message content
        "subject", "externalid", "externalrawmessageid", "externalconversationid",
        "rawbody", "body", "plaintextbody",
        # Recipient
        "recipient_object", "recipient_id", "recipient_emailaddress", "recipient_fullname",
        "recipient_created", "recipient_ispaused", "recipient_first", "recipient_last",
        # Recipient fields
        "recipient_fields_account", "recipient_fields_phonenumber", "recipient_fields_facebookurl",
        "recipient_fields_instagramid", "recipient_fields_linkedinurl", "recipient_fields_twitterid",
        "recipient_fields_link", "recipient_fields_position", "recipient_fields_date_applied",
        "recipient_fields_status",
        # Campaign
        "campaign_object", "campaign_id", "campaign_title", "campaign_wizardstatus",
        # Message (parent)
        "message_object", "message_id", "message_type", "message_subject", "message_replytoid",
        # From
        "from_object", "from_address", "from_fullname", "from_first", "from_last",
    ],
    # Explicit type for a column that may be absent from the raw data.
    column_types={"recipient_fields_status": StringType()},
)

# -------------------- created_leads --------------------
process_dataset(
    raw_base_path=RAW_PATH,
    curated_base_path=CURATED_PATH,
    client_ids=CLIENT_IDS,
    dataset_name="created_leads",
    unique_keys=["id", "recipient.id", "campaign.id"],
    dates_per_client=dates_per_client,
    # Output column order for the curated table, grouped by origin.
    desired_columns=[
        # Core
        "object", "id", "created", "openeddate", "laststatuschangedate", "annotation", "status",
        # Recipient
        "recipient_object", "recipient_id", "recipient_emailaddress", "recipient_fullname",
        "recipient_created", "recipient_ispaused", "recipient_contactid", "recipient_first",
        "recipient_last",
        # Recipient fields
        "recipient_fields_link", "recipient_fields_first", "recipient_fields_status",
        "recipient_fields_position", "recipient_fields_date_applied", "recipient_fields_account",
        "recipient_fields_phonenumber", "recipient_fields_facebookurl",
        "recipient_fields_instagramid", "recipient_fields_linkedinurl",
        "recipient_fields_twitterid",
        # Campaign
        "campaign_object", "campaign_id", "campaign_title", "campaign_wizardstatus",
        # Assigned to
        "assignedto_object", "assignedto_id", "assignedto_emailaddress", "assignedto_fullname",
        "assignedto_first", "assignedto_last",
    ],
    # Explicit types for columns that may be absent from the raw data.
    column_types={
        "recipient_fields_status": StringType(),
        "recipient_fields_first": StringType(),
        "assignedto_object": StringType(),
        "assignedto_id": DoubleType(),
        "assignedto_emailaddress": StringType(),
        "assignedto_fullname": StringType(),
        "assignedto_first": StringType(),
        "assignedto_last": StringType(),
    },
)

# Shut down the Spark session once all dataset calls have returned.
# NOTE(review): process_dataset catches and prints per-client/date failures
# rather than raising, so the message below is printed even when some
# partitions were skipped.
spark.stop()
print("üéâ All datasets processed successfully!")


üìÖ Single-date mode enabled: 2025-12-25
üìÇ activity_sent | client_1 | 2025-12-25
‚ö†Ô∏è Adding missing column: recipient_fields_status


                                                                                

‚úÖ Written 25 records
üìÇ activity_sent | client_2 | 2025-12-25


                                                                                

‚úÖ Written 25 records
üìÇ activity_sent | client_3 | 2025-12-25
‚ö†Ô∏è Adding missing column: recipient_fields_status


                                                                                

‚úÖ Written 25 records
üìÇ created_leads | client_1 | 2025-12-25
‚ö†Ô∏è Adding missing column: recipient_fields_first
‚ö†Ô∏è Adding missing column: recipient_fields_status
‚ö†Ô∏è Adding missing column: assignedto_object
‚ö†Ô∏è Adding missing column: assignedto_id
‚ö†Ô∏è Adding missing column: assignedto_emailaddress
‚ö†Ô∏è Adding missing column: assignedto_fullname
‚ö†Ô∏è Adding missing column: assignedto_first
‚ö†Ô∏è Adding missing column: assignedto_last


                                                                                

‚úÖ Written 100 records
üìÇ created_leads | client_2 | 2025-12-25
‚ö†Ô∏è Adding missing column: recipient_fields_first
‚ö†Ô∏è Adding missing column: assignedto_object
‚ö†Ô∏è Adding missing column: assignedto_id
‚ö†Ô∏è Adding missing column: assignedto_emailaddress
‚ö†Ô∏è Adding missing column: assignedto_fullname
‚ö†Ô∏è Adding missing column: assignedto_first
‚ö†Ô∏è Adding missing column: assignedto_last


                                                                                

‚úÖ Written 43 records
üìÇ created_leads | client_3 | 2025-12-25
‚ö†Ô∏è Adding missing column: recipient_fields_status


                                                                                

‚úÖ Written 16 records
üéâ All datasets processed successfully!


In [33]:
from pyspark.sql import SparkSession
import os

# Create Spark session with S3 configuration.
# The same jar list is placed on both the driver and executor classpaths.
s3a_classpath = ":".join([
    "/opt/spark/jars/hadoop-aws-3.3.4.jar",
    "/opt/spark/jars/aws-java-sdk-bundle-1.12.262.jar",
])
spark = (
    SparkSession.builder
    .appName("JupyterS3")
    .config("spark.driver.extraClassPath", s3a_classpath)
    .config("spark.executor.extraClassPath", s3a_classpath)
    .getOrCreate()
)

# Configure S3
aws_access_key = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")

# Fail fast with a readable message: os.getenv returns None for an unset
# variable, and passing None into the Hadoop configuration only fails later
# inside the JVM.
missing_vars = [
    name
    for name, value in (
        ("AWS_ACCESS_KEY_ID", aws_access_key),
        ("AWS_SECRET_ACCESS_KEY", aws_secret_key),
    )
    if not value
]
if missing_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(missing_vars)
    )

hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", aws_access_key)
hadoop_conf.set("fs.s3a.secret.key", aws_secret_key)
hadoop_conf.set("fs.s3a.endpoint", "s3.amazonaws.com")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# Curated created_leads partition for client_2 on 2025-12-25.
df1 = spark.read.parquet(
    "s3a://mailshake-analytics/curated/entity=created_leads/client_id=client_2/source_date=2025-12-25/")

print(len(df1.columns))
df1.select("recipient_fields_link").show(truncate=False)

42
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|recipient_fields_link                                                                                                                                                                                                                                                                                                         

In [11]:
# Curated created_leads partition for client_3 on 2025-12-25, compared with df1.
df2 = spark.read.parquet(
    "s3a://mailshake-analytics/curated/entity=created_leads/client_id=client_3/source_date=2025-12-25/"
)

# Column-name differences in both directions; an empty set means no difference.
df1_cols = set(df1.columns)
df2_cols = set(df2.columns)
print(f"Missing in DF2: {df1_cols - df2_cols}")
print(f"Missing in DF1: {df2_cols - df1_cols}")

print(len(df2.columns))
df2.select("recipient_fields_status").show()


Missing in DF2: set()
Missing in DF1: set()
42
+-----------------------+
|recipient_fields_status|
+-----------------------+
|                   NULL|
|                   NULL|
|                   NULL|
|                   NULL|
|                   NULL|
|                   NULL|
|                   NULL|
|                   NULL|
|                   NULL|
|                   NULL|
|                   NULL|
|                   NULL|
|                   NULL|
|                   NULL|
|                   NULL|
|                   NULL|
+-----------------------+



In [12]:

# Curated created_leads partition for client_2 on 2025-12-25.
# NOTE(review): this is the same path df1 is loaded from — confirm that a
# different client or date was not intended.
df3 = spark.read.parquet(
    "s3a://mailshake-analytics/curated/entity=created_leads/client_id=client_2/source_date=2025-12-25/")

# Quick check: "Missing in X" lists columns the other frame has and X lacks.
print(f"Missing in DF2: {set(df1.columns) - set(df2.columns)}")
print(f"Missing in DF1: {set(df2.columns) - set(df1.columns)}")
# These two lines previously computed df3 - dfN (columns df3 has that dfN
# lacks) under a "Missing in DF3" label; the operands now match the label.
print(f"Missing in DF3 (vs DF1): {set(df1.columns) - set(df3.columns)}")
print(f"Missing in DF3 (vs DF2): {set(df2.columns) - set(df3.columns)}")
print(len(df3.columns))
df3.printSchema()
df3.select("recipient_fields_status").show()

Missing in DF2: set()
Missing in DF1: set()
Missing in DF3: set()
Missing in DF3: set()
42
root
 |-- object: string (nullable = true)
 |-- id: long (nullable = true)
 |-- created: string (nullable = true)
 |-- openeddate: string (nullable = true)
 |-- laststatuschangedate: integer (nullable = true)
 |-- annotation: integer (nullable = true)
 |-- status: string (nullable = true)
 |-- recipient_object: string (nullable = true)
 |-- recipient_id: long (nullable = true)
 |-- recipient_emailaddress: string (nullable = true)
 |-- recipient_fullname: string (nullable = true)
 |-- recipient_created: string (nullable = true)
 |-- recipient_ispaused: boolean (nullable = true)
 |-- recipient_contactid: long (nullable = true)
 |-- recipient_first: string (nullable = true)
 |-- recipient_last: string (nullable = true)
 |-- recipient_fields_link: string (nullable = true)
 |-- recipient_fields_first: string (nullable = true)
 |-- recipient_fields_status: string (nullable = true)
 |-- recipient_fields

In [18]:
# NOTE(review): the saved output below lists camelCase columns such as
# actionDate and isDuplicate, which do not appear in the created_leads schema
# printed for df3. df1 was presumably bound to a different dataset when this
# cell last ran — re-run from a fresh kernel to confirm.
df1.printSchema()

root
 |-- object: string (nullable = true)
 |-- id: long (nullable = true)
 |-- actionDate: string (nullable = true)
 |-- isDuplicate: boolean (nullable = true)
 |-- recipient_object: string (nullable = true)
 |-- recipient_id: long (nullable = true)
 |-- recipient_emailAddress: string (nullable = true)
 |-- recipient_fullName: string (nullable = true)
 |-- recipient_created: string (nullable = true)
 |-- recipient_isPaused: boolean (nullable = true)
 |-- recipient_contactID: long (nullable = true)
 |-- recipient_first: string (nullable = true)
 |-- recipient_last: string (nullable = true)
 |-- recipient_fields_link: string (nullable = true)
 |-- recipient_fields_first: string (nullable = true)
 |-- recipient_fields_position: string (nullable = true)
 |-- recipient_fields_date applied: string (nullable = true)
 |-- recipient_fields_account: string (nullable = true)
 |-- recipient_fields_phoneNumber: string (nullable = true)
 |-- recipient_fields_facebookUrl: string (nullable = true)
 |