Importing Kaggle Dataset

In [None]:
from kagglehub import dataset_download
#remove warnings
import warnings
warnings.filterwarnings("ignore")

from pyspark.sql import SparkSession
import pyspark.sql.functions as sf
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, TimestampType, LongType
import re

# Download the Kaggle dataset through kagglehub and keep the location it
# returns; the location is echoed so it shows up in the cell output.
path: str = dataset_download("jinquan/cc-sample-data")
print(path)

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("payNet")
    .getOrCreate()
)


/home/jeevin/.cache/kagglehub/datasets/jinquan/cc-sample-data/versions/1


Load JSON and clean up data

In [None]:
# Load the raw records: `path` (returned by dataset_download) is handed to
# Spark's JSON data source as-is.
df = spark.read.format("json").load(path)

def clean_json_string(json_str):
    """
    Normalise JSON text whose nested objects were escaped and wrapped in quotes.

    Two passes are applied, in this order:
    1. every backslash character is dropped;
    2. each double-quoted span that wraps a brace pair loses its outer quotes
       and the whitespace hugging the braces, so "{ }" becomes {}.

    Args:
        json_str: The raw JSON text, or None.

    Returns:
        The cleaned text, or None when json_str is None.
    """
    if json_str is None:
        return None

    # The lazy capture stops at the nearest closing brace that is followed by
    # a double quote; only the captured interior is kept, re-wrapped in braces.
    quoted_object = re.compile(r'"\s*\{\s*(.*?)\s*\}\s*"')

    return quoted_object.sub(r'{\1}', json_str.replace("\\", ""))

# Wrap the Python cleaner as a Spark UDF that returns a string column.
clean_json_udf = sf.udf(f=clean_json_string, returnType=StringType())

# Overwrite personal_detail with its cleaned text; other columns are untouched.
cleaned_personal_detail = clean_json_udf(sf.col("personal_detail"))
df_cleaned = df.withColumn("personal_detail", cleaned_personal_detail)

# Schema of the address object nested inside personal_detail.
# Every field is read as a nullable string at this stage.
address_schema = (
    StructType()
    .add("street", StringType(), True)
    .add("city", StringType(), True)
    .add("state", StringType(), True)
    .add("zip", StringType(), True)
)

# Schema of personal_detail itself: nullable strings throughout, except
# `address`, which is the nested struct defined just above.
personal_schema = (
    StructType()
    .add("person_name", StringType(), True)
    .add("gender", StringType(), True)
    .add("address", address_schema, True)
    .add("lat", StringType(), True)
    .add("long", StringType(), True)
    .add("city_pop", StringType(), True)
    .add("job", StringType(), True)
    .add("dob", StringType(), True)
)

# Decode the cleaned JSON text with personal_schema and store the resulting
# struct back under the same column name (the string version is replaced).
parsed_personal_detail = sf.from_json(sf.col("personal_detail"), personal_schema)
df_with_parsed_personal = df_cleaned.withColumn("personal_detail", parsed_personal_detail)

# for debugging
# df_with_parsed_personal.select(sf.col("personal_detail.person_name")).show()
# df_with_parsed_personal.select(sf.col("personal_detail.address.street")).show()

# Build `cleaned_person_name`, the normalised text later split into first/last.
# The clean-up runs as a fixed sequence of regex passes over person_name.
raw_person_name = sf.col("personal_detail.person_name")

# 1. Drop a trailing "eeeee" (any letter case), with an optional comma before it.
name_without_filler = sf.regexp_replace(raw_person_name, r"(?i),?eeeee$", "")

# 2. Drop a trailing "N" followed by four or more 0/O characters (any letter
#    case), with an optional comma and whitespace before it: N0000, NOOOO, ...
name_without_marker = sf.regexp_replace(name_without_filler, r"(?i),?\s*N[0O]{4,}$", "")

# 3. Turn every character that is not a letter, digit or whitespace into a
#    space; this covers separators such as /, !, @, | and commas.
name_alphanumeric = sf.regexp_replace(name_without_marker, r"[^a-zA-Z0-9\s]", " ")

# 4. Collapse each run of whitespace into one space.
name_single_spaced = sf.regexp_replace(name_alphanumeric, r"\s+", " ")

# 5. Trim the ends; rows without a person_name keep a null.
df_with_names = df_with_parsed_personal.withColumn(
    "cleaned_person_name",
    sf.when(raw_person_name.isNotNull(), sf.trim(name_single_spaced)).otherwise(None),
)

# Split the cleaned name on single spaces: the first token becomes `first`,
# all remaining tokens (joined back with spaces) become `last`.
name_parts = sf.col("name_parts")
part_count = sf.size(name_parts)

# element_at is 1-based, so index 1 is the first token.
first_name = sf.when(
    part_count >= 1,
    sf.trim(sf.element_at(name_parts, 1)),
).otherwise(None)

# slice is 1-based as well: start at the second token and take up to
# part_count elements, i.e. everything that is left.
last_name = sf.when(
    part_count > 1,
    sf.trim(sf.concat_ws(" ", sf.slice(name_parts, 2, part_count))),
).otherwise(None)

df_with_names = (
    df_with_names
    .withColumn("name_parts", sf.split(sf.col("cleaned_person_name"), " "))
    .withColumn("first", first_name)
    .withColumn("last", last_name)
    .drop("cleaned_person_name", "name_parts")  # intermediate columns only
)


# Flatten personal_detail (and its nested address) into top-level columns.
# The four groups below are selected in this order, which fixes the output
# column order.

# Top-level columns placed first, followed by the derived name columns.
leading_columns = [
    "Unnamed: 0",
    "trans_date_trans_time",
    "cc_num",
    "merchant",
    "category",
    "amt",
    "first",
    "last",
]

# Fields lifted out of personal_detail.address.
address_fields = ["street", "city", "state", "zip"]

# Location and demographic fields lifted out of personal_detail.
demographic_fields = ["lat", "long", "city_pop", "job", "dob"]

# Remaining top-level transaction / merchant columns.
trailing_columns = [
    "trans_num",
    "merch_lat",
    "merch_long",
    "is_fraud",
    "merch_zipcode",
    "merch_last_update_time",
    "merch_eff_time",
    "cc_bic",
]

df_flattened = df_with_names.select(
    *[sf.col(name) for name in leading_columns],
    sf.col("personal_detail.gender").alias("gender"),
    *[sf.col(f"personal_detail.address.{field}").alias(field) for field in address_fields],
    *[sf.col(f"personal_detail.{field}").alias(field) for field in demographic_fields],
    *[sf.col(name) for name in trailing_columns],
)


# Type conversions and rounding in one operation, including date format and
# timezone handling.


def _as_rounded_float(column_name):
    """Cast the named column to FloatType and round it to 6 decimal places."""
    # NOTE(review): FloatType is single precision; 6 decimal places may be more
    # than it can hold for coordinate-sized values — confirm whether a double
    # is intended.
    return sf.round(sf.col(column_name).cast(FloatType()), 6)


def _format_as_utc8(timestamp_col):
    """Apply from_utc_timestamp(..., "UTC+8") to a timestamp column and format it as text."""
    # NOTE(review): date_format presumably renders in the Spark session time
    # zone, so the offset printed by 'Z' may not be +0800 — confirm against the
    # session configuration.
    return sf.date_format(
        sf.from_utc_timestamp(timestamp_col, "UTC+8"),
        'yyyy-MM-dd HH:mm:ss.SSSSSS Z',
    )


def _epoch_micros_to_timestamp(column_name):
    """Convert the named epoch column (treated as microseconds) to a timestamp column.

    The value is cast to string, right-padded with '0' to 16 characters, read
    back as a long, divided by 1000000 and cast to timestamp.
    """
    padded_digits = sf.rpad(sf.col(column_name).cast(StringType()), 16, '0')
    return (padded_digits.cast(LongType()) / 1000000).cast("timestamp")


df_final = df_flattened.withColumns({
    # Integer-typed columns
    'Unnamed: 0': sf.col("Unnamed: 0").cast(IntegerType()),
    'is_fraud': sf.col("is_fraud").cast(IntegerType()),
    'city_pop': sf.col("city_pop").cast(IntegerType()),

    # Amounts and coordinates
    'amt': _as_rounded_float("amt"),
    'merch_lat': _as_rounded_float("merch_lat"),
    'merch_long': _as_rounded_float("merch_long"),
    'lat': _as_rounded_float("lat"),
    'long': _as_rounded_float("long"),

    # Transaction time: cast to timestamp, then shift and format
    'trans_date_trans_time': _format_as_utc8(
        sf.col("trans_date_trans_time").cast("timestamp")
    ),

    # Merchant times: epoch microseconds -> timestamp, then shift and format
    'merch_eff_time': _format_as_utc8(_epoch_micros_to_timestamp("merch_eff_time")),
    'merch_last_update_time': _format_as_utc8(
        _epoch_micros_to_timestamp("merch_last_update_time")
    ),
})


# Treat placeholder text as missing data in every string-typed column.
# The column list is derived from the schema, so nothing is hard-coded.
string_columns = [
    field.name
    for field in df_final.schema.fields
    if field.dataType.typeName() == 'string'
]


def _placeholder_to_null(column_name):
    """Return the named column with "na"/"null" (any letter case) and "" replaced by null."""
    value = sf.col(column_name)
    lowered = sf.lower(value)
    is_placeholder = (lowered == "na") | (lowered == "null") | (value == "")
    return sf.when(is_placeholder, None).otherwise(value)


# One replacement expression per string column, applied in a single pass.
null_handling_dict = {name: _placeholder_to_null(name) for name in string_columns}

df_final = df_final.withColumns(null_handling_dict)


## Display cleaned data

# Print the first 40 rows with full (untruncated) cell contents.
# NOTE(review): the printed columns include cc_num, names, street and dob in
# clear text — confirm this output is acceptable before sharing the notebook.
df_final.show(n=40, truncate=False)

# Print the schema so the resulting column types can be checked.
df_final.printSchema()


                                                                                

+----------+--------------------------------+-------------------+----------------------------------------+-------------+------+-----------+----------+------+------------------------------+------------------------+-----+-----+-------+---------+--------+---------------------------------------------+----------+--------------------------------+---------+-----------+--------+-------------+--------------------------------+--------------------------------+-----------+
|Unnamed: 0|trans_date_trans_time           |cc_num             |merchant                                |category     |amt   |first      |last      |gender|street                        |city                    |state|zip  |lat    |long     |city_pop|job                                          |dob       |trans_num                       |merch_lat|merch_long |is_fraud|merch_zipcode|merch_last_update_time          |merch_eff_time                  |cc_bic     |
+----------+--------------------------------+-------------------+---

Handling PII data

In [None]:
# Direct identifiers: names of the columns classified as direct identifiers.
direct_identifiers = ["cc_num"]

Infer insights from the data