Importing Kaggle Dataset

In [57]:
from kagglehub import dataset_download
import pandas as pd
#remove warnings
import warnings
warnings.filterwarnings("ignore")

from pyspark.sql import SparkSession
import pyspark.pandas as ps
from pyspark.sql.functions import col, from_json, to_timestamp, udf
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, LongType
import re

# Resolve the local directory that holds the Kaggle sample dataset and echo it,
# so the data location is visible in the cell output.
path: str = dataset_download("jinquan/cc-sample-data")
print(path)

# Single Spark session for the notebook; getOrCreate() hands back the active
# session when one already exists instead of building a second one.
spark = (
    SparkSession.builder
    .appName("payNet")
    .getOrCreate()
)


/home/jeevin/.cache/kagglehub/datasets/jinquan/cc-sample-data/versions/1


Load JSON

In [63]:
# Load the JSON dataset into a Spark DataFrame; the schema is inferred by Spark.
df = spark.read.json(path)

# Preview a handful of rows and the inferred schema.
df.show(5)
df.printSchema()

# A bare `df.columns` expression is only displayed when it is the last line of
# a cell; more code follows here, so print it explicitly to list the columns.
print(df.columns)

def clean_json_string(json_str):
    """
    Normalise a JSON document whose nested objects were stored as escaped strings.

    Example: ``{"address":"{\\"city\\":\\"X\\"}"}`` becomes ``{"address":{"city":"X"}}``.

    The input is parsed as JSON, every string value that itself holds a JSON
    object is decoded in place (recursively), and the result is serialised
    again in compact form. Parsing instead of blindly deleting backslashes keeps
    legitimate escapes inside values intact (``\\"``, ``\\\\``, ``\\n``,
    ``\\uXXXX``) and leaves ordinary strings such as ``"{hello}"`` untouched.

    If the input is not valid JSON, the function falls back to the legacy
    text-based clean-up:
    1. Removing all backslashes
    2. Removing quotes around JSON objects (e.g., "{ }" becomes { })

    Args:
        json_str: Raw JSON text, or None.

    Returns:
        The cleaned JSON text, or None when ``json_str`` is None.
    """
    # Imported locally so the function carries its own dependency when it is
    # shipped to Spark workers as a UDF.
    import json

    if json_str is None:
        return None

    def _unwrap(value):
        """Recursively replace string-encoded JSON objects with real objects."""
        if isinstance(value, dict):
            return {key: _unwrap(item) for key, item in value.items()}
        if isinstance(value, list):
            return [_unwrap(item) for item in value]
        if isinstance(value, str):
            candidate = value.strip()
            # Only strings shaped like an object are worth a parse attempt.
            if candidate.startswith("{") and candidate.endswith("}"):
                try:
                    return _unwrap(json.loads(candidate))
                except ValueError:
                    # Looks like an object but is not JSON: keep it as text.
                    return value
        return value

    try:
        decoded = json.loads(json_str)
    except ValueError:
        # Legacy best-effort path for input that is not parseable JSON.
        cleaned = json_str.replace("\\", "")
        return re.sub(r'"\s*\{\s*(.*?)\s*\}\s*"', r'{\1}', cleaned)

    # Compact separators match the layout of the typical (whitespace-free)
    # input; ensure_ascii=False keeps non-ASCII characters readable.
    return json.dumps(_unwrap(decoded), separators=(",", ":"), ensure_ascii=False)


# Schema for the address object nested within personal_detail.
#
# Every leaf is declared as a string on purpose: in the source data the values
# are quoted JSON strings (e.g. "zip":"28654"), and Spark's JSON parser does not
# coerce quoted text into IntegerType/FloatType fields -- such fields come back
# as null. Reading them as strings keeps the data; cast to numeric types after
# parsing where needed (e.g. col("...").cast("int")).
# Keeping zip as a string also preserves leading zeros in postal codes.
address_schema = StructType([
    StructField("street", StringType(), True),
    StructField("city", StringType(), True),
    StructField("state", StringType(), True),
    StructField("zip", StringType(), True),
])

# Schema for personal_detail. lat/long/city_pop are quoted in the source JSON
# as well, so they are read as strings for the same reason (and lat/long are
# now declared consistently with each other).
personal_schema = StructType([
    StructField("person_name", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("address", address_schema, True),  # nested struct, see above
    StructField("lat", StringType(), True),
    StructField("long", StringType(), True),
    StructField("city_pop", StringType(), True),
    StructField("job", StringType(), True),
    StructField("dob", StringType(), True),
])

[Stage 69:>                                                       (0 + 12) / 12]

+----------+------+-------------+-----------+----------------+--------+----------------+----------------------+------------------+-----------+-------------+--------------------+--------------------+---------------------+--------------------+
|Unnamed: 0|   amt|     category|     cc_bic|          cc_num|is_fraud|  merch_eff_time|merch_last_update_time|         merch_lat| merch_long|merch_zipcode|            merchant|     personal_detail|trans_date_trans_time|           trans_num|
+----------+------+-------------+-----------+----------------+--------+----------------+----------------------+------------------+-----------+-------------+--------------------+--------------------+---------------------+--------------------+
|         0|  4.97|     misc_net|CITIUS33CHI|2703186189652095|       0|1325376018798532|         1325376018666|         36.011293| -82.048315|        28705|fraud_Rippin, Kub...|{"person_name":"J...|  2019-01-01 00:00:18|0b242abb623afc578...|
|         1|107.23|  grocery_pos

                                                                                

In [64]:
# Wrap the Python cleaner as a Spark UDF that returns a string.
clean_json_udf = udf(clean_json_string, StringType())

# Clean the personal_detail column so nested objects are real JSON objects
# rather than escaped strings. Other columns with the same problem can be
# cleaned the same way:
# df_cleaned = df_cleaned.withColumn("other_column", clean_json_udf(col("other_column")))
df_cleaned = df.withColumn("personal_detail", clean_json_udf(col("personal_detail")))

# Inspect the cleaned JSON text before parsing it.
df_cleaned.select("personal_detail").show(truncate=False)

# Parse the cleaned JSON text into a struct column using the declared schema.
df_with_parsed_personal = df_cleaned.withColumn(
    "personal_detail",
    from_json(col("personal_detail"), personal_schema),
)

# Flatten the personal_detail struct (and its nested address struct) into
# top-level columns.
df_final = df_with_parsed_personal.select(
    # Original columns (excluding personal_detail)
    col("Unnamed: 0"),
    col("trans_date_trans_time"),
    col("cc_bic"),
    col("cc_num"),
    col("merchant"),
    col("category"),
    col("amt"),
    col("trans_num"),
    col("merch_lat"),
    col("merch_long"),
    col("is_fraud"),
    col("merch_zipcode"),
    col("merch_eff_time"),
    col("merch_last_update_time"),

    # Flattened personal details.
    # Explicit casts make the numeric columns numeric regardless of whether the
    # parsed struct exposes them as text or as numbers (casting a column to the
    # type it already has is a no-op), and give person_lat the same type as
    # person_long.
    col("personal_detail.person_name").alias("person_name"),
    col("personal_detail.gender").alias("gender"),
    col("personal_detail.lat").cast("float").alias("person_lat"),
    col("personal_detail.long").cast("float").alias("person_long"),
    col("personal_detail.city_pop").cast("int").alias("city_pop"),
    col("personal_detail.job").alias("job"),
    col("personal_detail.dob").alias("dob"),

    # Flattened address details (zip is deliberately left uncast: postal codes
    # are identifiers, not quantities).
    col("personal_detail.address.street").alias("person_street"),
    col("personal_detail.address.city").alias("person_city"),
    col("personal_detail.address.state").alias("person_state"),
    col("personal_detail.address.zip").alias("person_zip"),
)

# Show final result
df_final.show()

# Show schema to verify structure
df_final.printSchema()

# Save the cleaned data
# df_final.write.mode("overwrite").json("cleaned_output_path")

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|personal_detail                                                                                                                                                                                                                                                       |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"person_name":"Jennifer,Banks,eeeee","gender":"F","address":{"street":"561 Perry Cove","city":"Moravian Falls","state":"NC","zip":"28654"},"lat":"36.0788","long":"-81.1781","city_pop":"3495","job":"Psych