In [0]:
# Load CSV files
user_df = spark.read.csv("dbfs:/FileStore/shared_uploads/varshinie.1006@gmail.com/user.csv", header=True, inferSchema=True)
transaction_df = spark.read.csv("dbfs:/FileStore/shared_uploads/varshinie.1006@gmail.com/transaction.csv", header=True, inferSchema=True)
fraud_df = spark.read.csv("dbfs:/FileStore/shared_uploads/varshinie.1006@gmail.com/fraud.csv", header=True, inferSchema=True)

# Display DataFrames
user_df.show()
transaction_df.show()
fraud_df.show()


+------+-------------+-------------------+------------+-------------+-------------------+
|UserID|     UserName|              Email| PhoneNumber|      Address| AccountCreatedDate|
+------+-------------+-------------------+------------+-------------+-------------------+
|     1|Alice Johnson|  alice@example.com|123-456-7890| 123 Maple St|2023-01-15 08:30:00|
|     2|    Bob Smith|    bob@example.com|098-765-4321|   456 Oak St|2022-11-20 10:00:00|
|     3|Charlie Brown|charlie@example.com|555-555-5555|  789 Pine St|2023-06-01 09:45:00|
|     4|  David Green|  david@example.com|222-333-4444| 101 Birch St|2022-09-15 10:00:00|
|     5|  Emily White|   mily@example.com|333-444-5555| 202 Cedar St|2023-02-05 11:30:00|
|     6|    Frank Lee|  frank@example.com|444-555-6666|303 Walnut St|2022-12-25 16:00:00|
|     7|    Grace Kim|  grace@example.com|555-666-7777|   404 Ash St|2023-03-18 14:20:00|
+------+-------------+-------------------+------------+-------------+-------------------+

+--------

In [0]:
#EDA
# Show schema to inspect data types
transaction_df.printSchema()

# Descriptive statistics for transaction data
transaction_df.describe().show()

# Distribution of fraud labels (0 = legitimate, 1 = fraud)
transaction_df.groupBy("FraudLabel").count().show()

# Check for missing values
from pyspark.sql.functions import col, sum

transaction_df.select([sum(col(c).isNull().cast("int")).alias(c) for c in transaction_df.columns]).show()


root
 |-- TransactionID: integer (nullable = true)
 |-- UserID: double (nullable = true)
 |-- TransactionAmount: double (nullable = true)
 |-- TransactionLocation: string (nullable = true)
 |-- TransactionTime: timestamp (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- TransactionStatus: string (nullable = true)
 |-- FraudLabel: double (nullable = true)

+-------+-----------------+------------------+-----------------+-------------------+-------------+-----------------+------------------+
|summary|    TransactionID|            UserID|TransactionAmount|TransactionLocation|PaymentMethod|TransactionStatus|        FraudLabel|
+-------+-----------------+------------------+-----------------+-------------------+-------------+-----------------+------------------+
|  count|               12|                12|               12|                 12|           12|               12|                12|
|   mean|           1006.5|3.8333333333333335|         2092.125|               n

In [0]:
# Join transaction data with fraud data using TransactionID
labeled_data = transaction_df.join(fraud_df, "TransactionID", how="left")

# Display the resulting labeled dataset
labeled_data.show()


+-------------+------+-----------------+-------------------+-------------------+-------------+-----------------+----------+-------+--------------------+-------------------+
|TransactionID|UserID|TransactionAmount|TransactionLocation|    TransactionTime|PaymentMethod|TransactionStatus|FraudLabel|LabelID|         FraudReason|        DateLabeled|
+-------------+------+-----------------+-------------------+-------------------+-------------+-----------------+----------+-------+--------------------+-------------------+
|         1001|   1.0|            200.5|           New York|2023-09-01 12:30:00|  Credit Card|        Completed|       0.0|   null|                null|               null|
|         1002|   1.0|           5000.0|        Los Angeles|2023-09-02 13:15:00|   Debit Card|        Completed|       1.0|      1|Suspiciously High...|2023-09-02 14:00:00|
|         1003|   2.0|            150.0|      San Francisco|2023-09-03 15:45:00|       PayPal|          Pending|       0.0|   null|    

In [0]:
#preprocessing data
# Impute missing values (if any)
from pyspark.sql.functions import when

labeled_data = labeled_data.fillna({"TransactionAmount": 0, "TransactionLocation": "Unknown"})

# encoding for categorical variables 
from pyspark.ml.feature import StringIndexer, OneHotEncoder

indexer = StringIndexer(inputCol="PaymentMethod", outputCol="PaymentMethodIndex")
encoder = OneHotEncoder(inputCol="PaymentMethodIndex", outputCol="PaymentMethodVec")

# Assemble features
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=["TransactionAmount", "PaymentMethodVec"], outputCol="features")


In [0]:
#feature engineering
transaction_frequency = labeled_data.groupBy("UserID").count().withColumnRenamed("count", "TransactionFrequency")
labeled_data = labeled_data.join(transaction_frequency, "UserID", how="left")
labeled_data.show()

average_transaction_amount = labeled_data.groupBy("UserID").avg("TransactionAmount").withColumnRenamed("avg(TransactionAmount)", "AvgTransactionAmount")
labeled_data = labeled_data.join(average_transaction_amount, "UserID", how="left")
labeled_data.show()

from pyspark.sql.window import Window
from pyspark.sql.functions import lag, unix_timestamp

window = Window.partitionBy("UserID").orderBy("TransactionTime")

# Compute time difference in seconds between consecutive transactions
labeled_data = labeled_data.withColumn("PrevTransactionTime", lag("TransactionTime", 1).over(window))
labeled_data = labeled_data.withColumn("TimeBetweenTransactions", 
    (unix_timestamp("TransactionTime") - unix_timestamp("PrevTransactionTime")).cast("double"))
labeled_data.show()

+------+-------------+-----------------+-------------------+-------------------+-------------+-----------------+----------+-------+--------------------+-------------------+--------------------+--------------------+-------------------+-----------------------+--------------------+
|UserID|TransactionID|TransactionAmount|TransactionLocation|    TransactionTime|PaymentMethod|TransactionStatus|FraudLabel|LabelID|         FraudReason|        DateLabeled|TransactionFrequency|AvgTransactionAmount|PrevTransactionTime|TimeBetweenTransactions|TransactionFrequency|
+------+-------------+-----------------+-------------------+-------------------+-------------+-----------------+----------+-------+--------------------+-------------------+--------------------+--------------------+-------------------+-----------------------+--------------------+
|   1.0|         1001|            200.5|           New York|2023-09-01 12:30:00|  Credit Card|        Completed|       0.0|   null|                null|        

In [0]:
# Split the dataset into training (80%) and test (20%)
train_data, test_data = labeled_data.randomSplit([0.8, 0.2], seed=42)

# Display the size of the training and test sets
print(f"Training Data: {train_data.count()}, Test Data: {test_data.count()}")


Training Data: 9, Test Data: 3


In [0]:
#model training
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

# Combine features into a vector
assembler = VectorAssembler(inputCols=["TransactionAmount","TransactionID"], outputCol="features")

# Initialize classifier
rf = RandomForestClassifier(labelCol="FraudLabel", featuresCol="features")

# Build pipeline
pipeline = Pipeline(stages=[assembler, rf])

# Train the model
model = pipeline.fit(train_data)


In [0]:
model.write().overwrite().save("/mnt/models/fraud_detection_model")


In [0]:
# score real-time transactions
from pyspark.ml.pipeline import PipelineModel
from pyspark.sql.functions import col

# Load the model
loaded_model = PipelineModel.load("/mnt/models/fraud_detection_model")

from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType,IntegerType

# Define the schema
schema = StructType([
    StructField("TransactionID", IntegerType(), True),
    StructField("UserID", StringType(), True),
    StructField("TransactionAmount", FloatType(), True),
    StructField("TransactionLocation", StringType(), True),
    StructField("TransactionTime", TimestampType(), True),
    StructField("PaymentMethod", StringType(), True),
    StructField("TransactionStatus", StringType(), True)
])

# Create a streaming DataFrame with the defined schema
streaming_data = spark.readStream \
    .schema(schema) \
    .option("header", "true") \
    .csv("dbfs:/FileStore/shared_uploads/varshinie.1006@gmail.com/transaction.csv") 

# Predict fraud in real-time data
predictions = loaded_model.transform(streaming_data)
