In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Create a SparkSession
spark = SparkSession.builder \
    .appName("CC_Fraud") \
    .config("spark.executor.instances", "2") \
    .getOrCreate()


In [11]:
# Load your dataset into a Spark DataFrame
df = spark.read.csv("./data/clean_train.csv", header=True, inferSchema=True)
df = df.drop("_c0", "trans_num")
# Define your features and target column
feature_columns = [col for col in df.columns if col != "is_fraud"]
label_column = "is_fraud"

In [12]:
df.head(2)

[Row(amt=85.75, lat=36.4899, long=-79.4736, city_pop=3402, unix_time=1341848490, merch_lat=36.974911, merch_long=-80.4626, is_fraud=0, merchant_label=260, category_label=5, gender_label=1, job_label=251),
 Row(amt=8.45, lat=40.817, long=-74.0, city_pop=13835, unix_time=1344061258, merch_lat=41.076082, merch_long=-74.337634, is_fraud=0, merchant_label=421, category_label=11, gender_label=1, job_label=368)]

In [13]:
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
classifier = RandomForestClassifier(featuresCol="features", labelCol=label_column, numTrees=10, maxBins=700)

pipeline = Pipeline(stages=[assembler, classifier])

In [14]:
# Split data into training and test sets
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

In [15]:
df.head()

Row(amt=85.75, lat=36.4899, long=-79.4736, city_pop=3402, unix_time=1341848490, merch_lat=36.974911, merch_long=-80.4626, is_fraud=0, merchant_label=260, category_label=5, gender_label=1, job_label=251)

In [16]:
# Train the pipeline
model = pipeline.fit(train_data)

In [17]:
# Make predictions on test data
predictions = model.transform(test_data)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol=label_column, predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)


Accuracy: 0.9951976900889328


In [18]:
model.save(r"C:\Users\saritajoshi\Documents\Repos\Credit-Card-Fraud-Detection-Spark\model")

In [9]:
Stop SparkSession
spark.stop()