In [0]:
# Source file location and format.
#
# Kaggle dataset --- https://www.kaggle.com/datasets/dhanushnarayananr/credit-card-fraud

file_location = "/FileStore/tables/card_data.csv"
file_type = "csv"

# CSV options
# Schema inference is disabled, so every column is loaded as a string;
# the numeric cast happens later in the notebook.
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

# DataFrames are immutable: repartition() returns a NEW DataFrame, so the
# result has to be assigned back or the call has no effect.
df = df.repartition(4)

display(df)

In [0]:
# Register the loaded DataFrame as a temporary view so it can be queried
# from SQL cells in this notebook.

temp_table_name = "card_data_csv"

df.createOrReplaceTempView(name=temp_table_name)

In [0]:
%sql

/* Preview all rows and columns of the temp view registered from the DataFrame */

SELECT *
FROM `card_data_csv`

In [0]:
# A temp view is only visible from this notebook. To let other users and
# other notebooks query the data, save the DataFrame as a table instead.
# A saved table persists across cluster restarts.
# To enable this, pick a table name below and uncomment the last line.

permanent_table_name = "card_data_csv"

# df.write.format("parquet").saveAsTable(permanent_table_name)

In [0]:
# Dataset shape: number of rows, then number of columns
n_rows = df.count()
n_cols = len(df.columns)
print(n_rows, n_cols)

In [0]:
# Per-column count of missing values (NaN or NULL)
from pyspark.sql.functions import col,isnan, when, count

# One aggregate expression per column: count() only counts the rows where
# when(...) produced a non-null value, i.e. rows that are NaN or NULL.
missing_value_counts = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in df.columns
]

df.select(missing_value_counts).display()

In [0]:
#describing numerical columns
from pyspark.sql.functions import col

# The CSV is read with inferSchema disabled, so every column is a string.
# Cast to double first so the statistics are numeric (mean, stddev, quartiles)
# rather than string statistics.
numeric_df = df.select([col(name).cast("double").alias(name) for name in df.columns])

# Compute the summary in Spark instead of pulling the whole dataset to the
# driver with toPandas(); this also avoids binding the name `pd`, which is
# the conventional alias for the pandas module.
numeric_df.summary().display()

In [0]:
# Rename the target column "fraud" to "label", the default label column
# name used by the Spark ML estimator and evaluator later in the notebook.
df = df.withColumnRenamed(existing="fraud", new="label")

In [0]:
# Number of fraudulent and non-fraudulent records
label_counts = df.groupBy("label").count()
label_counts.display()

In [0]:
from pyspark.sql.functions import expr

# Keep only the rows labelled as fraud
is_fraud = expr("label = 1")
fraud_df = df.filter(is_fraud)

fraud_count = fraud_df.count()
print("Fraud Count --- " + str(fraud_count))

In [0]:
# Keep only the rows labelled as not fraud
is_not_fraud = expr("label = 0")
no_fraud_df = df.filter(is_not_fraud)

not_fraud_count = no_fraud_df.count()
print("Not Fraud Count --- " + str(not_fraud_count))

In [0]:
# Balance the classes by taking (up to) 60k records from each of the fraud
# and not-fraud DataFrames.

# Fraudulent records: seeded 80% sample without replacement, then the first
# 60000 sampled rows are brought to the driver and turned back into a DataFrame.
col_names = df.columns
fraud_sample_list = (
    fraud_df
    .sample(withReplacement=False, fraction=0.8, seed=2321)
    .take(60000)
)
fraud_sample_df = spark.createDataFrame(data=fraud_sample_list, schema=col_names)
display(fraud_sample_df)

In [0]:
# Non-fraudulent records: same seeded 80% sample without replacement, capped
# at 60000 rows, so both classes end up with comparable sizes.
col_names = df.columns
not_fraud_sample_list = (
    no_fraud_df
    .sample(withReplacement=False, fraction=0.8, seed=2321)
    .take(60000)
)
not_fraud_sample_df = spark.createDataFrame(data=not_fraud_sample_list, schema=col_names)
display(not_fraud_sample_df)

In [0]:
# Number of sampled records per class: fraud first, then not fraud
fraud_sample_count = fraud_sample_df.count()
not_fraud_sample_count = not_fraud_sample_df.count()
print(fraud_sample_count, not_fraud_sample_count)

In [0]:
# Stack the two class samples into a single balanced dataset
dataframe = fraud_sample_df.union(not_fraud_sample_df)

total_rows = dataframe.count()
print(total_rows)

In [0]:
# Cast every column (features and label) to float
from pyspark.sql.functions import col

cols = dataframe.columns

# One projection that casts each column in place, keeping names and order
dataframe = dataframe.select(
    [col(col_name).cast('float').alias(col_name) for col_name in cols]
)

dataframe.printSchema()

In [0]:
# Pearson correlation between all columns (features and label)
from pyspark.ml.stat import Correlation
from pyspark.ml.linalg import DenseMatrix, Vectors
from pyspark.ml.feature import VectorAssembler
# NOTE(review): this wildcard import shadows Python builtins such as sum, max,
# min, abs and round for the rest of the notebook, and nothing in this cell
# needs it. Kept only in case later cells rely on it; prefer explicit imports.
from pyspark.sql.functions import *


# Correlation.corr works on a single vector column, so pack all columns into one
assembler = VectorAssembler(inputCols=dataframe.columns, outputCol="features",handleInvalid='keep')
assembled_df = assembler.transform(dataframe).select("features")

correlation = Correlation.corr(assembled_df,"features","pearson").collect()

# The result is a single row holding one matrix; turn it into a list of rows.
# Row i of the matrix corresponds to dataframe.columns[i], in the same order as the headers.
rows = correlation[0][0].toArray().tolist()
corr_df = spark.createDataFrame(rows,dataframe.columns)

corr_df.display()
"""
    Since all off-diagonal values are less than 0.8 and greater than -0.8, no two columns are
    strongly correlated with each other, so no column has to be dropped because of multicollinearity
"""

In [0]:
# Seeded 70/30 split into training and test sets
split_weights = [0.7, 0.3]
train_df, test_df = dataframe.randomSplit(split_weights, 2321)

In [0]:
# Size of the training set, then of the test set
train_rows = train_df.count()
test_rows = test_df.count()
print(train_rows, test_rows)

In [0]:
from pyspark.ml.feature import VectorAssembler

# Every column except the target is a feature
cols_without_label = list(train_df.columns)
cols_without_label.remove("label")

# One assembler, applied to both splits, packs the features into "cols_vector"
feature_assembler = VectorAssembler(
    inputCols=cols_without_label,
    outputCol="cols_vector",
    handleInvalid='keep',
)
assembled_train_df = feature_assembler.transform(train_df)
assembled_test_df = feature_assembler.transform(test_df)

In [0]:
from pyspark.ml.feature import MinMaxScaler

# Fit the scaler on the TRAINING data only and reuse the fitted model for the
# test data. Fitting a second scaler on the test set would rescale it with its
# own min/max, so the test features would not be on the scale the classifier
# was trained on, and test-set statistics would leak into preprocessing.
scaler_model = MinMaxScaler(inputCol="cols_vector",outputCol="cols_scaled").fit(assembled_train_df)

scaled_train_df = scaler_model.transform(assembled_train_df)
scaled_test_df = scaler_model.transform(assembled_test_df)

In [0]:
# Inspect the training set, including the assembled and scaled vector columns
display(scaled_train_df)

In [0]:
from pyspark.ml.classification import LinearSVC

# Linear support vector classifier trained on the scaled feature vector
lsvc = LinearSVC(
    maxIter=10000,
    labelCol="label",
    featuresCol="cols_scaled",
)
model = lsvc.fit(scaled_train_df)

In [0]:
# Score the held-out test set with the trained model
predictions = model.transform(dataset=scaled_test_df)

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sklearn.metrics import confusion_matrix

# Accuracy on the test set; the label/prediction columns are named explicitly
# (these are also the evaluator's defaults).
evaluator=MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
acc = evaluator.evaluate(predictions)

print("Prediction Accuracy: ", acc)

# Collect labels and predictions together in ONE action. Two separate
# collect() calls are two independent Spark jobs whose row order is not
# guaranteed to match, which could misalign labels and predictions.
label_prediction_rows = predictions.select("label", "prediction").collect()

# Unpack the Row objects into flat lists of scalars for sklearn
y_orig = [row["label"] for row in label_prediction_rows]
y_pred = [row["prediction"] for row in label_prediction_rows]

# Rows of the matrix are true labels, columns are predicted labels
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)
