In [1]:
// Import libraries...
%use dataframe, kandy

## Overview of the provided dataset

A general overview of the provided dataset, to gain an understanding of the data.

In [2]:
import kotlinx.datetime.*

// Raw transactions as read from the CSV file.
val dataset = DataFrame.readCSV("dataset/bank_transactions_data_2.csv", delimiter = ',')

// Time zone used to turn the local timestamps into instants. It is resolved once
// here rather than once per row inside the "TimeGapSeconds" expression.
val usTimeZone = TimeZone.of("America/New_York")

// `df` is `dataset` plus three derived columns:
//   DayOfWeek      - name of the weekday of "TransactionDate" (e.g. "MONDAY")
//   HourOfDay      - hour of "TransactionDate" in 24h format
//   TimeGapSeconds - seconds from "TransactionDate" until "PreviousTransactionDate"
val df = dataset
    .add("DayOfWeek") { row ->
        // NOTE: Day of the week the transaction occurred.
        val transactionDate = row["TransactionDate"] as LocalDateTime
        transactionDate.dayOfWeek.name
    }.add("HourOfDay") { row ->
        // NOTE: Hour of day the transaction occurred. (in 24h format)
        val transactionDate = row["TransactionDate"] as LocalDateTime
        transactionDate.hour
    }.add("TimeGapSeconds") { row ->
        // NOTE: Time gap between current transaction and previous transaction (output in seconds).
        // Both timestamps are interpreted in `usTimeZone`, so daylight-saving shifts are
        // reflected in the result.
        val transactionDate = (row["TransactionDate"] as LocalDateTime).toInstant(usTimeZone)
        val prevTransactionDate = (row["PreviousTransactionDate"] as LocalDateTime).toInstant(usTimeZone)

        // Measured from the current transaction towards the previous one: the value is
        // positive when "PreviousTransactionDate" is the later of the two timestamps.
        transactionDate.until(prevTransactionDate, DateTimeUnit.SECOND, usTimeZone)
    }

df


TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate,DayOfWeek,HourOfDay,TimeGapSeconds
TX000001,AC00128,14.09,2023-04-11T16:29:14,Debit,San Diego,D000380,162.198.218.92,M015,ATM,70,Doctor,81,1,5112.21,2024-11-04T08:08:08,TUESDAY,16,49480734
TX000002,AC00455,376.24,2023-06-27T16:44:19,Debit,Houston,D000051,13.149.61.4,M052,ATM,68,Doctor,141,1,13758.91,2024-11-04T08:09:35,TUESDAY,16,42827116
TX000003,AC00019,126.29,2023-07-10T18:16:08,Debit,Mesa,D000235,215.97.143.157,M009,Online,19,Student,56,1,1122.35,2024-11-04T08:07:04,MONDAY,18,41698256
TX000004,AC00070,184.5,2023-05-05T16:32:11,Debit,Raleigh,D000187,200.13.225.150,M002,Online,26,Student,25,1,8569.06,2024-11-04T08:09:06,FRIDAY,16,47407015
TX000005,AC00411,13.45,2023-10-16T17:51:24,Credit,Atlanta,D000308,65.164.3.100,M091,Online,26,Student,198,1,7429.4,2024-11-04T08:06:39,MONDAY,17,33232515
TX000006,AC00393,92.15,2023-04-03T17:15:01,Debit,Oklahoma City,D000579,117.67.192.211,M054,ATM,18,Student,172,1,781.68,2024-11-04T08:06:36,MONDAY,17,50169095
TX000007,AC00199,7.08,2023-02-15T16:36:48,Credit,Seattle,D000241,140.212.253.222,M019,ATM,37,Doctor,139,1,13316.71,2024-11-04T08:10:09,WEDNESDAY,16,54228801
TX000008,AC00069,171.42,2023-05-08T17:47:59,Credit,Indianapolis,D000500,92.214.76.157,M020,Branch,67,Retired,291,1,2796.24,2024-11-04T08:10:55,MONDAY,17,47143376
TX000009,AC00135,106.23,2023-03-21T16:59:46,Credit,Detroit,D000690,24.148.92.177,M035,Branch,51,Engineer,86,1,9095.14,2024-11-04T08:11:14,TUESDAY,16,51293488
TX000010,AC00385,815.96,2023-03-31T16:06:57,Debit,Nashville,D000199,32.169.88.41,M007,ATM,55,Doctor,120,1,1021.88,2024-11-04T08:06:32,FRIDAY,16,50432375


In [3]:
// Frequency table of the "Location" column; the plot below reads its
// "Location" and "count" columns.
val locations = df["Location"].valueCounts()

// Bar chart with the transaction count on x and the location on y.
locations.plot {
    layout {
        title = "Transactions by Location"
        size = 700 to 450
        // The fill colour repeats what the y axis already shows, so the legend is hidden.
        style {
            legend.position = LegendPosition.None
        }
    }

    x("count")
    y("Location")

    bars {
        borderLine.width = 0.0
        fillColor("Location")
    }
}

In [4]:
// Stacked histogram of "TransactionAmount" (20 bins), grouped and coloured by
// "TransactionType".
df.groupBy("TransactionType").plot {
    layout {
        title = "Transaction Amount Histplot"
        xAxisLabel = "Transaction Amount"
        yAxisLabel = "Number of Transactions"
    }

    statBin("TransactionAmount", binsOption = BinsOption.byNumber(20)) {
        bars {
            x(Stat.x)
            y(Stat.count)
            fillColor("TransactionType")
            alpha = 0.8
            position = Position.stack()

            // Line layer over the same bin statistics; it is declared inside the bars block.
            line {
                x(Stat.x)
                y(Stat.count)
                fillColor("TransactionType")
                type = LineType.SOLID
                width = 1.5
            }
        }
    }
}

In [5]:
// Sort key for every weekday name, using the ISO day number (MONDAY = 1 .. SUNDAY = 7).
// Derived from the DayOfWeek enum so that SATURDAY and SUNDAY are covered as well;
// a hand-written Monday-to-Friday map would give weekend rows a null key.
val dayOrder = java.time.DayOfWeek.values().associate { day -> day.name to day.value }

// Copy of `df` with a numeric "DayOrder" column, sorted Monday-first.
val sortedDf = df.add("DayOrder") { row ->
    dayOrder[row["DayOfWeek"] as String]
}.sortBy("DayOrder")

// Number of transactions per weekday.
sortedDf.plot {
    countPlot("DayOfWeek") {
        fillColor(Stat.x) {
            // The x axis already names each day, so the fill legend is switched off.
            legend.type = LegendType.None
        }
        x.axis.name = "Day Of Week"
    }
}

The dataset does not contain any transactions on Saturdays or Sundays, and Monday seems to have the highest number of transactions.

> This probably implies that transaction tracking is closed during weekends?

In [6]:
// Copy of `df` ordered by "HourOfDay". Note that the plot below is built from
// `df` itself, not from this sorted copy.
val sortedDf = df.sortBy("HourOfDay")

// Density-scaled histogram of "HourOfDay" with a density curve drawn on top of it.
df.plot {
    layout {
        title = "Transaction Frequency By Hour of Day"
        xAxisLabel = "Hour of Day"
        yAxisLabel = "Frequency (Density)"
    }

    histogram("HourOfDay") {
        // Use the density statistic on y so the bars share a scale with the density curve.
        y(Stat.density)
        fillColor = Color.BLUE
        alpha = 0.9
    }

    densityPlot("HourOfDay") {
        fillColor = Color.hex(0xFF6666)
        alpha = 0.5
    }
}

The dataset seems to only contain transactions at three points in the day.

In [7]:
// `df` plus a "TimeGapMinutes" column: "TimeGapSeconds" converted to whole minutes.
val dfWithMinutesTimeGap = df.add("TimeGapMinutes") { row ->
    val gapSeconds = row["TimeGapSeconds"] as Long
    // Long division: any remainder of seconds is discarded.
    gapSeconds / 60
}

// Stacked histogram of the time gap (30 bins), grouped and coloured by "TransactionType".
dfWithMinutesTimeGap.groupBy("TransactionType").plot {
    layout {
        title = "Transaction TimeGaps"
        xAxisLabel = "Time Gap (Minutes)"
        yAxisLabel = "Transaction Frequency"
    }

    statBin("TimeGapMinutes", binsOption = BinsOption.byNumber(30)) {
        bars {
            x(Stat.x)
            y(Stat.count)
            fillColor("TransactionType")
            alpha = 0.8
            position = Position.stack()

            // Line layer over the same bin statistics; it is declared inside the bars block.
            line {
                x(Stat.x)
                y(Stat.count)
                fillColor("TransactionType")
                type = LineType.SOLID
                width = 1.5
            }
        }
    }
}

In [8]:
// Stacked histogram of "CustomerAge" (15 bins), grouped and coloured by "TransactionType".
df.groupBy("TransactionType").plot {
    layout.title = "Customer Age Distribution By Transaction Type"
    // The label previously ended with a stray ")".
    layout.xAxisLabel = "Customer Age"
    layout.yAxisLabel = "Count"

    statBin("CustomerAge", binsOption = BinsOption.byNumber(15)) {
        bars {
            alpha = 0.8
            x(Stat.x)
            y(Stat.count)
            position = Position.stack()
            fillColor("TransactionType") {
            }

            // Line layer over the same bin statistics; it is declared inside the bars block.
            line {
                x(Stat.x)
                y(Stat.count)
                width = 1.5
                type = LineType.SOLID
                fillColor("TransactionType") {
                }
            }
        }
    }
}

In [9]:
// Frequency table of the "MerchantID" column; the plot below reads its
// "MerchantID" and "count" columns.
val merchantCounts = df["MerchantID"].valueCounts()

// Bar chart with the transaction count on x and the merchant on y.
merchantCounts.plot {
    layout {
        title = "Common Merchants"
        size = 900 to 1000
        // The fill colour repeats what the y axis already shows, so the legend is hidden.
        style {
            legend.position = LegendPosition.None
        }
    }

    x("count")
    y("MerchantID")

    bars {
        borderLine.width = 0.0
        fillColor("MerchantID")
    }
}

In [10]:
// The "CustomerOccupation" column, used as the input of the count statistic below.
val customerOccuptations = df["CustomerOccupation"]

// Pie chart: one slice per occupation, sized by the number of rows with that occupation.
df.plot {
    layout {
        title = "Customer Occupation Distribution"
        style(Style.Void)
    }

    statCount(customerOccuptations) {
        pie {
            size = 25.0
            slice(Stat.count)
            // Name the fill mapping so it is labelled "CustomerOccupation".
            fillColor(Stat.x named "CustomerOccupation")
        }
    }
}

The dataset contains only 4 occupations, and transactions are distributed mostly evenly among them.

In [11]:
// Density curve of the "AccountBalance" column.
df.plot {
    densityPlot("AccountBalance") {
        fillColor = Color.RED
    }

    layout {
        title = "Account Balance Density"
        xAxisLabel = "Account Balance"
        yAxisLabel = "Density"
    }
}

In [12]:
// True when the column's declared value type is Double, Float or Int.
// The check uses the column type instead of inspecting the first value, so it also
// works for an empty frame or a column whose first value is null. Long columns
// (such as "TimeGapSeconds") are deliberately not matched.
fun isNumericColumn(column: DataColumn<*>): Boolean =
    when (column.type().classifier) {
        Double::class, Float::class, Int::class -> true
        else -> false
    }

// Names of the numeric columns of `df`, in column order.
val numericColumns: MutableList<String> = df.columns()
    .filter { isNumericColumn(it) }
    .map { it.name() }
    .toMutableList()

// The numeric columns of `df`, selected with the same predicate as above.
val numericsDF = df.select {
    cols { isNumericColumn(it) }
}

// Pairwise correlation matrix of the numeric columns (shown as the cell output).
numericsDF.corr()

column,TransactionAmount,CustomerAge,TransactionDuration,LoginAttempts,AccountBalance,HourOfDay
TransactionAmount,1.0,-0.025616,0.004359,-0.008445,-0.025165,-0.020779
CustomerAge,-0.025616,1.0,-0.017936,0.007653,0.319942,-0.006955
TransactionDuration,0.004359,-0.017936,1.0,0.032639,0.005577,-0.006953
LoginAttempts,-0.008445,0.007653,0.032639,1.0,0.014999,0.018079
AccountBalance,-0.025165,0.319942,0.005577,0.014999,1.0,-0.000349
HourOfDay,-0.020779,-0.006955,-0.006953,0.018079,-0.000349,1.0


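Correlation matrix table; there is barely any correlation between most of the variables.
(Almost all off-diagonal values are below 0.05 in absolute value; the one exception is CustomerAge vs. AccountBalance, with a weak positive correlation of about 0.32.)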