In [1]:
using CSV
using DataFrames
using Dates
using Statistics

In [2]:
# Show the Julia version this notebook was executed with.
VERSION

v"1.4.1"

In [3]:
# List the entries of the `data` directory (relative to the current working directory).
readdir("data")

2-element Array{String,1}:
 "dff.pkl"
 "sales_data_sample.csv"

In [3]:
# Directory containing the input CSV files (absolute path on the author's machine).
folder = "/home/vaclav/Data/Kaggle/EEE-CIS_Fraud_Detection"
# Input file names inside `folder`: transactions first, identity second.
files = ["train_transaction.csv", "train_identity.csv"]

2-element Array{String,1}:
 "train_transaction.csv"
 "train_identity.csv"

In [5]:
# Full path of the first input file (the transactions CSV).
joinpath(folder, first(files))

"/home/vaclav/Data/Kaggle/EEE-CIS_Fraud_Detection/train_transaction.csv"

In [6]:
# One-off benchmark of the main steps; `s` maps step name => elapsed wall-clock seconds.
s = Dict()

# load transactions ~600MB
ts = now()
df = CSV.read(joinpath(folder, files[1]), DataFrame)
te = now()
# `te - ts` is a millisecond period; dividing by `Millisecond(1)` yields a plain number,
# which is then scaled to seconds.
time_in_sec = (te - ts) / Millisecond(1) * (1 / 1000)
push!(s, "load_transactions" => time_in_sec)

# load identity ~25MB
ts = now()
df2 = CSV.read(joinpath(folder, files[2]), DataFrame)
te = now()
time_in_sec = (te - ts) / Millisecond(1) * (1 / 1000)
push!(s, "load_identity" => time_in_sec)

# join
# `innerjoin` replaces the deprecated `join(df, df2, kind = :inner, ...)` form.
ts = now()
dff = innerjoin(df, df2, on = "TransactionID")
te = now()
time_in_sec = (te - ts) / Millisecond(1) * (1 / 1000)
push!(s, "merge" => time_in_sec)

# group by
ts = now()
grp = combine(groupby(dff, ["isFraud", "ProductCD", "card4", "card6", "id_15", "id_31"]),
    :TransactionAmt => maximum => :TransactionAmountMax,
    :TransactionAmt => mean => :TransactionAmountMean)
te = now()
time_in_sec = (te - ts) / Millisecond(1) * (1 / 1000)
push!(s, "aggregation" => time_in_sec)

# sort (in place, three different key orders, timed together)
ts = now()
sort!(dff, ["card1", "addr1", "D9"])
sort!(dff, ["addr1", "D9", "card1"])
sort!(dff, ["D9", "card1", "addr1"])
te = now()
time_in_sec = (te - ts) / Millisecond(1) * (1 / 1000)
push!(s, "sort" => time_in_sec)

│   caller = ip:0x0
└ @ Core :-1


Dict{Any,Any} with 5 entries:
  "merge"             => 6.369
  "sort"              => 6.896
  "load_transactions" => 28.601
  "aggregation"       => 6.078
  "load_identity"     => 0.307

In [7]:
# Show the collected timings as a table with one column per step.
DataFrame(s)

Unnamed: 0_level_0,aggregation,load_identity,load_transactions,merge,sort
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,6.078,0.307,28.601,6.369,6.896


In [8]:
# check the shape of the dataframes
# (rows, columns) of the joined frame followed by (rows, columns) of the aggregate
(size(dff)..., size(grp)...)

(144233, 434, 4553, 8)

## Groupby Details
https://dataframes.juliadata.org/stable/man/split_apply_combine/

In [9]:
# Split `dff` by the six key columns without aggregating anything yet.
groupby(dff, ["isFraud","ProductCD","card4","card6","id_15","id_31"])

Unnamed: 0_level_0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2
Unnamed: 0_level_1,Int64,Int64,Int64,Float64,String,Int64,Float64?
1,3067453,0,1729328,200.0,H,1030,157.0
2,3073296,0,1816710,100.0,H,1675,174.0
3,3061872,0,1642116,100.0,H,1974,111.0
4,3078930,0,1902540,100.0,H,6697,111.0
5,3038788,0,1211592,75.0,H,7508,321.0
6,3056336,0,1556463,25.0,H,9500,321.0
7,3026050,0,951882,50.0,H,9500,321.0
8,2999258,0,348274,50.0,H,10680,373.0
9,2995902,0,260116,40.0,H,12526,381.0
10,3124444,0,2767769,50.0,H,12839,321.0

Unnamed: 0_level_0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2
Unnamed: 0_level_1,Int64,Int64,Int64,Float64,String,Int64,Float64?
1,3216693,1,5439563,25.0,S,18375,174.0


In [10]:
# Column-width hint, presumably so that more columns of the result are displayed.
ENV["COLUMNS"] = 120
# Per the original note: groups are formed for `missing` key values as well.
groupby(dff, ["isFraud", "ProductCD", "card4", "card6", "id_15", "id_31"]) |>
    (groups -> combine(
        groups,
        :TransactionAmt => maximum => :TransactionAmountMax,
        :TransactionAmt => mean => :TransactionAmountMean,
    ))

Unnamed: 0_level_0,isFraud,ProductCD,card4,card6,id_15,id_31,TransactionAmountMax,TransactionAmountMean
Unnamed: 0_level_1,Int64,String,String?,String?,String?,String?,Float64,Float64
1,0,H,visa,debit,Found,firefox 57.0,300.0,71.1765
2,0,R,visa,credit,Found,ie 11.0 for desktop,1000.0,208.58
3,1,R,visa,debit,Found,chrome 63.0 for android,300.0,216.667
4,0,C,visa,credit,Found,chrome 65.0,410.373,49.3293
5,0,R,visa,debit,Found,chrome 62.0 for android,200.0,98.9583
6,1,C,mastercard,credit,New,chrome 63.0,225.504,47.2223
7,0,C,mastercard,credit,Found,chrome 62.0 for android,154.071,44.5909
8,0,C,mastercard,credit,Found,chrome 63.0,302.111,50.8962
9,1,C,mastercard,credit,Found,chrome 63.0,265.498,44.8802
10,0,C,mastercard,credit,New,chrome 63.0,302.111,48.2015


## Sorting Details

In [11]:
# Sort `dff` in place by `card1`, then `addr1`, then `D9`.
sort!(dff, ["card1","addr1","D9"])

Unnamed: 0_level_0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5
Unnamed: 0_level_1,Int64,Int64,Int64,Float64,String,Int64,Float64?,Float64?,String?,Float64?
1,3230924,0,5787419,23.443,C,1000,555.0,185.0,mastercard,224.0
2,3020767,0,842821,150.0,R,1004,583.0,150.0,visa,226.0
3,3028973,0,1022173,30.0,H,1004,583.0,150.0,visa,226.0
4,3386444,0,10082484,50.0,H,1004,583.0,150.0,visa,226.0
5,3038871,0,1212802,50.0,H,1005,543.0,150.0,mastercard,117.0
6,3234681,0,5883179,150.0,R,1006,399.0,150.0,american express,146.0
7,3436647,0,11468973,100.0,R,1006,399.0,150.0,american express,146.0
8,3095681,0,2145214,150.0,R,1006,399.0,150.0,american express,146.0
9,3021401,0,850730,23.203,C,1007,555.0,135.0,mastercard,224.0
10,3226241,0,5651177,55.164,C,1007,555.0,135.0,mastercard,224.0


# Run multiple times

In [4]:
"""
    run_julia()

Run the benchmark once: load both CSV files, inner-join them, aggregate, and sort.

Every step is timed with the difference of two `now()` calls. The elapsed seconds are
stored in the returned dictionary and appended to `julia.csv` as one
`timestamp|julia|step|seconds` line per step.

Reads the globals `folder` and `files` to locate the input files.

# Returns
A `Dict{String,Float64}` with the keys `"load_transactions"`, `"load_identity"`,
`"merge"`, `"aggregation"` and `"sort"`.
"""
function run_julia()
    s = Dict{String,Float64}()

    # Seconds between two `DateTime`s; their difference is a millisecond period.
    seconds_between(ts, te) = (te - ts) / Millisecond(1) * (1 / 1000)

    # The do-block form closes the log file even when one of the steps throws.
    open("julia.csv", "a") do f
        # Store one timing under `key` and append the matching CSV line using `label`.
        function record!(key, label, secs)
            push!(s, key => secs)
            stamp = Dates.format(now(), "YYYY-mm-dd HH:MM:SS")
            write(f, string(stamp, "|julia|", label, "|", secs, "\n"))
            return nothing
        end

        # load transactions ~600MB
        ts = now()
        df = CSV.read(joinpath(folder, files[1]), DataFrame)
        te = now()
        record!("load_transactions", "load_transactions", seconds_between(ts, te))

        # load identity ~25MB
        ts = now()
        df2 = CSV.read(joinpath(folder, files[2]), DataFrame)
        te = now()
        record!("load_identity", "load_identity", seconds_between(ts, te))

        # join; `innerjoin` replaces the deprecated `join(...; kind = :inner)` form
        ts = now()
        dff = innerjoin(df, df2, on = "TransactionID")
        te = now()
        record!("merge", "merge", seconds_between(ts, te))

        # group by; the result is only computed for timing and is not kept
        ts = now()
        combine(groupby(dff, ["isFraud", "ProductCD", "card4", "card6", "id_15", "id_31"]),
            :TransactionAmt => maximum => :TransactionAmountMax,
            :TransactionAmt => mean => :TransactionAmountMean)
        te = now()
        record!("aggregation", "aggregation", seconds_between(ts, te))

        # sort in place with three different key orders, timed together
        ts = now()
        sort!(dff, ["card1", "addr1", "D9"])
        sort!(dff, ["addr1", "D9", "card1"])
        sort!(dff, ["D9", "card1", "addr1"])
        te = now()
        # The dictionary key is "sort" while the CSV label is "sorting"; both are kept
        # unchanged so previously written log lines stay comparable.
        record!("sort", "sorting", seconds_between(ts, te))
    end

    return s
end

run_julia (generic function with 1 method)

In [5]:
# Repeat the benchmark seven times; every run appends its timings to the log file.
foreach(_ -> run_julia(), 1:7)

│   caller = ip:0x0
└ @ Core :-1


In [52]:
# Example of one log line: timestamp|julia|step|seconds
"$(Dates.format(now(), "YYYY-mm-dd HH:MM:SS"))|julia|load_identity|$(time_in_sec)"

"2021-01-21 22:56:32|julia|step|6.896"