In [2]:
using DataFrames
using CSV
import Base.Iterators: flatten, zip
import IterTools.subsets

In [3]:
struct Rule
    ant::Array{Int64,1}
    con::Array{Int64,1}
end

In [4]:
Rule([1], [2,3])

Rule([1], [2, 3])

# Data load

In [5]:
onigiri_data = CSV.read("onigiri.csv", delim=",")
onigiri_array = Matrix(onigiri_data)

5×5 Array{Int64,2}:
 0  1  1  1  0
 1  1  0  0  0
 1  0  0  1  1
 0  1  1  1  0
 0  1  0  1  0

In [6]:
function GTI(mat::Array{Int64,2}, indexes::Array{Int64,1})
    return mat[:,indexes] .==1
end

GTI (generic function with 1 method)

In [7]:
[2]

1-element Array{Int64,1}:
 2

In [8]:
GTI(onigiri_array,[2])

5×1 BitArray{2}:
 1
 1
 0
 1
 1

In [9]:
onigiri_array

5×5 Array{Int64,2}:
 0  1  1  1  0
 1  1  0  0  0
 1  0  0  1  1
 0  1  1  1  0
 0  1  0  1  0

In [162]:
function support(array_2d::Array{Int64,2}, indexes::Array{Int64,1}; m="num")
    gti_b = GTI(array_2d, indexes)
    if size(gti_b)[2] ==0
        return 0
    end
    b = all(gti_b, dims=2)
    @show length(b)
    if m =="num"
        return sum(b)
    elseif m == "ratio"
        return sum(b)/length(b)
    elseif m=="bool"
        return b
    end
end

            
    
        
            

support (generic function with 1 method)

In [11]:
function confidence(array_2d::Array{Int64,2}, 
        X_indexes::Array{Int64,1}, Y_indexes::Array{Int64,1})
    sup_X = support(array_2d, X_indexes)
    X_Y_indexes = cat(X_indexes, Y_indexes, dims=1)
    return support(array_2d, X_Y_indexes)/sup_X
end

confidence (generic function with 1 method)

In [12]:
function getF1(array_2D::Array{Int64,2}, minsup::Float64)
    return [[col] for col in 1:size(array_2D)[2] if support(array_2D, [col], m="ratio") >= minsup]
end

getF1 (generic function with 1 method)

In [13]:
getF1(onigiri_array, 0.4)

4-element Array{Array{Int64,1},1}:
 [1]
 [2]
 [3]
 [4]

In [122]:
function getFkPlusOne(array_2D::Array{Int64,2}, indexes::Array{Array{Int64,1},1}, minsup::Float64)
    return [col for col in indexes if support(array_2D, col, m="ratio") >= minsup]
                
end

getFkPlusOne (generic function with 1 method)

In [123]:
function getCkPlusOne(prevCandidate::Array{Array{Int64, 1}, 1}, k)
    @assert all(length.(prevCandidate) .==  k-1)
    @assert k>1
    items = unique(collect(flatten(prevCandidate)))
    tmp_candidates = [x for x in subsets(items, k)]
    if k ==2
        return tmp_candidates
    end
    
    candidates = [
        candidate for candidate in tmp_candidates
        if all(
                x in prevCandidate
                for x in subsets(candidate, k-1))
    ]
                
    return candidates
                
end

getCkPlusOne (generic function with 1 method)

In [124]:
function isEmpty(F::Array{Array{Int64,1},1})
    if length(F) < 1
        return true
    else
        return false
    end
end

isEmpty (generic function with 1 method)

In [125]:
function isCalcConfNeeded(array_prev_ant::Array{Array{Int64,1},1},
                    array_ant::Array{Array{Int64,1},1}, set_f::Array{Int64,1})
    array_prev_con = [setdiff(set_f,  set_c) for set_c in array_ant]
    array_con = [setdiff(set_f, set_c) for set_c in array_ant]
    
    out = []
    for (a,c) in zip(array_ant, array_con)
        out_inner = []
        for i in 1:length(c)

            array_ant_candidate = a
            cand = c[i]
            array_candidate_ant = vcat(a, cand)
            array_candidate_con = filter(x ->x != cand, c)
            
            res = any([issubset(array_candidate_ant, i) for i in array_prev_ant])
            append!(out_inner, res)
        end
        if all(out_inner)
            append!(out, true)
        else
            append!(out, false)
        end
    end
    
    out = convert(Array{Bool, 1}, out)

    return out
end

                
            

isCalcConfNeeded (generic function with 1 method)

In [143]:
function frequent(array_2D::Array{Int64,2}; minsum::Float64)
    k = 1
    F_now = getF1(array_2D, minsum)
    F_list = []
    append!(F_list, [F_now])
    
    while(true)
        C_next = getCkPlusOne(F_now, k+1)
        F_next = getFkPlusOne(array_2D, C_next, minsum)
        
        if isEmpty(F_next)
            break
        end
        k += 1
        F_now = F_next
        append!(F_list, [F_now])
    end
    
    F_list = convert(Array{Array{Array{Int64,1},1},1}, F_list)
    
    return F_list
end

frequent (generic function with 2 methods)

In [144]:
_F_list = frequent(onigiri_array, minsum=0.4)

1-element Array{Array{Array{Int64,1},1},1}:
 []

In [154]:
function find_rules(array_2D::Array{Int64, 2}, 
        F_list::Array{Array{Array{Int64,1},1},1}; minconf::Float64)
    conf_list = []
    for F in F_list
        k = length(F[1])
        
        if k == 1
            #conf_list = vcat(conf_list, Any{[Rule([0],[0])]}) # DUMMY!!
        
        elseif k == 2
            conf_list_k = []
            for f_2 in F
                A = f_2[1]
                B = f_2[2]
                conf_AB = confidence(array_2D, [A], [B])
                if conf_AB >= minconf
                    #append!(conf_list_k, Rule([A],[B]))
                    conf_list_k = vcat(conf_list_k, Rule([A],[B]))

                end
                conf_BA = confidence(array_2D, [B], [A])
                if conf_BA >= minconf
                    #append!(conf_list_k, Rule([B],[A]))
                    conf_list_k = vcat(conf_list_k, Rule([B],[A]))
                end
            end
            append!(conf_list, [conf_list_k])   


        elseif k >= 3
            conf_list_k = []
            for f_k in F
                
                j = 1
                
                array_antecedent =  collect(subsets(f_k, k-1))
                array_consequent = [setdiff(f_k,  set_c) for set_c in array_antecedent]
                conf = [confidence(array_2D, ant, con) for (ant, con) in zip(array_antecedent, array_consequent)]
                isHigher = conf .>= minconf
                if sum(isHigher) > 0
                    array_antecedent_filtered_by_conf = array_antecedent[isHigher]
                    array_consequent_filtered_by_conf = array_consequent[isHigher]
                    append!(conf_list_k, [Rule(a,c) for (a,c) in zip(array_antecedent_filtered_by_conf,
                                                                                            array_consequent_filtered_by_conf)])
                    
                    while(j < k-1)
                        array_antecedent_new = collect(subsets(f_k, k-(j+1)))
                        _res = isCalcConfNeeded(array_antecedent_filtered_by_conf, array_antecedent_new, f_k)
                        if sum(_res) > 0
                            array_antecedent_filtered_by_prev = array_antecedent_new[_res]
                            array_consequent_filtered_by_prev = [setdiff(f_k,  set_c) 
                                                                                            for set_c in array_antecedent_filtered_by_prev]
                            conf = [confidence(array_2D, ant, con) for (ant, con) in zip(array_antecedent_filtered_by_prev, 
                                                                                                                                array_consequent_filtered_by_prev)]
                            isHigher = conf .>= minconf
                            if sum(isHigher) > 0
                                array_antecedent_filtered_by_prev_and_conf = array_antecedent_filtered_by_prev[isHigher]
                                array_consequent_filtered_by_prev_and_conf = array_consequent_filtered_by_prev[isHigher]
                                append!(conf_list_k, [Rule(a,c) for (a,c) in zip(array_antecedent_filtered_by_prev_and_conf, 
                                                                                                         array_consequent_filtered_by_prev_and_conf)])
                            end
                        end
                        j += 1
                    end #while
                end
            end
            append!(conf_list, [conf_list_k])
        end
    end
    conf_list = convert(Array{Array{Rule,1},1}, conf_list)
    return conf_list
end
        
            
                    
        
                    
                        
            

find_rules (generic function with 2 methods)

In [129]:
find_rules(onigiri_array, _F_list, 0.7)

BoundsError: BoundsError: attempt to access 0-element Array{Array{Int64,1},1} at index [1]

In [130]:
store_data = CSV.read("store_data_trans.csv", delim=",")
store_array = Matrix(store_data)
store_array

7501×120 Array{Int64,2}:
 0  1  1  0  1  0  0  0  0  0  0  0  0  …  0  0  0  1  0  0  1  0  0  1  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  1  0  0  0  0  0  0  0  0     0  0  1  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  1  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  1  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0 

In [132]:
minsum = 0.007
F_list = frequent(store_array, minsum)

3-element Array{Array{Array{Int64,1},1},1}:
 [[2], [3], [5], [7], [8], [9], [10], [11], [13], [14]  …  [110], [111], [112], [114], [115], [116], [117], [118], [119], [120]]                                                                                         
 [[2, 73], [5, 26], [5, 44], [5, 72], [5, 73], [5, 101], [13, 38], [13, 44], [13, 73], [13, 101]  …  [93, 101], [98, 101], [98, 109], [100, 101], [101, 107], [101, 109], [101, 111], [101, 116], [101, 117], [111, 117]]
 [[73, 101, 109], [26, 44, 72], [26, 44, 101], [26, 72, 101]]                                                                                                                                                            

In [133]:
minconf = 0.01
find_rules(store_array, F_list, minconf)

conf_list = Any[Any[Rule([2], [73]), Rule([73], [2]), Rule([5], [26]), Rule([26], [5]), Rule([5], [44]), Rule([44], [5]), Rule([5], [72]), Rule([72], [5]), Rule([5], [73]), Rule([73], [5]), Rule([5], [101]), Rule([101], [5]), Rule([13], [38]), Rule([38], [13]), Rule([13], [44]), Rule([44], [13]), Rule([13], [73]), Rule([73], [13]), Rule([13], [101]), Rule([101], [13]), Rule([16], [18]), Rule([18], [16]), Rule([16], [26]), Rule([26], [16]), Rule([16], [38]), Rule([38], [16]), Rule([16], [41]), Rule([41], [16]), Rule([16], [44]), Rule([44], [16]), Rule([16], [49]), Rule([49], [16]), Rule([16], [50]), Rule([50], [16]), Rule([16], [55]), Rule([55], [16]), Rule([16], [56]), Rule([56], [16]), Rule([16], [72]), Rule([72], [16]), Rule([16], [73]), Rule([73], [16]), Rule([16], [83]), Rule([83], [16]), Rule([16], [98]), Rule([98], [16]), Rule([16], [101]), Rule([101], [16]), Rule([16], [109]), Rule([109], [16]), Rule([16], [111]), Rule([111], [16]), Rule([16], [117]), Rule([117], [16]), Rule([17




2-element Array{Array{Rule,1},1}:
 [Rule([2], [73]), Rule([73], [2]), Rule([5], [26]), Rule([26], [5]), Rule([5], [44]), Rule([44], [5]), Rule([5], [72]), Rule([72], [5]), Rule([5], [73]), Rule([73], [5])  …  Rule([101], [109]), Rule([109], [101]), Rule([101], [111]), Rule([111], [101]), Rule([101], [116]), Rule([116], [101]), Rule([101], [117]), Rule([117], [101]), Rule([111], [117]), Rule([117], [111])]                                                                                            
 [Rule([73, 101], [109]), Rule([73, 109], [101]), Rule([101, 109], [73]), Rule([73], [101, 109]), Rule([101], [73, 109]), Rule([109], [73, 101]), Rule([26, 44], [72]), Rule([26, 72], [44]), Rule([44, 72], [26]), Rule([26], [44, 72])  …  Rule([44, 101], [26]), Rule([26], [44, 101]), Rule([44], [26, 101]), Rule([101], [26, 44]), Rule([26, 72], [101]), Rule([26, 101], [72]), Rule([72, 101], [26]), Rule([26], [72, 101]), Rule([72], [26, 101]), Rule([101], [26, 72])]

# mnist

In [166]:
mnist_data = CSV.read("mnist_8x8.csv", delim=",")
mnist_array = Matrix(store_data)
mnist_array = convert(Array{Int64, 2}, mnist_array)

897×64 Array{Int64,2}:
 0  0  0  1  1  0  0  0  0  0  0  1  1  …  1  0  0  0  0  0  0  1  1  1  0  0
 0  0  0  0  1  1  0  0  0  0  0  1  1     1  1  0  0  0  0  0  0  1  1  1  0
 0  0  0  1  1  0  0  0  0  1  1  0  1     0  1  1  0  0  0  0  1  1  1  0  0
 0  0  0  0  1  0  0  0  0  0  0  0  1     1  1  0  0  0  0  0  0  1  0  0  0
 0  0  1  1  0  0  0  0  0  0  1  1  1     1  1  0  0  0  0  1  1  1  1  0  0
 0  0  0  1  1  0  0  0  0  0  0  1  1  …  1  1  1  0  0  0  0  1  1  1  0  0
 0  0  0  1  1  1  1  0  0  0  0  0  0     0  0  0  0  0  0  1  0  0  0  0  0
 0  0  1  1  1  0  0  0  0  0  1  1  1     0  1  1  0  0  0  1  1  1  1  0  0
 0  0  1  1  0  0  0  0  0  0  1  1  1     1  1  0  0  0  0  1  1  1  0  0  0
 0  0  0  1  1  1  0  0  0  0  1  1  1     1  1  0  0  0  0  0  1  1  0  0  0
 0  0  0  0  1  1  0  0  0  0  0  0  1  …  1  1  0  0  0  0  0  0  1  1  0  0
 0  0  0  1  0  0  0  0  0  0  1  1  0     1  1  1  0  0  0  0  1  1  1  1  0
 0  0  1  1  1  1  0  0  0  0  1  1  1   

In [167]:
size(mnist_array)

(897, 64)

# Julia

In [163]:
support(mnist_array, [3], m="ratio")

length(b) = 897


0.2463768115942029

In [158]:
@time F_list = frequent(mnist_array, minsum=0.3)

  3.775753 seconds (6.54 M allocations: 668.203 MiB, 4.88% gc time)


6-element Array{Array{Array{Int64,1},1},1}:
 [[3], [4], [5], [6], [11], [12], [13], [14], [19], [20]  …  [46], [51], [52], [53], [54], [55], [59], [60], [61], [62]]                                                                                                                                                                                                                                                                                                                         
 [[3, 4], [3, 11], [3, 12], [3, 13], [3, 59], [3, 60], [4, 5], [4, 6], [4, 11], [4, 12]  …  [53, 61], [53, 62], [54, 60], [54, 61], [54, 62], [55, 61], [59, 60], [60, 61], [60, 62], [61, 62]]                                                                                                                                                                                                                                                  
 [[3, 4, 11], [3, 4, 13], [3, 4, 59], [3, 4, 60], [3, 11, 59], [3, 11, 60], [3, 12, 60],

In [156]:
@time find_rules(mnist_array, F_list, minconf=0.1)

  1.829713 seconds (6.75 M allocations: 2.984 GiB, 13.66% gc time)


6-element Array{Array{Rule,1},1}:
 [Rule([4], [5]), Rule([5], [4]), Rule([4], [11]), Rule([11], [4]), Rule([4], [12]), Rule([12], [4]), Rule([4], [13]), Rule([13], [4]), Rule([4], [14]), Rule([14], [4])  …  Rule([54], [61]), Rule([61], [54]), Rule([54], [62]), Rule([62], [54]), Rule([60], [61]), Rule([61], [60]), Rule([60], [62]), Rule([62], [60]), Rule([61], [62]), Rule([62], [61])]                                                                                                                                                                                                                                                                                                                                                                                    
 [Rule([4, 5], [11]), Rule([4, 11], [5]), Rule([5, 11], [4]), Rule([4], [5, 11]), Rule([5], [4, 11]), Rule([11], [4, 5]), Rule([4, 5], [12]), Rule([4, 12], [5]), Rule([5, 12], [4]), Rule([4], [5, 12])  …  Rule([61, 62], [54]), Rule([54], [61