In [77]:
using StatsKit
using MLJ
using AlgebraOfGraphics, CairoMakie
import BetaML
set_aog_theme!()
ENV["DATAFRAMES_ROWS"] = 8

8

In [89]:
function processpassengerData(passengerData::DataFrame)::DataFrame
    passengerData = select(passengerData, Not([:Name]))
    passengerData.RoomDeck = [
        String(deck[1]) for deck in split.(passengerData.Cabin, "/")
    ] |> categorical
    passengerData.RoomSide = [
        String(deck[3]) for deck in split.(passengerData.Cabin, "/")
    ] |> categorical
    passengerData.TotalSpent = passengerData.RoomService + passengerData.FoodCourt + passengerData.ShoppingMall + passengerData.Spa + passengerData.VRDeck
    passengerData = select(passengerData, Not(:Cabin))
    return passengerData
end

passengerData = CSV.read("data/train.csv", DataFrame) |>
                processpassengerData

MethodError: MethodError: no method matching split(::Missing, ::String)

Closest candidates are:
  split(!Matched::T, ::Any; limit, keepempty) where T<:AbstractString
   @ Base strings/util.jl:601


In [79]:
transported_deck = AlgebraOfGraphics.data(passengerData) *
                   frequency() *
                   mapping(:Transported, color=:VIP)
draw(transported_deck)

MethodError: MethodError: findall(::AlgebraOfGraphics.var"#8#9", ::Vector{Any}) is ambiguous.

Candidates:
  findall(el::T, cont::Array{T}; returnTuple) where T
    @ BetaML.Utils ~/.julia/packages/BetaML/V0tAN/src/Utils/Processing.jl:71
  findall(testf::F, A::AbstractArray) where F<:Function
    @ Base array.jl:2359
  findall(testf::Function, A)
    @ Base array.jl:2355

Possible fix, define
  findall(::F, ::Array{T}) where {F<:Function, T<:Function}


#### Statistical Approach

In [80]:
fn = @formula(Transported ~ VIP + CryoSleep + Age + RoomService)
logitModel = glm(fn, passengerData, Bernoulli(), LogitLink())

function measureAccuracy(model, passengerData::DataFrame)::Float64
    predictions = GLM.predict(model, passengerData)
    predictions = [
        if x < 0.5
            false
        else
            true
        end for x in predictions
    ]
    prediction_df = DataFrame(y_actual=passengerData.Transported, y_predicted=predictions, prob_predicted=predictions)
    prediction_df.correctly_classified = prediction_df.y_actual .== prediction_df.y_predicted
    accuracy = mean(prediction_df.correctly_classified)

    return accuracy

end

acc = measureAccuracy(logitModel, passengerData)

0.7201029367241901

#### Machine Learning Approach

In [81]:
passengerData = MLJ.coerce!(passengerData,
    MLJ.autotype(passengerData)
)

y, X = MLJ.unpack(passengerData, ==(:Transported), !=([:PassengerID, :RoomService, :FoodCourt, :ShoppingMall, :Spa, :VRDeck]), colname -> true)
train, test = MLJ.partition(eachindex(y), 0.7; shuffle=true)
Tree = BetaML.DecisionTreeClassifier()
mach = machine(Tree, X, y; scitype_check_level=0)
MLJ.fit!(mach, rows=train)
pred = MLJ.predict(mach, X[test, :])
yhat = StatsBase.mode.(pred)
MLJ.accuracy(yhat, y[test])


┌ Info: Training machine(DecisionTreeClassifier(max_depth = 0, …), …).
└ @ MLJBase /Users/zachary.drake/.julia/packages/MLJBase/5cxU0/src/machines.jl:492


0.7885973763874874

#### Predictions

In [82]:
predictData = CSV.read("data/test.csv", DataFrame) |>
              dropmissing |>
              processpassengerData

pred = MLJ.predict(mach, predictData)
predictData.Transported = StatsBase.mode.(pred)

select!(predictData, :PassengerId, :Transported)

CSV.write("output/submission.csv", predictData)

"output/submission.csv"

In [84]:
predictData

Row,PassengerId,Transported
Unnamed: 0_level_1,String7,Cat…
1,0013_01,true
2,0018_01,false
3,0019_01,true
4,0021_01,true
⋮,⋮,⋮
3278,9265_01,true
3279,9266_01,true
3280,9266_02,true
3281,9277_01,true
