In [135]:
using DataFrames
using CSV
using GLM
using CategoricalArrays
using StatsKit
# Display setting read by DataFrames when rendering tables in the notebook;
# presumably caps the number of rows shown (the preview further down elides
# all but six rows) — confirm against the DataFrames display documentation.
ENV["DATAFRAMES_ROWS"] = 6

6

In [136]:
"""
    pipeline(data::DataFrame) -> DataFrame

Return a feature-engineered copy of `data`. All work is done on the frame
returned by `select`, so the caller's frame is left untouched.

Steps, in order:
1. Drop the `PassengerId` column.
2. Split each `Cabin` entry on `"/"` and store its first three components as
   the categorical columns `RoomDeck`, `RoomNum` and `RoomSide`.
3. Add `TotalSpent`, the element-wise sum of `RoomService`, `FoodCourt`,
   `ShoppingMall`, `Spa` and `VRDeck`.
4. Drop the `Cabin` column.
5. Convert every remaining column whose element type is an `AbstractString`
   into a categorical column, keeping the column names.

A `Cabin` entry with fewer than three `/`-separated parts raises a
`BoundsError`; `missing` entries are not supported by the split.
"""
function pipeline(data::DataFrame)::DataFrame
    data = select(data, Not(:PassengerId))

    # Split every cabin string once and reuse the pieces for all three columns.
    cabinparts = split.(data.Cabin, "/")
    data.RoomDeck = categorical([String(parts[1]) for parts in cabinparts])
    # NOTE: the room number stays categorical (one level per distinct value).
    data.RoomNum = categorical([String(parts[2]) for parts in cabinparts])
    data.RoomSide = categorical([String(parts[3]) for parts in cabinparts])

    # Fused broadcast: a single pass over the five spend columns.
    data.TotalSpent =
        data.RoomService .+ data.FoodCourt .+ data.ShoppingMall .+ data.Spa .+ data.VRDeck

    data = select(data, Not(:Cabin))
    data = transform(data, names(data, AbstractString) .=> categorical, renamecols=false)

    return data
end

"""
    measureAccuracy(model, data::DataFrame) -> Float64

Return the fraction of rows in `data` whose `Transported` value matches the
label predicted by `model`.

`data` is the raw (un-engineered) frame: it is passed through `pipeline`
before being handed to `predict`. A row is labelled `true` unless its
prediction is strictly below `0.5`.

# Arguments
- `model`: a fitted model accepted by `predict(model, table)`.
- `data`: frame containing the columns `pipeline` needs plus `Transported`.
"""
function measureAccuracy(model, data::DataFrame)::Float64
    # Keep the raw predictions and the thresholded labels in separate variables
    # so the continuous scores are never silently replaced by Booleans.
    probabilities = predict(model, pipeline(data))

    # Label `true` unless the prediction is strictly below 0.5.
    predicted = .!(probabilities .< 0.5)

    return mean(predicted .== data.Transported)
end

measureAccuracy (generic function with 1 method)

In [137]:
# Load the training set and keep only the rows that have no missing values.
data = dropmissing(CSV.read("data/train.csv", DataFrame))

Row,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
Unnamed: 0_level_1,String7,String7,Bool,String15,String15,Float64,Bool,Float64,Float64,Float64,Float64,Float64,String31,Bool
1,0001_01,Europa,false,B/0/P,TRAPPIST-1e,39.0,false,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,false
2,0002_01,Earth,false,F/0/S,TRAPPIST-1e,24.0,false,109.0,9.0,25.0,549.0,44.0,Juanna Vines,true
3,0003_01,Europa,false,A/0/S,TRAPPIST-1e,58.0,true,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,false
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
6604,9279_01,Earth,false,G/1500/S,TRAPPIST-1e,26.0,false,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,true
6605,9280_01,Europa,false,E/608/S,55 Cancri e,32.0,false,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,false
6606,9280_02,Europa,false,E/608/S,TRAPPIST-1e,44.0,false,126.0,4688.0,0.0,0.0,12.0,Propsh Hontichre,true


In [154]:
# Response: Transported. Predictors: HomePlanet, CryoSleep, and the five spend
# columns with all of their interactions (`a * b` expands to `a + b + a & b`).
# The previous trailing `()` was an empty placeholder, not a valid formula
# term; the `*` chain below matches the fitted formula echoed in the cell
# output.
fn = @formula(
    Transported ~
        HomePlanet + CryoSleep + RoomService * FoodCourt * ShoppingMall * Spa * VRDeck
)

# NOTE(review): Normal() + IdentityLink() fits an ordinary linear model to a
# Boolean response, so predictions are not constrained to [0, 1].
# Binomial() with LogitLink() is the usual choice for a binary outcome —
# confirm which is intended before changing it.
model = glm(fn, pipeline(data), Normal(), IdentityLink())

StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Normal{Float64}, IdentityLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}

Transported ~ 1 + HomePlanet + CryoSleep + RoomService + FoodCourt + ShoppingMall + Spa + VRDeck + RoomService & FoodCourt + RoomService & ShoppingMall + FoodCourt & ShoppingMall + RoomService & Spa + FoodCourt & Spa + ShoppingMall & Spa + RoomService & VRDeck + FoodCourt & VRDeck + ShoppingMall & VRDeck + Spa & VRDeck + RoomService & FoodCourt & ShoppingMall + RoomService & FoodCourt & Spa + RoomService & ShoppingMall & Spa + FoodCourt & ShoppingMall & Spa + RoomService & FoodCourt & VRDeck + RoomService & ShoppingMall & VRDeck + FoodCourt & ShoppingMall & VRDeck + RoomService & Spa & VRDeck + FoodCourt & Spa & VRDeck + ShoppingMall & Spa & VRDeck + RoomService & FoodCourt & ShoppingMall & Spa + RoomService & FoodCourt & ShoppingMall & VRDeck + RoomServ

In [155]:
# Accuracy on `data`, the same frame the model was fitted on, so this is an
# in-sample score rather than a held-out estimate.
measureAccuracy(model, data)

0.4965183166818044