In [3]:
using Pkg
Pkg.status() # Installed Packages

[32m[1mStatus[22m[39m `C:\Users\Utsav\Utsav\Development\Julia\SpaceshipTitanic\Project.toml`
 [90m [024491cd] [39mBetaML v0.9.6
 [90m [336ed68f] [39mCSV v0.10.9
 [90m [a93c6f00] [39mDataFrames v1.5.0
 [90m [31c24e10] [39mDistributions v0.25.86
 [90m [5789e2e9] [39mFileIO v1.16.0
 [90m [7073ff75] [39mIJulia v1.24.0
 [90m [82e4d734] [39mImageIO v0.6.6
 [90m [6218d12a] [39mImageMagick v1.2.2
 [90m [916415d5] [39mImages v0.25.2
 [90m [add582a8] [39mMLJ v0.19.1
 [90m [c6f25543] [39mMLJDecisionTreeInterface v0.4.0
 [90m [094fc8d1] [39mMLJFlux v0.2.9
 [90m [5ae90465] [39mMLJScikitLearnInterface v0.3.0
 [90m [91a5bcdd] [39mPlots v1.38.8
 [90m [d330b81b] [39mPyPlot v2.11.1
 [90m [321657f4] [39mScientificTypes v3.0.2
 [90m [3646fa90] [39mScikitLearn v0.7.0
 [90m [f3b207a7] [39mStatsPlots v0.15.4
 [90m [239c3e63] [39mVega v2.3.1
 [90m [6385f0a0] [39mWordCloud v0.10.8


In [4]:
# Necessary Imports

using Random
using CSV
using DataFrames
using Plots
using Plots.PlotMeasures
using StatsPlots
# using WordCloud
using Images, FileIO
using Vega
using Statistics
using MLJ
using BetaML
# using ScikitLearn
# using MLJDecisionTreeInterface

In [5]:
# Inputs

# Unnecessary Columns
UNNECESSARY_COLS = ["PassengerId","Name"]

# Handling Missing Value 
MISSING_STRATEGY = "median"
MISSING_COLS = ["Age", "FoodCourt", "ShoppingMall", "Spa", "VRDeck" ,"RoomService"]

# Handling Categorical Features
CATEGORICAL_COLS = ["HomePlanet", "CryoSleep","Cabin", "Destination" ,"VIP"]

5-element Vector{String}:
 "HomePlanet"
 "CryoSleep"
 "Cabin"
 "Destination"
 "VIP"

In [6]:
# Config

BASE_PATH = "../data"
TRAIN_PATH = "$BASE_PATH/train.csv"
TEST_PATH = "$BASE_PATH/test.csv"
SUB_PATH = "$BASE_PATH/sample_submission.csv"


"../data/sample_submission.csv"

In [7]:
train = CSV.File(TRAIN_PATH) |> DataFrame
test = CSV.File(TEST_PATH) |> DataFrame
sub = CSV.File(SUB_PATH) |> DataFrame
print("Done")

Done

In [8]:
function data_imputer_strategy(strategy)
    if (strategy == "mean") 
        return Statistics.mean
    elseif(strategy == "median")
        return Statistics.median
    elseif(strategy == "mode")
        return Statistics.mode
    end
end

data_imputer_strategy (generic function with 1 method)

In [9]:
TARGET = "Transported"
MISSING = "Z"
RANDOM_STATE = 42

42

In [10]:
function dropping_unnecessary_cols(X_train,X_test)
    for col in UNNECESSARY_COLS 
        if col in names(X_train)
            select!(X_train,Not(col))
        end
        if col in names(X_test)
            select!(X_test,Not(col))
        end
    end
    return X_train,X_test
end

function missing_value_handler(X_train,X_test,cols)    
    other_cols = filter(x -> ~(x in cols),names(X_train))
    simple_imputer = @load SimpleImputer pkg=BetaML
    imputer = simple_imputer(statistic=data_imputer_strategy(MISSING_STRATEGY))
    X_train2 = select(X_train,cols) |> MLJ.matrix
    X_test2 = select(X_test,cols) |> MLJ.matrix
    (fitResults,_,_) = MLJ.fit(imputer,0,X_train2)
    
    
    X_train_imputed = MLJ.transform(imputer,fitResults,X_train2) |> DataFrame
    X_test_imputed = MLJ.transform(imputer,fitResults,X_test2) |> DataFrame
    rename!(X_train_imputed,cols)
    rename!(X_test_imputed,cols)

    # for col in cols 
    #     X_train[!,col] = X_train_imputed[!,col]
    #     X_test[!,col] = X_test_imputed[!,col]
    # end
    
    X_train = DataFrame(hcat(select(X_train,other_cols) |> Matrix, X_train_imputed |> Matrix),:auto)
    X_test = DataFrame(hcat(select(X_test,other_cols) |> Matrix, X_test_imputed |> Matrix),:auto)
    rename!(X_train,[other_cols...,cols...])
    rename!(X_test,[other_cols...,cols...])


    X_train[!,cols] = convert.(Float64,X_train_imputed[!,cols])
    X_test[!,cols] = convert.(Float64,X_test_imputed[!,cols])

    return X_train,X_test
end

function categorical_feature_handler(X_train,X_test,cols)
    for col in cols 
        X_train[!,col] = replace!(X_train[!,col], missing => MISSING)
        X_test[!,col] = replace!(X_test[!,col], missing => MISSING)
    end  
    return X_train,X_test      
end

function label_encoding_handler(X_train,X_test,cols) 
    function gen_label_encoder() 
        label_encoder = Dict()
        for col in cols 
            col_mapper = Dict()
            idx = 0
            for val in unique(X_train[!,col])
                col_mapper[val] = idx
                idx+=1
            end
            label_encoder[col] = col_mapper
        end
        label_encoder
    end

    label_encoder = gen_label_encoder()

    for col in cols
        # X_train[!,col] = map(x -> (x===missing) ? missing : get(label_encoder[col],x,-1),X_train[!,col])
        # X_test[!,col] = map(x -> (x===missing) ? missing : get(label_encoder[col],x,-1),X_test[!,col])

        X_train[!,col] = map(x -> get(label_encoder[col],x,-1),X_train[!,col])
        X_test[!,col] = map(x -> get(label_encoder[col],x,-1),X_test[!,col])

    end
    
    X_train,X_test
end

function partition_data(y)
    fraction = 0.8
    rng = RANDOM_STATE
    shuffle = true

    train_idx, test_idx = MLJ.partition(eachindex(y[!,TARGET]), 0.8,shuffle=true,rng=RANDOM_STATE)

    train_idx, test_idx
end

function add_necessary_data_types(train,y_train,test) 
    map = []

    for col in names(train)
        if col in CATEGORICAL_COLS
            push!(map, col => Multiclass)
        else 
            push!(map, col => Continuous)
        end
    end

    # println(map...)
    
    train = coerce(train,map...)
    test = coerce(test,map...)
    # y_train = coerce(y_train,TARGET => Count)
    y_train = coerce(y_train,TARGET => Multiclass)
    train,y_train,test 
end

function preprocessor(train,test,sub) 
    X_train = select(train,Not(TARGET))
    y_train = select(train,TARGET)
    X_train,test = dropping_unnecessary_cols(X_train,test)
    X_train,test = missing_value_handler(X_train,test,MISSING_COLS)
    X_train,test = categorical_feature_handler(X_train,test,CATEGORICAL_COLS)
    X_train,test = label_encoding_handler(X_train,test,CATEGORICAL_COLS)
    X_train,y_train,test = add_necessary_data_types(X_train,y_train,test)
    # X_train,y_train,X_test,y_test = partition_data(X_train,y_train)
    # y_train = y_train[!,TARGET]

    X_train,y_train,test,sub
end

# X_train,y_train,X_test,y_test,test,sub = preprocessor(train,test,sub) 
X_train,y_train,test,sub = preprocessor(train,test,sub)
train_idx, test_idx = partition_data(y_train)
y_train = y_train[!,TARGET]
println(size(train_idx),size(test_idx),size(test),size(sub))
print("Done!")

┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\Utsav\.julia\packages\MLJModels\OJDDo\src\loading.jl:159


import BetaML ✔


(6954,)(1739,)(4277, 11)(4277, 2)
Done!

In [50]:
sub

Row,PassengerId,Transported
Unnamed: 0_level_1,String7,Bool
1,0013_01,true
2,0018_01,false
3,0019_01,true
4,0021_01,true
5,0023_01,true
6,0027_01,true
7,0029_01,true
8,0032_01,true
9,0032_02,true
10,0033_01,false


In [11]:
MLJ.models(matching(X_train,y_train))

4-element Vector{NamedTuple{(:name, :package_name, :is_supervised, :abstract_type, :deep_properties, :docstring, :fit_data_scitype, :human_name, :hyperparameter_ranges, :hyperparameter_types, :hyperparameters, :implemented_methods, :inverse_transform_scitype, :is_pure_julia, :is_wrapper, :iteration_parameter, :load_path, :package_license, :package_url, :package_uuid, :predict_scitype, :prediction_type, :reporting_operations, :reports_feature_importances, :supports_class_weights, :supports_online, :supports_training_losses, :supports_weights, :transform_scitype, :input_scitype, :target_scitype, :output_scitype)}}:
 (name = ConstantClassifier, package_name = MLJModels, ... )
 (name = DecisionTreeClassifier, package_name = BetaML, ... )
 (name = DeterministicConstantClassifier, package_name = MLJModels, ... )
 (name = RandomForestClassifier, package_name = BetaML, ... )

In [12]:
test_results=Dict()

Dict{Any, Any}()

In [13]:
# @doc MLJDecisionTreeInterface.DecisionTreeClassifier

Random.seed!(RANDOM_STATE)
model = @load DecisionTreeClassifier pkg=DecisionTree
model = machine(model(),X_train,y_train)
eval_results=evaluate!(model,rows=train_idx, resampling=CV(nfolds=10,shuffle=true,rng=RANDOM_STATE), measures=[MLJ.accuracy],operation=predict_mode)
y_hat = predict_mode(model, rows=test_idx)
test_results["DecisionTree"] = MLJ.accuracy(y_hat, y_train[test_idx])
eval_results

import MLJDecisionTreeInterface

┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\Utsav\.julia\packages\MLJModels\OJDDo\src\loading.jl:159


 ✔




│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc DecisionTree.DecisionTreeClassifier` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{6561}}, AbstractVector{Multiclass{4}}, AbstractVector{Multiclass{3}}}}, AbstractVector{Multiclass{2}}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:Union{AbstractVector{<:Continuous}, AbstractVector{<:Count}, AbstractVector{<:OrderedFactor}}}, AbstractVector{<:Finite}}
└ @ MLJBase C:\Users\Utsav

┌ Info: Creating subsamples from a subset of all rows. 
└ @ MLJBase C:\Users\Utsav\.julia\packages\MLJBase\WKVEo\src\resampling.jl:601


[33mEvaluating over 10 folds:  20%[=====>                   ]  ETA: 0:00:38[39m[K






PerformanceEvaluation object with these fields:
  measure, operation, measurement, per_fold,
  per_observation, fitted_params_per_fold,
  report_per_fold, train_test_rows
Extract:
┌────────────┬──────────────┬─────────────┬─────────┬───────────────────────────
│[22m measure    [0m│[22m operation    [0m│[22m measurement [0m│[22m 1.96*SE [0m│[22m per_fold                [0m ⋯
├────────────┼──────────────┼─────────────┼─────────┼───────────────────────────
│ Accuracy() │ predict_mode │ 0.717       │ 0.0179  │ [0.71, 0.721, 0.741, 0.7 ⋯
└────────────┴──────────────┴─────────────┴─────────┴───────────────────────────
[36m                                                                1 column omitted[0m


In [14]:
# @doc MLJDecisionTreeInterface.RandomForestClassifier

Random.seed!(RANDOM_STATE)
model = @load RandomForestClassifier pkg=DecisionTree
model = machine(model(),X_train,y_train)
eval_results=evaluate!(model,rows=train_idx, resampling=CV(nfolds=10,shuffle=true,rng=RANDOM_STATE), measures=[MLJ.accuracy],operation=predict_mode)
y_hat = predict_mode(model, rows=test_idx)
test_results["RandomForest"] =  MLJ.accuracy(y_hat, y_train[test_idx])
eval_results

import MLJDecisionTreeInterface ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\Utsav\.julia\packages\MLJModels\OJDDo\src\loading.jl:159


│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc DecisionTree.RandomForestClassifier` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{6561}}, AbstractVector{Multiclass{4}}, AbstractVector{Multiclass{3}}}}, AbstractVector{Multiclass{2}}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:Union{AbstractVector{<:Continuous}, AbstractVector{<:Count}, AbstractVector{<:OrderedFactor}}}, AbstractVector{<:Finite}}
└ @ MLJBase C:\Users\Utsav

[33mEvaluating over 10 folds:  20%[=====>                   ]  ETA: 0:00:10[39m[K

















PerformanceEvaluation object with these fields:
  measure, operation, measurement, per_fold,
  per_observation, fitted_params_per_fold,
  report_per_fold, train_test_rows
Extract:
┌────────────┬──────────────┬─────────────┬─────────┬───────────────────────────
│[22m measure    [0m│[22m operation    [0m│[22m measurement [0m│[22m 1.96*SE [0m│[22m per_fold                [0m ⋯
├────────────┼──────────────┼─────────────┼─────────┼───────────────────────────
│ Accuracy() │ predict_mode │ 0.784       │ 0.0146  │ [0.796, 0.793, 0.799, 0. ⋯
└────────────┴──────────────┴─────────────┴─────────┴───────────────────────────
[36m                                                                1 column omitted[0m


In [15]:
# @doc MLJScikitLearnInterface.ExtraTreesClassifier

Random.seed!(RANDOM_STATE)
model = @load ExtraTreesClassifier
model = machine(model(),X_train,y_train)
eval_results=evaluate!(model,rows=train_idx, resampling=CV(nfolds=10,shuffle=true,rng=RANDOM_STATE), measures=[MLJ.accuracy],operation=predict_mode)
y_hat = predict_mode(model, rows=test_idx)
test_results["ExtraTrees"] =  MLJ.accuracy(y_hat, y_train[test_idx])
eval_results

┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\Utsav\.julia\packages\MLJModels\OJDDo\src\loading.jl:159


import MLJScikitLearnInterface ✔


│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc ScikitLearn.ExtraTreesClassifier` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{6561}}, AbstractVector{Multiclass{4}}, AbstractVector{Multiclass{3}}}}, AbstractVector{Multiclass{2}}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:AbstractVector{<:Continuous}}, AbstractVector{<:Finite}}
└ @ MLJBase C:\Users\Utsav\.julia\packages\MLJBase\WKVEo\src\machines.jl:230


┌ Info: Creating subsamples from a subset of all rows. 
└ @ MLJBase C:\Users\Utsav\.julia\packages\MLJBase\WKVEo\src\resampling.jl:601


[33mEvaluating over 10 folds:  20%[=====>                   ]  ETA: 0:00:17[39m[K

















PerformanceEvaluation object with these fields:
  measure, operation, measurement, per_fold,
  per_observation, fitted_params_per_fold,
  report_per_fold, train_test_rows
Extract:
┌────────────┬──────────────┬─────────────┬─────────┬───────────────────────────
│[22m measure    [0m│[22m operation    [0m│[22m measurement [0m│[22m 1.96*SE [0m│[22m per_fold                [0m ⋯
├────────────┼──────────────┼─────────────┼─────────┼───────────────────────────
│ Accuracy() │ predict_mode │ 0.779       │ 0.0142  │ [0.784, 0.792, 0.793, 0. ⋯
└────────────┴──────────────┴─────────────┴─────────┴───────────────────────────
[36m                                                                1 column omitted[0m


In [16]:
# @doc MLJScikitLearnInterface.LogisticClassifier

Random.seed!(RANDOM_STATE)
model = @load LogisticClassifier pkg=ScikitLearn
model = machine(model(),X_train,y_train)
eval_results=evaluate!(model,rows=train_idx, resampling=CV(nfolds=10,shuffle=true,rng=RANDOM_STATE), measures=[MLJ.accuracy],operation=predict_mode)
y_hat = predict_mode(model, rows=test_idx)
test_results["Logistic"] =  MLJ.accuracy(y_hat, y_train[test_idx])
eval_results

import MLJScikitLearnInterface

┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\Utsav\.julia\packages\MLJModels\OJDDo\src\loading.jl:159


 ✔




│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc ScikitLearn.LogisticClassifier` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{6561}}, AbstractVector{Multiclass{4}}, AbstractVector{Multiclass{3}}}}, AbstractVector{Multiclass{2}}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:AbstractVector{<:Continuous}}, AbstractVector{<:Finite}}
└ @ MLJBase C:\Users\Utsav\.julia\packages\MLJBase\WKVEo\src\machines.jl:230
┌ Info: Creating su

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[33mEvaluating over 10 folds:  20%[=====>                   ]  ETA: 0:00:02[39m[K

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alte

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


PerformanceEvaluation object with these fields:
  measure, operation, measurement, per_fold,
  per_observation, fitted_params_per_fold,
  report_per_fold, train_test_rows
Extract:
┌────────────┬──────────────┬─────────────┬─────────┬───────────────────────────
│[22m measure    [0m│[22m operation    [0m│[22m measurement [0m│[22m 1.96*SE [0m│[22m per_fold                [0m ⋯
├────────────┼──────────────┼─────────────┼─────────┼───────────────────────────
│ Accuracy() │ predict_mode │ 0.781       │ 0.0147  │ [0.802, 0.795, 0.774, 0. ⋯
└────────────┴──────────────┴─────────────┴─────────┴───────────────────────────
[36m                                                                1 column omitted[0m


In [17]:
# @doc MLJScikitLearnInterface.KNeighborsClassifier

Random.seed!(RANDOM_STATE)
model = @load KNeighborsClassifier
model = machine(model(),X_train,y_train)
eval_results=evaluate!(model,rows=train_idx, resampling=CV(nfolds=10,shuffle=true,rng=RANDOM_STATE), measures=[MLJ.accuracy],operation=predict_mode)
y_hat = predict_mode(model, rows=test_idx)
test_results["KNeighbors"] =  MLJ.accuracy(y_hat, y_train[test_idx])
eval_results

import MLJScikitLearnInterface ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\Utsav\.julia\packages\MLJModels\OJDDo\src\loading.jl:159


│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc ScikitLearn.KNeighborsClassifier` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{6561}}, AbstractVector{Multiclass{4}}, AbstractVector{Multiclass{3}}}}, AbstractVector{Multiclass{2}}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:AbstractVector{<:Continuous}}, AbstractVector{<:Finite}}
└ @ MLJBase C:\Users\Utsav\.julia\packages\MLJBase\WKVEo\src\machines.jl:230


┌ Info: Creating subsamples from a subset of all rows. 
└ @ MLJBase C:\Users\Utsav\.julia\packages\MLJBase\WKVEo\src\resampling.jl:601
[33mEvaluating over 10 folds:  20%[=====>                   ]  ETA: 0:00:04[39m[K







PerformanceEvaluation object with these fields:
  measure, operation, measurement, per_fold,
  per_observation, fitted_params_per_fold,
  report_per_fold, train_test_rows
Extract:
┌────────────┬──────────────┬─────────────┬─────────┬───────────────────────────
│[22m measure    [0m│[22m operation    [0m│[22m measurement [0m│[22m 1.96*SE [0m│[22m per_fold                [0m ⋯
├────────────┼──────────────┼─────────────┼─────────┼───────────────────────────
│ Accuracy() │ predict_mode │ 0.759       │ 0.0136  │ [0.741, 0.777, 0.76, 0.7 ⋯
└────────────┴──────────────┴─────────────┴─────────┴───────────────────────────
[36m                                                                1 column omitted[0m


In [18]:
# @doc MLJScikitLearnInterface.AdaBoostClassifier

Random.seed!(RANDOM_STATE)
model = @load AdaBoostClassifier
model = machine(model(),X_train,y_train)
eval_results=evaluate!(model,rows=train_idx, resampling=CV(nfolds=10,shuffle=true,rng=RANDOM_STATE), measures=[MLJ.accuracy],operation=predict_mode)
y_hat = predict_mode(model, rows=test_idx)
test_results["AdaBoost"] =  MLJ.accuracy(y_hat, y_train[test_idx])
eval_results

import MLJScikitLearnInterface ✔

┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\Utsav\.julia\packages\MLJModels\OJDDo\src\loading.jl:159





│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc ScikitLearn.AdaBoostClassifier` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{6561}}, AbstractVector{Multiclass{4}}, AbstractVector{Multiclass{3}}}}, AbstractVector{Multiclass{2}}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:AbstractVector{<:Continuous}}, AbstractVector{<:Finite}}
└ @ MLJBase C:\Users\Utsav\.julia\packages\MLJBase\WKVEo\src\machines.jl:230


┌ Info: Creating subsamples from a subset of all rows. 
└ @ MLJBase C:\Users\Utsav\.julia\packages\MLJBase\WKVEo\src\resampling.jl:601


[33mEvaluating over 10 folds:  20%[=====>                   ]  ETA: 0:00:05[39m[K

















PerformanceEvaluation object with these fields:
  measure, operation, measurement, per_fold,
  per_observation, fitted_params_per_fold,
  report_per_fold, train_test_rows
Extract:
┌────────────┬──────────────┬─────────────┬─────────┬───────────────────────────
│[22m measure    [0m│[22m operation    [0m│[22m measurement [0m│[22m 1.96*SE [0m│[22m per_fold                [0m ⋯
├────────────┼──────────────┼─────────────┼─────────┼───────────────────────────
│ Accuracy() │ predict_mode │ 0.792       │ 0.0141  │ [0.784, 0.792, 0.818, 0. ⋯
└────────────┴──────────────┴─────────────┴─────────┴───────────────────────────
[36m                                                                1 column omitted[0m


In [19]:
# @doc MLJScikitLearnInterface.GradientBoostingClassifier

Random.seed!(RANDOM_STATE)
model = @load GradientBoostingClassifier
model = machine(model(),X_train,y_train)
eval_results=evaluate!(model,rows=train_idx, resampling=CV(nfolds=10,shuffle=true,rng=RANDOM_STATE), measures=[MLJ.accuracy],operation=predict_mode)
y_hat = predict_mode(model, rows=test_idx)
test_results["GradientBoosting"] =  MLJ.accuracy(y_hat, y_train[test_idx])
eval_results

import MLJScikitLearnInterface ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\Utsav\.julia\packages\MLJModels\OJDDo\src\loading.jl:159


│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc ScikitLearn.GradientBoostingClassifier` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{6561}}, AbstractVector{Multiclass{4}}, AbstractVector{Multiclass{3}}}}, AbstractVector{Multiclass{2}}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:AbstractVector{<:Continuous}}, AbstractVector{<:Finite}}
└ @ MLJBase C:\Users\Utsav\.julia\packages\MLJBase\WKVEo\src\machines.jl:230
┌ Info: Cre

[33mEvaluating over 10 folds:  20%[=====>                   ]  ETA: 0:00:10[39m[K

















PerformanceEvaluation object with these fields:
  measure, operation, measurement, per_fold,
  per_observation, fitted_params_per_fold,
  report_per_fold, train_test_rows
Extract:
┌────────────┬──────────────┬─────────────┬─────────┬───────────────────────────
│[22m measure    [0m│[22m operation    [0m│[22m measurement [0m│[22m 1.96*SE [0m│[22m per_fold                [0m ⋯
├────────────┼──────────────┼─────────────┼─────────┼───────────────────────────
│ Accuracy() │ predict_mode │ 0.795       │ 0.0155  │ [0.796, 0.797, 0.799, 0. ⋯
└────────────┴──────────────┴─────────────┴─────────┴───────────────────────────
[36m                                                                1 column omitted[0m


In [20]:
sort(collect(test_results),by = x->x[2])

7-element Vector{Pair{Any, Any}}:
     "DecisionTree" => 0.7303047728579644
       "KNeighbors" => 0.750431282346176
         "Logistic" => 0.7763082231167338
       "ExtraTrees" => 0.7918343875790684
         "AdaBoost" => 0.7935595169637722
     "RandomForest" => 0.7941345600920069
 "GradientBoosting" => 0.8050603795284647

In [21]:
#checking fitting parameters for possible best model
report(model)

In [22]:
y_sub = MLJ.predict(model,test)
y_sub

4277-element CategoricalDistributions.UnivariateFiniteVector{Multiclass{2}, Bool, UInt32, Float64}:
 UnivariateFinite{Multiclass{2}}(false=>0.365, true=>0.635)
 UnivariateFinite{Multiclass{2}}(false=>0.894, true=>0.106)
 UnivariateFinite{Multiclass{2}}(false=>0.115, true=>0.885)
 UnivariateFinite{Multiclass{2}}(false=>0.156, true=>0.844)
 UnivariateFinite{Multiclass{2}}(false=>0.472, true=>0.528)
 UnivariateFinite{Multiclass{2}}(false=>0.398, true=>0.602)
 UnivariateFinite{Multiclass{2}}(false=>0.115, true=>0.885)
 UnivariateFinite{Multiclass{2}}(false=>0.132, true=>0.868)
 UnivariateFinite{Multiclass{2}}(false=>0.115, true=>0.885)
 UnivariateFinite{Multiclass{2}}(false=>0.53, true=>0.47)
 ⋮
 UnivariateFinite{Multiclass{2}}(false=>0.495, true=>0.505)
 UnivariateFinite{Multiclass{2}}(false=>0.359, true=>0.641)
 UnivariateFinite{Multiclass{2}}(false=>0.0982, true=>0.902)
 UnivariateFinite{Multiclass{2}}(false=>0.437, true=>0.563)
 UnivariateFinite{Multiclass{2}}(false=>0.365, true=>0.635

In [24]:
# function load_model(name)
#     model = missing
#     if (name == "DecisionTree" )
#         model = @load DecisionTreeClassifier pkg=DecisionTree
#         # model = model()
#     elseif (name == "RandomForest")
#         model = @load RandomForestClassifier pkg=DecisionTree
#         # model = model()
#     elseif (name == "ExtraTrees")
#         model = @load ExtraTreesClassifier pkg=ScikitLearn
#         # model = model()
#     elseif (name == "Logistic")
#         model = @load LogisticClassifier pkg=ScikitLearn
#         # model = model()
#     elseif (name == "KNeighbors")
#         model = @load KNeighborsClassifier pkg=ScikitLearn
#         # model = model()
#     elseif (name == "AdaBoost")
#         model = @load AdaBoostClassifier pkg=ScikitLearn
#         # model = model()
#     elseif (name == "GradientBoosting")
#         model = @load GradientBoostingClassifier pkg=ScikitLearn
#         # model = model()
#     else 
#         throw(DomainError(name,"Invalid Model Name"))
#     end
#     model
# end

In [34]:
function load_model(name)
    model = missing
    if name == "DecisionTree" 
        model = @load DecisionTreeClassifier pkg=DecisionTree
        # model = model()
    elseif name == "RandomForest"
        model = @load RandomForestClassifier pkg=DecisionTree
        # model = model()
    elseif name == "ExtraTrees"
        model = @load ExtraTreesClassifier pkg=ScikitLearn
        # model = model()
    elseif name == "Logistic"
        model = @load LogisticClassifier pkg=ScikitLearn
        # model = model()
    elseif name == "KNeighbors"
        model = @load KNeighborsClassifier pkg=ScikitLearn
        # model = model()
    elseif name == "AdaBoost"
        model = @load AdaBoostClassifier pkg=ScikitLearn
        # model = model()
    elseif name == "GradientBoosting"
        model = @load GradientBoostingClassifier pkg=ScikitLearn
        # model = model()
    else 
        throw(DomainError(name,"Invalid Model Name"))
    end
    model
end

function setup_model(name,X,y)
    # model_preprocessing()
    model = load_model(name)
    model = machine(model(),X,y)
    model
end

setup_model (generic function with 1 method)

predict (generic function with 1 method)

In [58]:
function predict(model,test,sub)
    y_sub = MLJ.predict(model,test)
    sub[!,TARGET] = map(x -> x ? "True" : "False",pdf.(y_sub,true) .>= 0.5)
    sub
end

function evaluate(X_train,y_train,test,sub)
    train_results = Dict()
    test_results = Dict()

    all_models = [
        "DecisionTree",
        "RandomForest",
        "ExtraTrees",
        "Logistic",
        "KNeighbors",
        "AdaBoost",
        "GradientBoosting"
    ]

    # train_idx, test_idx = partition_data(y_train)
    # y_train = y_train[!,TARGET]

    for model_name in all_models 
        model = setup_model(model_name,X_train,y_train)
        eval_results=evaluate!(model,rows=train_idx, resampling=CV(nfolds=10,shuffle=true,rng=RANDOM_STATE), measures=[MLJ.accuracy],operation=predict_mode)
        test_results[model_name] = MLJ.accuracy(predict_mode(model, rows=test_idx), y_train[test_idx])
        train_results[model_name] = MLJ.accuracy(predict_mode(model, rows=train_idx), y_train[train_idx])
        println("$model_name 10 Folds :\n $(eval_results.per_fold)\n\n\n")
        sub = predict(model,test,sub)
        CSV.write("../output/$model_name.csv",sub)
    end

    train_results,test_results
end

train_results,test_results = evaluate(X_train,y_train,test,sub)

import MLJDecisionTreeInterface ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\Utsav\.julia\packages\MLJModels\OJDDo\src\loading.jl:159
│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc DecisionTree.DecisionTreeClassifier` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{6561}}, AbstractVector{Multiclass{4}}, AbstractVector{Multiclass{3}}}}, AbstractVector{Multiclass{2}}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:Union{AbstractVector{









DecisionTree 10 Folds :
 [[0.7183908045977012, 0.7126436781609196, 0.742816091954023, 0.7227011494252873, 0.7309352517985612, 0.7352517985611511, 0.7050359712230216, 0.7093525179856115, 0.6474820143884892, 0.7482014388489209]]





import MLJDecisionTreeInterface ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\Utsav\.julia\packages\MLJModels\OJDDo\src\loading.jl:159
│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc DecisionTree.RandomForestClassifier` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{6561}}, AbstractVector{Multiclass{4}}, AbstractVector{Multiclass{3}}}}, AbstractVector{Multiclass{2}}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:Union{AbstractVector{

┌ Info: Creating subsamples from a subset of all rows. 
└ @ MLJBase C:\Users\Utsav\.julia\packages\MLJBase\WKVEo\src\resampling.jl:601


[33mEvaluating over 10 folds:  20%[=====>                   ]  ETA: 0:00:05[39m[K

















RandomForest 10 Folds :
 [[0.7873563218390804, 0.7945402298850575, 0.7887931034482758, 0.771551724137931, 0.8086330935251799, 0.781294964028777, 0.7611510791366907, 0.7741007194244605, 0.7467625899280576, 0.8302158273381295]]





import MLJScikitLearnInterface ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\Utsav\.julia\packages\MLJModels\OJDDo\src\loading.jl:159
│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc ScikitLearn.ExtraTreesClassifier` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{6561}}, AbstractVector{Multiclass{4}}, AbstractVector{Multiclass{3}}}}, AbstractVector{Multiclass{2}}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:AbstractVector{<:Continu

┌ Info: Creating subsamples from a subset of all rows. 
└ @ MLJBase C:\Users\Utsav\.julia\packages\MLJBase\WKVEo\src\resampling.jl:601
[33mEvaluating over 10 folds:  20%[=====>                   ]  ETA: 0:00:06[39m[K

















ExtraTrees 10 Folds :
 [[0.7916666666666666, 0.7844827586206897, 0.7945402298850575, 0.7787356321839081, 0.8057553956834532, 0.7769784172661871, 0.7467625899280576, 0.7726618705035971, 0.7424460431654676, 0.8100719424460432]]





import MLJScikitLearnInterface ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\Utsav\.julia\packages\MLJModels\OJDDo\src\loading.jl:159
│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc ScikitLearn.LogisticClassifier` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{6561}}, AbstractVector{Multiclass{4}}, AbstractVector{Multiclass{3}}}}, AbstractVector{Multiclass{2}}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:AbstractVector{<:Continuou



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alte

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic 10 Folds :
 [[0.8017241379310345, 0.7945402298850575, 0.7744252873563219, 0.7844827586206897, 0.8129496402877698, 0.7827338129496403, 0.7654676258992805, 0.7597122302158273, 0.7366906474820144, 0.7956834532374101]]



import MLJScikitLearnInterface ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\Utsav\.julia\packages\MLJModels\OJDDo\src\loading.jl:159
│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc ScikitLearn.KNeighborsClassifier` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{6561}}, AbstractVector{Multiclass{4}}, AbstractVector{Multiclass{3}}}}, AbstractVector{Multiclass{2}}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:AbstractVector{<:Continu







KNeighbors 10 Folds :
 [[0.7413793103448276, 0.7772988505747127, 0.7600574712643678, 0.7385057471264368, 0.7654676258992805, 0.7798561151079136, 0.743884892086331, 0.7597122302158273, 0.7309352517985612, 0.7956834532374101]]





import MLJScikitLearnInterface ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\Utsav\.julia\packages\MLJModels\OJDDo\src\loading.jl:159
│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc ScikitLearn.AdaBoostClassifier` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{6561}}, AbstractVector{Multiclass{4}}, AbstractVector{Multiclass{3}}}}, AbstractVector{Multiclass{2}}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:AbstractVector{<:Continuou

[33mEvaluating over 10 folds:  20%[=====>                   ]  ETA: 0:00:02[39m[K

















AdaBoost 10 Folds :
 [[0.7844827586206897, 0.7916666666666666, 0.8175287356321839, 0.7959770114942528, 0.8129496402877698, 0.8086330935251799, 0.7654676258992805, 0.7899280575539569, 0.7496402877697842, 0.8071942446043165]]



import MLJScikitLearnInterface ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\Utsav\.julia\packages\MLJModels\OJDDo\src\loading.jl:159
│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc ScikitLearn.GradientBoostingClassifier` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{6561}}, AbstractVector{Multiclass{4}}, AbstractVector{Multiclass{3}}}}, AbstractVector{Multiclass{2}}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:AbstractVector{<:C

┌ Info: Creating subsamples from a subset of all rows. 
└ @ MLJBase C:\Users\Utsav\.julia\packages\MLJBase\WKVEo\src\resampling.jl:601
[33mEvaluating over 10 folds:  20%[=====>                   ]  ETA: 0:00:07[39m[K

















GradientBoosting 10 Folds :
 [[0.7959770114942528, 0.7974137931034483, 0.7988505747126436, 0.7959770114942528, 0.8287769784172662, 0.8, 0.7741007194244605, 0.7870503597122303, 0.743884892086331, 0.823021582733813]]





(Dict{Any, Any}("RandomForest" => 0.9749784296807593, "KNeighbors" => 0.8143514524014955, "DecisionTree" => 0.9744032211676733, "ExtraTrees" => 0.9805867126833477, "AdaBoost" => 0.7952257693413862, "Logistic" => 0.7766752947943629, "GradientBoosting" => 0.8120506183491516), Dict{Any, Any}("RandomForest" => 0.7929844738355376, "KNeighbors" => 0.750431282346176, "DecisionTree" => 0.7366302472685451, "ExtraTrees" => 0.78953421506613, "AdaBoost" => 0.7935595169637722, "Logistic" => 0.7763082231167338, "GradientBoosting" => 0.8050603795284647))

In [59]:
sort(collect(test_results),by=x->x[2])

7-element Vector{Pair{Any, Any}}:
     "DecisionTree" => 0.7366302472685451
       "KNeighbors" => 0.750431282346176
         "Logistic" => 0.7763082231167338
       "ExtraTrees" => 0.78953421506613
     "RandomForest" => 0.7929844738355376
         "AdaBoost" => 0.7935595169637722
 "GradientBoosting" => 0.8050603795284647

In [60]:
sort(collect(train_results),by=x->x[2])

7-element Vector{Pair{Any, Any}}:
         "Logistic" => 0.7766752947943629
         "AdaBoost" => 0.7952257693413862
 "GradientBoosting" => 0.8120506183491516
       "KNeighbors" => 0.8143514524014955
     "DecisionTree" => 0.9744032211676733
     "RandomForest" => 0.9749784296807593
       "ExtraTrees" => 0.9805867126833477