-
Notifications
You must be signed in to change notification settings - Fork 28
/
Copy pathsingle_parallel_example.jl
107 lines (86 loc) · 2.88 KB
/
single_parallel_example.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# activate local env
using Pkg
Pkg.activate(".")
# load packages/modules
using Base.Threads
using Distributed
using CSV
using DataFrames
using AutoMLPipeline
using Random
# Launch worker processes only when the session has none yet. Workers are
# started with the currently active project passed via `--project`.
if nprocs() == 1
    addprocs(exeflags=["--project=$(Base.active_project())"])
end
workers()
# Import PythonCall everywhere and tell Python's `warnings` module to ignore
# all warnings on each process.
@everywhere import PythonCall
@everywhere const PYC = PythonCall
@everywhere warnings = PYC.pyimport("warnings")
@everywhere warnings.filterwarnings("ignore")
# Load the table and pipeline packages on every process as well.
@everywhere using DataFrames
@everywhere using AutoMLPipeline
# Fetch the profb table via getprofb(). Column 1 becomes the vector Y and the
# remaining columns (2:end) become the table X; both are passed to every
# pipeline call below.
profbdata = getprofb()
X = profbdata[:, 2:end];
Y = Vector(profbdata[:, 1]);
"""
    topdf(x, n::Integer=5)

Return the first `n` items of `x` by delegating to `first(x, n)`.

`n` defaults to 5, so the one-argument call `topdf(x)` still yields the
first five items.
"""
topdf(x, n::Integer=5) = first(x, n)
# Preview the first five entries of the raw table; the value is only displayed
# when the script is stepped through interactively.
topdf(profbdata)
#### Scalers
# One SKPreprocessor is built per name, in the listed order, and bound to the
# matching variable on the left.
rb, pt, norm, mx, std = SKPreprocessor.([
    "RobustScaler",
    "PowerTransformer",
    "Normalizer",
    "MinMaxScaler",
    "StandardScaler",
])
# Used as the first stage of the parallel comparison pipeline further down.
# NOTE(review): presumably distinguishes categorical from numeric columns — confirm.
disc = CatNumDiscriminator();
#### Categorical preprocessing
ohe = OneHotEncoder();
#### Column selectors: categorical features and numeric features
catf, numf = CatFeatureSelector(), NumFeatureSelector();
# Feature-extraction filters
#### Decomposition
# Each algorithm is declared twice: an a-prefixed variant constructed with the
# `:autocomponent => true` option, then a variant with default options.
apca, pca = (
    SKPreprocessor("PCA", Dict(:autocomponent => true)),
    SKPreprocessor("PCA"),
);
afa, fa = (
    SKPreprocessor("FactorAnalysis", Dict(:autocomponent => true)),
    SKPreprocessor("FactorAnalysis"),
);
aica, ica = (
    SKPreprocessor("FastICA", Dict(:autocomponent => true)),
    SKPreprocessor("FastICA"),
);
#### Learners
# Estimators wrapped by name through SKLearner are interleaved with learners
# built from their own constructors; construction order matches the original.
rf = SKLearner(
    "RandomForestClassifier",
    Dict(:impl_args => Dict(:n_estimators => 10)),  # n_estimators set to 10
);
gb, lsvc, mlp = SKLearner.([
    "GradientBoostingClassifier",
    "LinearSVC",
    "MLPClassifier",
]);
jrf = RandomForest();
stack = StackEnsemble();
rbfsvc, ada = SKLearner.(["SVC", "AdaBoostClassifier"]);
vote = VoteEnsemble();
best = BestLearner();
tree = PrunedTree()
sgd = SKLearner("SGDClassifier");
# Categorical chain: select categorical columns, one-hot encode them, apply
# factor analysis, then feed the linear SVC; evaluated with cross-validation.
pohe = @pipeline catf |> ohe |> fa |> lsvc;
crossvalidate(pohe, X, Y)
# Numeric columns with a single transform per branch (unigrams): a pca branch
# and an ica branch joined with `+`.
pdec = @pipeline (numf |> pca) + (numf |> ica);
tr = fit_transform!(pdec, X, Y)
# Numeric columns with two transforms per branch (bigrams): rb followed by ica,
# and pt followed by pca, joined with `+`.
ppt = @pipeline (numf |> rb |> ica) + (numf |> pt |> pca);
tr = fit_transform!(ppt, X, Y)
# Random-forest baseline: one-hot encoded categorical branches plus the raw
# numeric columns, fed to rf.
# NOTE(review): the (catf |> ohe) branch appears twice — confirm the duplicate
# is intentional.
rfp1 = @pipeline ((catf |> ohe) + (catf |> ohe) + numf) |> rf;
crossvalidate(rfp1, X, Y)
# AdaBoost on two joined blocks. The parentheses spell out how Julia parses the
# expression (`+` binds tighter than `|>`): rb and ica are applied to the joined
# one-hot + numeric block, not to numf alone.
# NOTE(review): confirm this grouping is the intended one.
rfp2 = @pipeline ((((catf |> ohe) + numf) |> rb |> ica) + (numf |> pt |> pca)) |> ada;
crossvalidate(rfp2, X, Y)
# Linear SVC on four joined blocks: three rb-scaled numeric branches (pca, fa,
# ica) and the one-hot encoded categorical branch.
plsvc = @pipeline (
    (numf |> rb |> pca) + (numf |> rb |> fa) + (numf |> rb |> ica) + (catf |> ohe)
) |> lsvc;
pred = fit_transform!(plsvc, X, Y)
crossvalidate(plsvc, X, Y)
# Compare several learners behind one shared preprocessing chain, with the
# loop iterations distributed over the available processes.
# The candidate list has its own name: previously `learners` was rebound from
# a vector of learners to the result DataFrame, so this section could not be
# re-run in the same session and the name changed meaning midway.
candidates = [jrf, ada, sgd, tree, lsvc]
learners = @distributed (vcat) for learner in candidates
    # disc -> (one-hot categorical + numeric) -> rb -> pca -> learner
    pcmc = @pipeline disc |> ((catf |> ohe) + (numf)) |> rb |> pca |> learner
    println(learner.name)
    # Score with "accuracy_score"; the first two returned values are kept as
    # mean and sd, the third is discarded.
    # NOTE(review): the trailing 10 is presumably the number of folds — confirm.
    # The locals are prefixed so they do not shadow function names such as `mean`.
    acc_mean, acc_sd, _ = crossvalidate(pcmc, X, Y, "accuracy_score", 10)
    # One-row table per learner; the `vcat` reducer stacks the rows.
    DataFrame(name=learner.name, mean=acc_mean, sd=acc_sd)
end
# Order the results by mean score, highest first.
sort!(learners, :mean, rev=true)
@show learners;