# double_parallel_example.jl
# make sure the local project environment is activated
using Pkg
Pkg.activate(".")
# Symbolic pipeline composition with parallel pipeline search
using AutoMLPipeline
using Distributed
using DataFrames
# add worker processes that inherit this project's environment
nprocs() == 1 && addprocs(exeflags=["--project=$(Base.active_project())"])
workers()
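# Optional sanity check (an addition, not part of the original example):
# every worker should report the same active project, so the @everywhere
# loads below resolve identical package versions on all processes.
for pid in workers()
    wproj = remotecall_fetch(Base.active_project, pid)
    @assert wproj == Base.active_project() "worker $pid activated $wproj"
end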
# disable Python warnings on all workers
@everywhere import PythonCall
@everywhere const PYC=PythonCall
@everywhere warnings = PYC.pyimport("warnings")
@everywhere warnings.filterwarnings("ignore")
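# (The filters above run on every worker; without them, scikit-learn's
# deprecation and convergence warnings can flood the output of the
# parallel search below.)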
@sync @everywhere using AutoMLPipeline
@sync @everywhere using DataFrames
# get data
begin
  data = getprofb()
  X = data[:, 2:end]
  Y = data[:, 1] |> Vector
end
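# Quick peek at the inputs (an added check, not in the original script):
# column 1 is the class label; the remaining columns mix categorical and
# numeric features, which is why the pipelines below branch on catf/numf.
first(X, 5)
unique(Y)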
#### feature selectors
catf = CatFeatureSelector();
numf = NumFeatureSelector();
# one-hot encoder
ohe = OneHotEncoder();
#### feature scalers
rb = SKPreprocessor("RobustScaler");
pt = SKPreprocessor("PowerTransformer");
mx = SKPreprocessor("MinMaxScaler");
std = SKPreprocessor("StandardScaler");
norm = SKPreprocessor("Normalizer");
#### feature extractors
pca = SKPreprocessor("PCA", Dict(:autocomponent => true));
ica = SKPreprocessor("FastICA", Dict(:autocomponent => true));
fa = SKPreprocessor("FactorAnalysis", Dict(:autocomponent => true));
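# :autocomponent => true lets the wrapper choose the number of components
# automatically instead of requiring it to be set in advance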
#### Learners
rf = SKLearner("RandomForestClassifier", Dict(:impl_args => Dict(:n_estimators => 10)));
gb = SKLearner("GradientBoostingClassifier");
lsvc = SKLearner("LinearSVC");
mlp = SKLearner("MLPClassifier");
stack = StackEnsemble();
rbfsvc = SKLearner("SVC");
ada = SKLearner("AdaBoostClassifier");
vote = VoteEnsemble();
best = BestLearner();
tree = PrunedTree();
sgd = SKLearner("SGDClassifier");
noop = Identity(Dict(:name => "Noop"));
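# StackEnsemble, VoteEnsemble, and BestLearner are AutoMLPipeline's built-in
# ensembles; PrunedTree is its decision-tree learner. Identity passes data
# through unchanged, which makes "no scaling"/"no extraction" selectable
# options in the grid search below.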
#### pipeline demos: build up from single transformers to full pipelines
# categorical features only
pipe = @pipeline catf;
pred = fit_transform!(pipe, X, Y)
# categorical features, one-hot encoded
pipe = @pipeline catf |> ohe;
pred = fit_transform!(pipe, X, Y)
# numeric features only
pipe = @pipeline numf;
pred = fit_transform!(pipe, X, Y)
# numeric features, normalized
pipe = @pipeline numf |> norm;
pred = fit_transform!(pipe, X, Y)
# + concatenates the two feature branches column-wise
pipe = @pipeline (numf |> norm) + (catf |> ohe);
pred = fit_transform!(pipe, X, Y)
# feed the combined features to a random forest and evaluate it
pipe = @pipeline (numf |> norm) + (catf |> ohe) |> rf;
pred = fit_transform!(pipe, X, Y)
crossvalidate(pipe, X, Y)
# same features with an SGD classifier
pipe = @pipeline (numf |> norm) + (catf |> ohe) |> sgd;
crossvalidate(pipe, X, Y)
# two differently scaled PCA branches plus the categorical block, ending in a decision tree
pipe = @pipeline (numf |> norm |> pca) + (numf |> rb |> pca) + (catf |> ohe) |> tree;
crossvalidate(pipe, X, Y)
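# crossvalidate prints per-fold results and returns summary statistics; the
# metric and fold count can also be passed explicitly, as the search below
# does (a usage sketch reusing the last pipeline):
crossvalidate(pipe, X, Y, "accuracy_score", 5)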
# Parallel search for optimal pipelines
function prpsearch()
  learners = [rf, ada, sgd, tree, rbfsvc, lsvc, gb]
  scalers = [rb, pt, norm, std, mx, noop]
  extractors = [pca, ica, fa, noop]
  # nested @distributed loops with a (vcat) reducer fan the full
  # scaler x extractor x learner grid out to the workers and gather
  # every result row into one DataFrame
  dftable = @sync @distributed (vcat) for lr in learners
    @distributed (vcat) for sc in scalers
      @distributed (vcat) for xt in extractors
        pipe = @pipeline (catf |> ohe) + (numf |> sc |> xt) |> lr
        # drop the random 4-character suffix (e.g. "_abc") that
        # AutoMLPipeline appends to each component's name
        scn = sc.name[1:end - 4]
        xtn = xt.name[1:end - 4]
        lrn = lr.name[1:end - 4]
        pname = "$scn |> $xtn |> $lrn"
        ptime = @elapsed begin
          mean, sd, kfold, _ = crossvalidate(pipe, X, Y, "accuracy_score", 3)
        end
        DataFrame(pipeline=pname, mean=mean, sd=sd, time=ptime, folds=kfold)
      end
    end
  end
  sort!(dftable, :mean, rev=true)
  dftable
end
runtime = @elapsed begin
  df = prpsearch()
end;
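# summing the measured per-pipeline times estimates what a serial run would
# have cost; comparing it with the wall-clock time above shows the speedup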
serialtime = df.time |> sum;
(serialtime = "$(round(serialtime / 60.0)) minutes", paralleltime = "$(round(runtime)) seconds")
# pipeline performances
@show df
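# To keep working with the winner, rebuild and refit the top pipeline on the
# full data (an illustrative follow-up, not part of the original script; the
# rb |> pca |> rf combination is just a stand-in for whatever tops df):
bestpipe = @pipeline (catf |> ohe) + (numf |> rb |> pca) |> rf
fit_transform!(bestpipe, X, Y)
crossvalidate(bestpipe, X, Y, "accuracy_score", 10)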