In [5]:
import openml
from openml import datasets, tasks, runs, flows, config, study
from openml.datasets import edit_dataset, fork_dataset, get_dataset
from openml.tasks import TaskType
import os, pandas as pd, sklearn, arff, pprint, numpy as np, seaborn as sns
from sklearn import neighbors

config.apikey = ''
config.server = 'https://www.openml.org/api/v1' # https://test.openml.org/api/v1/xml
config.set_cache_directory(os.path.expanduser('~/openml/cache'))
config.cachedir = '~/openml/cache'

# benchmark suite

In [97]:
# benchmark suite 중에서 우리가 원하는 'OpenML-CC18' 가져오기

benchmark_suite = openml.study.get_suite('OpenML-CC18') # obtain the benchmark suite
benchmark_suite

OpenML Benchmark Suite
ID..............: 99
Name............: OpenML-CC18 Curated Classification benchmark
Status..........: active
Main Entity Type: task
Study URL.......: https://www.openml.org/s/99
# of Data.......: 72
# of Tasks......: 72
Creator.........: https://www.openml.org/u/1
Upload Time.....: 2019-02-21 18:47:13

In [108]:
# benchmark suite에 존재하는 task ids, data ids

print(benchmark_suite.tasks)
print(benchmark_suite.data)
print(benchmark_suite.study_id)
print(benchmark_suite.runs)
print(benchmark_suite.setups)

[3, 6, 11, 12, 14, 15, 16, 18, 22, 23, 28, 29, 31, 32, 37, 43, 45, 49, 53, 219, 2074, 2079, 3021, 3022, 3481, 3549, 3560, 3573, 3902, 3903, 3904, 3913, 3917, 3918, 7592, 9910, 9946, 9952, 9957, 9960, 9964, 9971, 9976, 9977, 9978, 9981, 9985, 10093, 10101, 14952, 14954, 14965, 14969, 14970, 125920, 125922, 146195, 146800, 146817, 146819, 146820, 146821, 146822, 146824, 146825, 167119, 167120, 167121, 167124, 167125, 167140, 167141]
[3, 6, 11, 12, 14, 15, 16, 18, 22, 23, 28, 29, 31, 32, 37, 44, 46, 50, 54, 151, 182, 188, 38, 307, 300, 458, 469, 554, 1049, 1050, 1053, 1063, 1067, 1068, 1590, 4134, 1510, 1489, 1494, 1497, 1501, 1480, 1485, 1486, 1487, 1468, 1475, 1462, 1464, 4534, 6332, 1461, 4538, 1478, 23381, 40499, 40668, 40966, 40982, 40994, 40983, 40975, 40984, 40979, 40996, 41027, 23517, 40923, 40927, 40978, 40670, 40701]
99
None
None


# task (환경)

In [99]:
task_ids = [3, 6, 11, 12]

tasks = openml.tasks.get_tasks(task_ids)
task = tasks[0]
print(task)

features, targets = task.get_X_and_y()
print('\n source data shape \n =========================')
print(features.shape)
print(targets.shape)

OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 3
Task URL.............: https://www.openml.org/t/3
Estimation Procedure.: crossvalidation
Target Feature.......: class
# of Classes.........: 2
Cost Matrix..........: Available

 source data shape 
(3196, 36)
(3196,)


In [109]:
# 원하는 task를 가져올 수 있다

# tasks = openml.tasks.list_tasks(output_format="dataframe") # 46592개
# tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
tasks = openml.tasks.list_tasks(tag="OpenML100", output_format="dataframe") # 91개
# tasks = openml.tasks.list_tasks(data_id=1471, output_format="dataframe") # 24개
# tasks = openml.tasks.list_tasks(task_type_id=1, size=100)
# tasks = openml.tasks.list_tasks(size=10, offset=50, output_format="dataframe")
# tasks = pd.DataFrame.from_dict(tasks, orient="index")

tasks.head()


Unnamed: 0,tid,ttid,did,name,task_type,status,estimation_procedure,source_data,target_feature,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
3,3,TaskType.SUPERVISED_CLASSIFICATION,3,kr-vs-kp,Supervised Classification,active,10-fold Crossvalidation,3,class,1669,3.0,1527,2,37,3196,0,0,0,37
6,6,TaskType.SUPERVISED_CLASSIFICATION,6,letter,Supervised Classification,active,10-fold Crossvalidation,6,class,813,26.0,734,26,17,20000,0,0,16,1
11,11,TaskType.SUPERVISED_CLASSIFICATION,11,balance-scale,Supervised Classification,active,10-fold Crossvalidation,11,class,288,3.0,49,3,5,625,0,0,4,1
12,12,TaskType.SUPERVISED_CLASSIFICATION,12,mfeat-factors,Supervised Classification,active,10-fold Crossvalidation,12,class,200,10.0,200,10,217,2000,0,0,216,1
14,14,TaskType.SUPERVISED_CLASSIFICATION,14,mfeat-fourier,Supervised Classification,active,10-fold Crossvalidation,14,class,200,10.0,200,10,77,2000,0,0,76,1


In [39]:
# 가져 온 task 필터링

filtered_tasks = tasks.query("NumberOfInstances > 500 and NumberOfInstances < 1000")
filtered_tasks = filtered_tasks.query('estimation_procedure == "10-fold Crossvalidation"')
filtered_tasks.head()

Unnamed: 0,tid,ttid,did,name,task_type,status,estimation_procedure,source_data,target_feature,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
11,11,TaskType.SUPERVISED_CLASSIFICATION,11,balance-scale,Supervised Classification,active,10-fold Crossvalidation,11,class,288,3.0,49,3,5,625,0,0,4,1
15,15,TaskType.SUPERVISED_CLASSIFICATION,15,breast-w,Supervised Classification,active,10-fold Crossvalidation,15,Class,458,2.0,241,2,10,699,16,16,9,1
29,29,TaskType.SUPERVISED_CLASSIFICATION,29,credit-approval,Supervised Classification,active,10-fold Crossvalidation,29,class,383,14.0,307,2,16,690,37,67,6,10
37,37,TaskType.SUPERVISED_CLASSIFICATION,37,diabetes,Supervised Classification,active,10-fold Crossvalidation,37,class,500,2.0,268,2,9,768,0,0,8,1
41,41,TaskType.SUPERVISED_CLASSIFICATION,42,soybean,Supervised Classification,active,10-fold Crossvalidation,42,class,92,19.0,8,19,36,683,121,2337,0,36


In [112]:
# 원하는 task를 직접 만들고 publish할 수도 있다

config.server = 'https://test.openml.org/api/v1/xml'

openml.config.start_using_configuration_for_example()

try:
    my_task = openml.tasks.create_task(
        task_type=TaskType.SUPERVISED_CLASSIFICATION,
        dataset_id=128,
        target_name="class",
        evaluation_measure="predictive_accuracy",
        estimation_procedure_id=1,
    )
    my_task.publish()
    
except openml.exceptions.OpenMLServerException as e:
    # Error code for 'task already exists'
    if e.code == 614:
        # Lookup task
        tasks = openml.tasks.list_tasks(data_id=128, output_format="dataframe")
        tasks = tasks.query(
            'task_type == "Supervised Classification" '
            'and estimation_procedure == "10-fold Crossvalidation" '
            'and evaluation_measures == "predictive_accuracy"'
        )
        task_id = tasks.loc[:, "tid"].values[0]
        print("Task already exists. Task ID is", task_id)

# reverting to prod server
openml.config.stop_using_configuration_for_example()


config.server = 'https://www.openml.org/api/v1'

# dataset

In [113]:
# dataset 가져오기

dlist = datasets.list_datasets(output_format='dataframe')
dlist

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
2,2,anneal,1,1,active,ARFF,684.0,7.0,8.0,5.0,39.0,898.0,898.0,22175.0,6.0,33.0
3,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
4,4,labor,1,1,active,ARFF,37.0,3.0,20.0,2.0,17.0,57.0,56.0,326.0,8.0,9.0
5,5,arrhythmia,1,1,active,ARFF,245.0,13.0,2.0,13.0,280.0,452.0,384.0,408.0,206.0,74.0
6,6,letter,1,1,active,ARFF,813.0,26.0,734.0,26.0,17.0,20000.0,0.0,0.0,16.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44192,44192,Diabetes(scikit-learn),13,31897,active,arff,,,,0.0,11.0,442.0,0.0,0.0,11.0,0.0
44193,44193,Weather,18,31897,active,arff,9.0,,5.0,2.0,5.0,14.0,0.0,0.0,2.0,3.0
44194,44194,Diabetes(scikit-learn),14,31897,active,arff,,,,0.0,11.0,442.0,0.0,0.0,11.0,0.0
44195,44195,Weather,19,31897,active,arff,9.0,,5.0,2.0,5.0,14.0,0.0,0.0,2.0,3.0


In [52]:
# dataset 필터링

dlist_filtered = dlist[dlist.NumberOfInstances > 10000]
dlist_filtered = dlist_filtered.query("NumberOfClasses > 50")
dlist_filtered

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
41167,41167,dionis,1,1478,active,ARFF,2469.0,355.0,878.0,355.0,61.0,416188.0,0.0,0.0,60.0,1.0
41169,41169,helena,1,1478,active,ARFF,4005.0,100.0,111.0,100.0,28.0,65196.0,0.0,0.0,27.0,1.0
41960,41960,seattlecrime6,1,9035,active,ARFF,131297.0,144.0,1.0,144.0,8.0,523590.0,3615.0,6916.0,2.0,6.0
41983,41983,CIFAR-100,1,86,active,arff,600.0,,600.0,100.0,3073.0,60000.0,0.0,0.0,3072.0,1.0
42078,42078,beer_reviews,4,5332,active,arff,117586.0,,241.0,104.0,13.0,1586614.0,68136.0,68148.0,9.0,0.0
42087,42087,beer_reviews,5,5332,active,arff,117586.0,,241.0,104.0,13.0,1586614.0,68136.0,68148.0,9.0,0.0
42088,42088,beer_reviews,6,5332,active,arff,117586.0,,241.0,104.0,13.0,1586614.0,68136.0,68148.0,9.0,0.0
42089,42089,vancouver_employee,1,5332,active,arff,117586.0,,241.0,104.0,13.0,1586614.0,68136.0,68148.0,9.0,0.0
42396,42396,aloi,3,2902,active,arff,108.0,,108.0,1000.0,129.0,108000.0,0.0,0.0,128.0,1.0


In [114]:
odata = datasets.get_dataset(42396)
odata

OpenML Dataset
Name..........: aloi
Version.......: 3
Format........: arff
Upload Date...: 2020-04-26 18:36:09
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/21829701/aloi.arff
OpenML URL....: https://www.openml.org/d/42396
# of features.: 129
# of instances: 108000

In [115]:
X, y, attribute_names, _ = odata.get_data(target=odata.default_target_attribute)
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,8,7,5,6,0,1,5,4,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,7,5,6,0,1,5,3,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8,7,5,5,0,1,5,3,0,0,...,0,0,0,0,0,0,0,0,0,1
3,7,6,3,6,0,1,5,4,0,0,...,0,0,0,0,0,0,0,0,0,3
4,8,7,6,5,0,1,5,2,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107995,9,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
107996,9,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
107997,9,0,0,0,0,2,0,0,0,0,...,0,0,0,0,1,0,0,0,1,1
107998,9,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,0,0,0,2


# study (run collection)

In [119]:
# study = run collection

studies = openml.study.list_studies(output_format="dataframe", status="all")
studies.tail(10)

Unnamed: 0,id,alias,main_entity_type,name,status,creation_date,creator
231,231,327d6bad160846539b0630c3514b2aa3,run,test bench,in_preparation,2019-11-16 22:45:33,10468
232,232,bef72849a94e44209fd2906e3ca6d308,run,Test-Study,in_preparation,2019-11-17 00:12:57,10468
241,241,fairml,run,FairML,in_preparation,2020-05-20 18:05:51,1935
272,272,d0a10cb3754f4f3fa08e7ce89a97afd1,run,case_Regression,in_preparation,2021-03-26 04:13:11,22339
273,273,3de0a34046f14f428ffd61140aa82195,run,case_Classifier,in_preparation,2021-03-26 05:12:56,22339
275,275,,run,CC18-Example,in_preparation,2021-08-26 16:08:33,869
276,276,,run,CC18-Example,in_preparation,2021-08-26 17:38:33,869
294,294,12345,run,CC18-Example,in_preparation,2022-06-22 07:25:05,31253
295,295,peng-test,run,CC18-Example,in_preparation,2022-06-30 02:58:26,6127
296,296,splitter,run,Tree splitter test,in_preparation,2022-06-30 03:01:07,6127


In [118]:
study_id = 123

study = openml.study.get_study(study_id)
print('\n study \n', study)
print('\n study description \n', study.description)
print('\n study runs \n', study.runs[:10])


 study 
 OpenML Study
ID..............: 123
Name............: Linear vs. Non Linear
Status..........: active
Main Entity Type: run
Study URL.......: https://www.openml.org/s/123
# of Data.......: 299
# of Tasks......: 299
# of Flows......: 5
# of Runs.......: 1693
Creator.........: https://www.openml.org/u/1
Upload Time.....: 2019-02-21 19:55:30

 study description 
 Comparison of linear and non-linear models.

[Jupyter Notebook](https://github.com/janvanrijn/linear-vs-non-linear/blob/master/notebook/Linear-vs-Non-Linear.ipynb)

 study runs 
 [9199877, 9199878, 9199879, 9199880, 9199881, 9199882, 9199883, 9199884, 9199885, 9199886]


# evaluation

In [126]:
evaluations = openml.evaluations.list_evaluations(
    function="predictive_accuracy", tasks=[3539, 3547], setups=None, flows=None, runs=None, study=study.study_id,
    output_format="dataframe", size=100
    )

evaluations.tail()

Unnamed: 0,run_id,task_id,setup_id,flow_id,flow_name,data_id,data_name,function,upload_time,uploader,uploader_name,value,values,array_data
6,9200518,3547,7130404,7756,sklearn.model_selection._search.RandomizedSear...,455,cars,predictive_accuracy,2018-05-06 11:07:06,3886,Benjamin Strang,0.79803,,
7,9201107,3539,7130967,7725,sklearn.model_selection._search.RandomizedSear...,446,prnn_crabs,predictive_accuracy,2018-05-06 13:03:21,3886,Benjamin Strang,0.695,,
8,9201108,3539,7130968,7725,sklearn.model_selection._search.RandomizedSear...,446,prnn_crabs,predictive_accuracy,2018-05-06 13:03:22,3886,Benjamin Strang,0.93,,
9,9201111,3547,7130971,7725,sklearn.model_selection._search.RandomizedSear...,455,cars,predictive_accuracy,2018-05-06 13:03:26,3886,Benjamin Strang,0.711823,,
10,9201112,3547,7130972,7725,sklearn.model_selection._search.RandomizedSear...,455,cars,predictive_accuracy,2018-05-06 13:03:27,3886,Benjamin Strang,0.805419,,


# flows (모델)

In [161]:
flist = flows.list_flows(size=500, output_format="dataframe", )
flist

Unnamed: 0,id,full_name,name,version,external_version,uploader
1,1,openml.evaluation.EuclideanDistance(1.0),openml.evaluation.EuclideanDistance,1,,1
2,2,openml.evaluation.PolynomialKernel(1.0),openml.evaluation.PolynomialKernel,1,,1
3,3,openml.evaluation.RBFKernel(1.0),openml.evaluation.RBFKernel,1,,1
4,4,openml.evaluation.area_under_roc_curve(1.0),openml.evaluation.area_under_roc_curve,1,,1
5,5,openml.evaluation.average_cost(1.0),openml.evaluation.average_cost,1,,1
...,...,...,...,...,...,...
536,536,weka.Bagging_VotedPerceptron(2),weka.Bagging_VotedPerceptron,2,Weka_3.7.12-SNAPSHOT_10470,2
548,548,weka.BayesNet_GeneticSearch(2),weka.BayesNet_GeneticSearch,2,Weka_3.7.12-SNAPSHOT_10386,2
549,549,weka.GeneticSearch(2),weka.GeneticSearch,2,Weka_3.7.12-SNAPSHOT_10154,2
550,550,weka.BayesNet_HillClimber(2),weka.BayesNet_HillClimber,2,Weka_3.7.12-SNAPSHOT_10386,2


# runs

In [80]:
rlist = runs.list_runs(
    id=None, task=[14951], setup=None, flow=None, study=None,
    size=100, output_format="dataframe"
    )
rlist

Unnamed: 0,run_id,task_id,setup_id,flow_id,uploader,task_type,upload_time,error_message
2414731,2414731,14951,480738,6756,1,TaskType.SUPERVISED_CLASSIFICATION,2017-06-16 10:51:52,
8857787,8857787,14951,6832828,7599,1,TaskType.SUPERVISED_CLASSIFICATION,2018-02-10 18:17:32,
8857789,8857789,14951,6832830,7599,1,TaskType.SUPERVISED_CLASSIFICATION,2018-02-10 18:23:29,
8857791,8857791,14951,6832832,7599,1,TaskType.SUPERVISED_CLASSIFICATION,2018-02-10 18:31:01,
8857793,8857793,14951,6832834,7599,1,TaskType.SUPERVISED_CLASSIFICATION,2018-02-10 18:36:39,
...,...,...,...,...,...,...,...,...
2275741,2275741,14951,29014,5503,2,TaskType.SUPERVISED_CLASSIFICATION,2017-05-19 15:31:19,
2347778,2347778,14951,29019,5503,2,TaskType.SUPERVISED_CLASSIFICATION,2017-05-29 12:20:26,
2347780,2347780,14951,29014,5503,2,TaskType.SUPERVISED_CLASSIFICATION,2017-05-29 12:21:11,
2347784,2347784,14951,29014,5503,2,TaskType.SUPERVISED_CLASSIFICATION,2017-05-29 12:34:15,


In [133]:
scores = []
for idx, row in rlist.iterrows():
    run = runs.get_run(row['run_id'])
    scores.append({"flow":run.flow_name,
                    "score":run.evaluations['area_under_roc_curve']})
pd.DataFrame.from_dict(scores)

Unnamed: 0,flow,score
0,sklearn.tree.tree.ExtraTreeClassifier(2),0.783674
1,sklearn.tree.tree.ExtraTreeClassifier(11),0.788219
2,sklearn.tree.tree.ExtraTreeClassifier(11),0.789927
3,sklearn.tree.tree.ExtraTreeClassifier(11),0.786671
4,sklearn.tree.tree.ExtraTreeClassifier(11),0.791326
...,...,...
95,sklearn.neighbors.classification.KNeighborsCla...,0.994680
96,sklearn.neighbors.classification.KNeighborsCla...,0.980389
97,sklearn.neighbors.classification.KNeighborsCla...,0.994680
98,sklearn.neighbors.classification.KNeighborsCla...,0.994680


## run model on task

In [143]:
task = openml.tasks.get_task(14951)
clf = sklearn.linear_model.LogisticRegression()
run = runs.run_model_on_task(clf, task)
score = run.get_metric_fn(sklearn.metrics.accuracy_score)
myrun = run.publish()

print(myrun)
print("Accuracy: {:.2f}%".format(score.mean()))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

OpenML Run
Uploader Name: None
Metric.......: None
Run ID.......: 10587932
Run URL......: https://www.openml.org/r/10587932
Task ID......: 14951
Task Type....: None
Task URL.....: https://www.openml.org/t/14951
Flow ID......: 19001
Flow Name....: sklearn.linear_model._logistic.LogisticRegression
Flow URL.....: https://www.openml.org/f/19001
Setup ID.....: None
Setup String.: Python_3.6.13. Sklearn_0.24.2. NumPy_1.19.5. SciPy_1.5.4.
Dataset ID...: 1471
Dataset URL..: https://www.openml.org/d/1471
Accuracy: 0.64%


## 남이 해 놓은 실험결과 가져오기 


In [165]:
weka_flows = {
    'SVM': 8666,
    'LoginBoost(REPTree)': 8676,
    'REPTree': 8693,
    'Logistic': 8680,
    'Random Forest': 8690,
    'k-NN': 8682,
#    'NaiveBayes': 8688,   
}
flowid_flowname = {flow_id: flow_name 
                   for flow_name, flow_id in weka_flows.items()}

study = openml.study.get_study(study_id)
measures = ['predictive_accuracy']

evaluations = dict()
for measure in measures:
    evaluations[measure] = openml.evaluations.list_evaluations(measure,                             # 원하는 평가지표
                                                               flows=weka_flows.values(),           # 원하는 모델   
                                                               tasks=study.tasks)                   # 원하는 task
    
records = []
for measure in measures:
    for eid, evaluation in evaluations[measure].items():
        record = {'task_id': evaluation.task_id,
                  'setup_id': evaluation.setup_id,
                  'flow_id': evaluation.flow_id, 
                  'flow_name': flowid_flowname[evaluation.flow_id],
                  'measure': measure,
                  'value': evaluation.value}
        records.append(record)
df = pd.DataFrame(data=records, columns=['task_id', 'setup_id', 'flow_id', 'flow_name', 'measure', 'value'])
df = pd.pivot_table(df, index=['task_id', 'setup_id', 'flow_id', 'flow_name'], columns='measure', values='value')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,measure,predictive_accuracy
task_id,setup_id,flow_id,flow_name,Unnamed: 4_level_1
3,8254068,8666,SVM,0.995620
3,8254093,8693,REPTree,0.992178
3,8254134,8690,Random Forest,0.990926
6,8254093,8693,REPTree,0.877800
11,8254068,8666,SVM,1.000000
...,...,...,...,...
125920,8254093,8693,REPTree,0.580000
125920,8254134,8690,Random Forest,0.554000
125922,8254068,8666,SVM,0.998727
125922,8254093,8693,REPTree,0.936909


# In-depth examples
- https://openml.github.io/openml-python/develop/examples/index.html#sphx-glr-examples

## obtaining flow ids

In [8]:
import sklearn.tree

clf = sklearn.tree.DecisionTreeClassifier()
flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish()
flow_id = flow.flow_id
print(flow_id)
print(flow.name, flow.external_version)

18820
sklearn.tree._classes.DecisionTreeClassifier openml==0.12.2,sklearn==0.24.2
18820


In [None]:
flow_id = openml.flows.flow_exists(name=flow.name, external_version=flow.external_version)
print(flow_id)

In [9]:
flow_ids = openml.flows.get_flow_id(name=flow.name)
print(flow_ids)

[17367, 17456, 17504, 17609, 18586, 18612, 18685, 18700, 18708, 18719, 18734, 18747, 18756, 18757, 18758, 18761, 18768, 18790, 18814, 18820, 18858, 19020, 19034, 19044, 19085, 19117, 19123, 19147]


In [10]:
flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False)
print(flow_ids)

[17367, 17456, 17504, 17609, 18586, 18612, 18685, 18700, 18708, 18719, 18734, 18747, 18756, 18757, 18758, 18761, 18768, 18790, 18814, 18820, 18858, 19020, 19034, 19044, 19085, 19117, 19123, 19147]


## benchmark suites

## logging

## benchmark studies

In [12]:
import uuid
from sklearn.ensemble import RandomForestClassifier

study_id = 123
study = openml.study.get_study(study_id)
evaluations = openml.evaluations.list_evaluations(
    function="predictive_accuracy", output_format="dataframe", study=study.study_id,
)
evaluations.head()

Unnamed: 0,run_id,task_id,setup_id,flow_id,flow_name,data_id,data_name,function,upload_time,uploader,uploader_name,value,values,array_data
0,9199877,3,7130157,7722,sklearn.model_selection._search.RandomizedSear...,3,kr-vs-kp,predictive_accuracy,2018-05-06 08:39:07,3886,Benjamin Strang,0.974969,,
1,9199878,6,7130158,7722,sklearn.model_selection._search.RandomizedSear...,6,letter,predictive_accuracy,2018-05-06 08:41:04,3886,Benjamin Strang,0.7165,,
2,9199879,6,7130159,7729,sklearn.model_selection._search.RandomizedSear...,6,letter,predictive_accuracy,2018-05-06 08:43:06,3886,Benjamin Strang,0.9672,,
3,9199880,11,7130158,7722,sklearn.model_selection._search.RandomizedSear...,11,balance-scale,predictive_accuracy,2018-05-06 08:43:08,3886,Benjamin Strang,0.8864,,
4,9199881,11,7130159,7729,sklearn.model_selection._search.RandomizedSear...,11,balance-scale,predictive_accuracy,2018-05-06 08:43:09,3886,Benjamin Strang,0.976,,


In [None]:
# uploading studies

import uuid
from sklearn.ensemble import RandomForestClassifier

# Model to be used
clf = RandomForestClassifier()

# We'll create a study with one run on 3 datasets present in the suite
tasks = [115, 259, 307]

# To verify
suite = openml.study.get_suite(1)
print(all([t_id in suite.tasks for t_id in tasks]))

run_ids = []
for task_id in tasks:
    task = openml.tasks.get_task(task_id)
    run = openml.runs.run_model_on_task(clf, task)
    run.publish()
    run_ids.append(run.run_id)

# The study needs a machine-readable and unique alias. To obtain this,
# we simply generate a random uuid.
alias = uuid.uuid4().hex

new_study = openml.study.create_study(
    name="Test-Study",
    description="Test study for the Python tutorial on studies",
    run_ids=run_ids,
    alias=alias,
    benchmark_suite=suite.study_id,
)
new_study.publish()
print(new_study)

## run setup
## tasks
## datasets
## creating and using a custom flow

## tasks: retreving splits

In [14]:
task_id = 233
task = openml.tasks.get_task(task_id)
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
    "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
        task_id, n_repeats, n_folds, n_samples,
    )
)

Task 233: number of repeats: 1, number of folds: 1, number of samples 1.


In [15]:
train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0,)

print(train_indices.shape, train_indices.dtype)
print(test_indices.shape, test_indices.dtype)

(2142,) int32
(1054,) int32


In [16]:
# split data 

X, y = task.get_X_and_y(dataset_format="dataframe")
X_train = X.iloc[train_indices]
y_train = y.iloc[train_indices]
X_test = X.iloc[test_indices]
y_test = y.iloc[test_indices]

print(
    "X_train.shape: {}, y_train.shape: {}, X_test.shape: {}, y_test.shape: {}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape,
    )
)

X_train.shape: (2142, 36), y_train.shape: (2142,), X_test.shape: (1054, 36), y_test.shape: (1054,)


In [18]:
# cross-validation version

task_id = 3
task = openml.tasks.get_task(task_id)
X, y = task.get_X_and_y(dataset_format="dataframe")
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
    "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
        task_id, n_repeats, n_folds, n_samples,
    )
)

for repeat_idx in range(n_repeats):
    for fold_idx in range(n_folds):
        for sample_idx in range(n_samples):
            train_indices, test_indices = task.get_train_test_split_indices(
                repeat=repeat_idx, fold=fold_idx, sample=sample_idx,
            )
            X_train = X.iloc[train_indices]
            y_train = y.iloc[train_indices]
            X_test = X.iloc[test_indices]
            y_test = y.iloc[test_indices]

            print(
                "Repeat #{}, fold #{}, samples {}: X_train.shape: {}, "
                "y_train.shape {}, X_test.shape {}, y_test.shape {}".format(
                    repeat_idx,
                    fold_idx,
                    sample_idx,
                    X_train.shape,
                    y_train.shape,
                    X_test.shape,
                    y_test.shape,
                )
            )

Task 3: number of repeats: 1, number of folds: 10, number of samples 1.
Repeat #0, fold #0, samples 0: X_train.shape: (2876, 36), y_train.shape (2876,), X_test.shape (320, 36), y_test.shape (320,)
Repeat #0, fold #1, samples 0: X_train.shape: (2876, 36), y_train.shape (2876,), X_test.shape (320, 36), y_test.shape (320,)
Repeat #0, fold #2, samples 0: X_train.shape: (2876, 36), y_train.shape (2876,), X_test.shape (320, 36), y_test.shape (320,)
Repeat #0, fold #3, samples 0: X_train.shape: (2876, 36), y_train.shape (2876,), X_test.shape (320, 36), y_test.shape (320,)
Repeat #0, fold #4, samples 0: X_train.shape: (2876, 36), y_train.shape (2876,), X_test.shape (320, 36), y_test.shape (320,)
Repeat #0, fold #5, samples 0: X_train.shape: (2876, 36), y_train.shape (2876,), X_test.shape (320, 36), y_test.shape (320,)
Repeat #0, fold #6, samples 0: X_train.shape: (2877, 36), y_train.shape (2877,), X_test.shape (319, 36), y_test.shape (319,)
Repeat #0, fold #7, samples 0: X_train.shape: (2877, 

In [None]:
# multiple repeats version

task_id = 1767
task = openml.tasks.get_task(task_id)
X, y = task.get_X_and_y(dataset_format="dataframe")
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
    "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
        task_id, n_repeats, n_folds, n_samples,
    )
)

for repeat_idx in range(n_repeats):
    for fold_idx in range(n_folds):
        for sample_idx in range(n_samples):
            train_indices, test_indices = task.get_train_test_split_indices(
                repeat=repeat_idx, fold=fold_idx, sample=sample_idx,
            )
            X_train = X.iloc[train_indices]
            y_train = y.iloc[train_indices]
            X_test = X.iloc[test_indices]
            y_test = y.iloc[test_indices]

            print(
                "Repeat #{}, fold #{}, samples {}: X_train.shape: {}, "
                "y_train.shape {}, X_test.shape {}, y_test.shape {}".format(
                    repeat_idx,
                    fold_idx,
                    sample_idx,
                    X_train.shape,
                    y_train.shape,
                    X_test.shape,
                    y_test.shape,
                )
            )

In [19]:
# learning curves version

task_id = 1702
task = openml.tasks.get_task(task_id)
X, y = task.get_X_and_y(dataset_format="dataframe")
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
    "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
        task_id, n_repeats, n_folds, n_samples,
    )
)

for repeat_idx in range(n_repeats):
    for fold_idx in range(n_folds):
        for sample_idx in range(n_samples):
            train_indices, test_indices = task.get_train_test_split_indices(
                repeat=repeat_idx, fold=fold_idx, sample=sample_idx,
            )
            X_train = X.iloc[train_indices]
            y_train = y.iloc[train_indices]
            X_test = X.iloc[test_indices]
            y_test = y.iloc[test_indices]

            print(
                "Repeat #{}, fold #{}, samples {}: X_train.shape: {}, "
                "y_train.shape {}, X_test.shape {}, y_test.shape {}".format(
                    repeat_idx,
                    fold_idx,
                    sample_idx,
                    X_train.shape,
                    y_train.shape,
                    X_test.shape,
                    y_test.shape,
                )
            )

Task 1702: number of repeats: 1, number of folds: 10, number of samples 12.
Repeat #0, fold #0, samples 0: X_train.shape: (64, 36), y_train.shape (64,), X_test.shape (320, 36), y_test.shape (320,)
Repeat #0, fold #0, samples 1: X_train.shape: (91, 36), y_train.shape (91,), X_test.shape (320, 36), y_test.shape (320,)
Repeat #0, fold #0, samples 2: X_train.shape: (128, 36), y_train.shape (128,), X_test.shape (320, 36), y_test.shape (320,)
Repeat #0, fold #0, samples 3: X_train.shape: (181, 36), y_train.shape (181,), X_test.shape (320, 36), y_test.shape (320,)
Repeat #0, fold #0, samples 4: X_train.shape: (256, 36), y_train.shape (256,), X_test.shape (320, 36), y_test.shape (320,)
Repeat #0, fold #0, samples 5: X_train.shape: (362, 36), y_train.shape (362,), X_test.shape (320, 36), y_test.shape (320,)
Repeat #0, fold #0, samples 6: X_train.shape: (512, 36), y_train.shape (512,), X_test.shape (320, 36), y_test.shape (320,)
Repeat #0, fold #0, samples 7: X_train.shape: (724, 36), y_train.sh