# User guide
- https://openml.github.io/openml-python/develop/usage.html#usage

In [2]:
import openml
from openml import datasets, tasks, runs, flows, config
from openml.datasets import edit_dataset, fork_dataset, get_dataset
import os, pandas as pd, sklearn, arff, pprint, numpy as np
from sklearn import neighbors
from openml.tasks import TaskType

In [None]:
# A simple example

config.start_using_configuration_for_example()

task = openml.tasks.get_task(99) # 403
data = openml.datasets.get_dataset(task.dataset_id)
clf = neighbors.KNeighborsClassifier(n_neighbors=5)
run = openml.runs.run_model_on_task(clf, task, avoid_duplicate_runs=False)
# Publish the experiment on OpenML (optional, requires an API key).
# For this tutorial, our configuration publishes to the test server
# as to not crowd the main server with runs created by examples.
myrun = run.publish()
print(f"kNN on {data.name}: {myrun.openml_url}")

config.stop_using_configuration_for_example()

## Configuration

In [3]:
config.apikey = ''
config.server = 'https://www.openml.org/api/v1' 
config.set_cache_directory(os.path.expanduser('~/openml/cache'))
config.cachedir = '~/openml/cache'

# Using OpenML benchmark suites

In [4]:
# using the main entity type task, only benchmark suites are returned
# each benchmark suite has an ID, some also have an alias. These can be
# used to obtain the full details. 

studies = openml.study.list_suites(status = 'all') # OrderedDict
studies.keys()
studies[99]

{'id': 99,
 'alias': 'OpenML-CC18',
 'main_entity_type': 'task',
 'name': 'OpenML-CC18 Curated Classification benchmark',
 'status': 'active',
 'creation_date': '2019-02-21 18:47:13',
 'creator': 1}

In [5]:
benchmark_suite = openml.study.get_suite('OpenML-CC18') # obtain the benchmark suite
benchmark_suite

OpenML Benchmark Suite
ID..............: 99
Name............: OpenML-CC18 Curated Classification benchmark
Status..........: active
Main Entity Type: task
Study URL.......: https://www.openml.org/s/99
# of Data.......: 72
# of Tasks......: 72
Creator.........: https://www.openml.org/u/1
Upload Time.....: 2019-02-21 18:47:13

In [6]:
for task_id in benchmark_suite.tasks:  # iterate over all tasks
    task = openml.tasks.get_task(task_id)  # download the OpenML task
    print(task)
    features, targets = task.get_X_and_y()  # get the data
    break

OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 3
Task URL.............: https://www.openml.org/t/3
Estimation Procedure.: crossvalidation
Target Feature.......: class
# of Classes.........: 2
Cost Matrix..........: Available


In [None]:
benchmark_suite = openml.study.get_suite('OpenML-CC18')  # obtain the benchmark suite

# build a scikit-learn classifier
clf = sklearn.pipeline.make_pipeline(sklearn.preprocessing.Imputer(),
                                     sklearn.tree.DecisionTreeClassifier())

for task_id in benchmark_suite.tasks:  # iterate over all tasks

    task = openml.tasks.get_task(task_id)  # download the OpenML task
    run = openml.runs.run_model_on_task(clf, task)  # run the classifier on the task
    score = run.get_metric_score(sklearn.metrics.accuracy_score)  # print accuracy score
    print('Data set: %s; Accuracy: %0.2f' % (task.get_dataset().name,score.mean()))
    run.publish()  # publish the experiment on OpenML (optional, requires internet and an API key)
    print('URL for run: %s/run/%d' %(openml.config.server,run.run_id))
    break

In [None]:
# Define a scikit-learn classifier or pipeline
clf = sklearn.pipeline.Pipeline(
    steps=[
        ('imputer', sklearn.impute.SimpleImputer()),
        ('estimator', sklearn.tree.DecisionTreeClassifier())
    ]
)
# Download the OpenML task for the german credit card dataset with 10-fold
# cross-validation.
task = openml.tasks.get_task(32)
# Run the scikit-learn model on the task.
run = openml.runs.run_model_on_task(clf, task)
# Publish the experiment on OpenML (optional, requires an API key.
# You can get your own API key by signing up to OpenML.org)
run.publish()
print(f'View the run online: {run.openml_url}')

In [None]:
# runs = openml.runs.list_runs(task=benchmark_suite.tasks, limit=1000)

# Further code examples and use cases

In [None]:
# https://github.com/openml/benchmark-suites

# Cheet Sheet

In [8]:
dlist = datasets.list_datasets(size=100)
pd.DataFrame.from_dict(dlist, orient='index')[['name','NumberOfInstances', 'NumberOfFeatures']][:3]

Unnamed: 0,name,NumberOfInstances,NumberOfFeatures
2,anneal,898.0,39.0
3,kr-vs-kp,3196.0,37.0
4,labor,57.0,17.0


# Benchmark studies
- studies are a container for runs 

In [10]:
studies = openml.study.list_studies(output_format="dataframe", status="all")
studies.head(n=10)

Unnamed: 0,id,alias,main_entity_type,name,status,creation_date,creator
1,1,Study_1,run,A large-scale comparison of classification alg...,in_preparation,2015-10-20 15:27:26,2
2,2,Study_2,run,Fast Algorithm Selection using Learning Curves,in_preparation,2015-10-20 15:28:44,2
3,3,Study_3,run,Multi-Task Learning with a Natural Metric for ...,in_preparation,2015-10-20 15:34:39,2
5,5,Study_5,run,Local and Global Feature Selection on Multilab...,in_preparation,2015-11-19 11:20:33,749
7,7,Study_7,run,Massive machine learning experiments using mlr...,in_preparation,2016-01-06 17:49:36,64
8,8,Study_8,run,Decision tree comparaison,in_preparation,2016-03-13 13:38:31,1135
10,10,Study_10,run,Collaborative primer,in_preparation,2016-03-16 12:16:08,507
11,11,Study_11,run,Having a Blast: Meta-Learning and Heterogeneou...,in_preparation,2016-03-22 16:48:06,1
12,12,Study_12,run,Subspace Clustering via Seeking Neighbors with...,in_preparation,2016-03-31 15:05:45,1195
13,13,Study_13,run,Meta-QSAR: learning how to learn QSARs,in_preparation,2016-04-05 13:57:41,62


In [11]:
study = openml.study.get_study(123)
print('\n study \n', study)
print('\n study description \n', study.description)
print('\n study runs \n', study.runs[:10])


 study 
 OpenML Study
ID..............: 123
Name............: Linear vs. Non Linear
Status..........: active
Main Entity Type: run
Study URL.......: https://www.openml.org/s/123
# of Data.......: 299
# of Tasks......: 299
# of Flows......: 5
# of Runs.......: 1693
Creator.........: https://www.openml.org/u/1
Upload Time.....: 2019-02-21 19:55:30

 study description 
 Comparison of linear and non-linear models.

[Jupyter Notebook](https://github.com/janvanrijn/linear-vs-non-linear/blob/master/notebook/Linear-vs-Non-Linear.ipynb)

 study runs 
 [9199877, 9199878, 9199879, 9199880, 9199881, 9199882, 9199883, 9199884, 9199885, 9199886]


In [12]:
evaluations = openml.evaluations.list_evaluations(
    function="predictive_accuracy", output_format="dataframe", study=study.study_id,
)
evaluations.head()

Unnamed: 0,run_id,task_id,setup_id,flow_id,flow_name,data_id,data_name,function,upload_time,uploader,uploader_name,value,values,array_data
0,9199877,3,7130157,7722,sklearn.model_selection._search.RandomizedSear...,3,kr-vs-kp,predictive_accuracy,2018-05-06 08:39:07,3886,Benjamin Strang,0.974969,,
1,9199878,6,7130158,7722,sklearn.model_selection._search.RandomizedSear...,6,letter,predictive_accuracy,2018-05-06 08:41:04,3886,Benjamin Strang,0.7165,,
2,9199879,6,7130159,7729,sklearn.model_selection._search.RandomizedSear...,6,letter,predictive_accuracy,2018-05-06 08:43:06,3886,Benjamin Strang,0.9672,,
3,9199880,11,7130158,7722,sklearn.model_selection._search.RandomizedSear...,11,balance-scale,predictive_accuracy,2018-05-06 08:43:08,3886,Benjamin Strang,0.8864,,
4,9199881,11,7130159,7729,sklearn.model_selection._search.RandomizedSear...,11,balance-scale,predictive_accuracy,2018-05-06 08:43:09,3886,Benjamin Strang,0.976,,


# Tasks
- https://openml.github.io/openml-python/develop/examples/30_extended/tasks_tutorial.html#sphx-glr-examples-30-extended-tasks-tutorial-py

In [14]:
# tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
# tasks = openml.tasks.list_tasks(output_format="dataframe") # 46592개
# tasks = openml.tasks.list_tasks(tag="OpenML100", output_format="dataframe") # 91개
# tasks = openml.tasks.list_tasks(data_id=1471, output_format="dataframe") 
tasks = openml.tasks.list_tasks(size=10, offset=50, output_format="dataframe")

# tasks = pd.DataFrame.from_dict(tasks, orient="index")
print(tasks.columns)
print(f"First 5 of {len(tasks)} tasks:")
tasks.head()

Index(['tid', 'ttid', 'did', 'name', 'task_type', 'status',
       'estimation_procedure', 'evaluation_measures', 'source_data',
       'target_feature', 'MajorityClassSize', 'MaxNominalAttDistinctValues',
       'MinorityClassSize', 'NumberOfClasses', 'NumberOfFeatures',
       'NumberOfInstances', 'NumberOfInstancesWithMissingValues',
       'NumberOfMissingValues', 'NumberOfNumericFeatures',
       'NumberOfSymbolicFeatures', 'number_samples'],
      dtype='object')
First 5 of 10 tasks:


Unnamed: 0,tid,ttid,did,name,task_type,status,estimation_procedure,evaluation_measures,source_data,target_feature,...,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures,number_samples
59,59,TaskType.SUPERVISED_CLASSIFICATION,61,iris,Supervised Classification,active,10-fold Crossvalidation,predictive_accuracy,61,class,...,3.0,50.0,3,5,150,0,0,4,1,
60,60,TaskType.SUPERVISED_CLASSIFICATION,62,zoo,Supervised Classification,active,10-fold Crossvalidation,predictive_accuracy,62,type,...,7.0,4.0,7,17,101,0,0,1,16,
62,62,TaskType.LEARNING_CURVE,2,anneal,Learning Curve,active,10 times 10-fold Learning Curve,predictive_accuracy,2,class,...,7.0,8.0,5,39,898,898,22175,6,33,9.0
63,63,TaskType.LEARNING_CURVE,3,kr-vs-kp,Learning Curve,active,10 times 10-fold Learning Curve,predictive_accuracy,3,class,...,3.0,1527.0,2,37,3196,0,0,0,37,12.0
64,64,TaskType.LEARNING_CURVE,4,labor,Learning Curve,active,10 times 10-fold Learning Curve,predictive_accuracy,4,class,...,3.0,20.0,2,17,57,56,326,8,9,1.0


In [15]:
filtered_tasks = tasks.query("NumberOfInstances > 500 and NumberOfInstances < 1000")
print(list(filtered_tasks.index))

[62]


In [16]:
filtered_tasks = filtered_tasks.query('estimation_procedure == "10-fold Crossvalidation"')
print(list(filtered_tasks.index))

[]


## Downloading tasks

In [17]:
ids = [2, 1891, 31, 9983]
tasks = openml.tasks.get_tasks(ids)
print(tasks[0])

OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 2
Task URL.............: https://www.openml.org/t/2
Estimation Procedure.: crossvalidation
Evaluation Measure...: predictive_accuracy
Target Feature.......: class
# of Classes.........: 6
Cost Matrix..........: Available


## Creating tasks

In [45]:
openml.config.start_using_configuration_for_example()

try:
    my_task = openml.tasks.create_task(
        task_type=TaskType.SUPERVISED_CLASSIFICATION,
        dataset_id=128,
        target_name="class",
        evaluation_measure="predictive_accuracy",
        estimation_procedure_id=1,
    )
    my_task.publish()
except openml.exceptions.OpenMLServerException as e:
    # Error code for 'task already exists'
    if e.code == 614:
        # Lookup task
        tasks = openml.tasks.list_tasks(data_id=128, output_format="dataframe")
        tasks = tasks.query(
            'task_type == "Supervised Classification" '
            'and estimation_procedure == "10-fold Crossvalidation" '
            'and evaluation_measures == "predictive_accuracy"'
        )
        task_id = tasks.loc[:, "tid"].values[0]
        print("Task already exists. Task ID is", task_id)

# reverting to prod server
openml.config.stop_using_configuration_for_example()



  "Using the test server may result in reduced performance of the API!".format(server)


# Datasets

In [18]:
openml_list = openml.datasets.list_datasets()  # returns a dict

# Show a nice table with some key data properties
datalist = pd.DataFrame.from_dict(openml_list, orient="index")
datalist = datalist[["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]]

print(f"First 10 of {len(datalist)} datasets...")
datalist.head(n=10)

# The same can be done with lesser lines of code
openml_df = openml.datasets.list_datasets(output_format="dataframe")
openml_df.head(n=10)

First 10 of 4333 datasets...


Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
2,2,anneal,1,1,active,ARFF,684.0,7.0,8.0,5.0,39.0,898.0,898.0,22175.0,6.0,33.0
3,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
4,4,labor,1,1,active,ARFF,37.0,3.0,20.0,2.0,17.0,57.0,56.0,326.0,8.0,9.0
5,5,arrhythmia,1,1,active,ARFF,245.0,13.0,2.0,13.0,280.0,452.0,384.0,408.0,206.0,74.0
6,6,letter,1,1,active,ARFF,813.0,26.0,734.0,26.0,17.0,20000.0,0.0,0.0,16.0,1.0
7,7,audiology,1,1,active,ARFF,57.0,24.0,1.0,24.0,70.0,226.0,222.0,317.0,0.0,70.0
8,8,liver-disorders,1,1,active,ARFF,,,,0.0,6.0,345.0,0.0,0.0,6.0,0.0
9,9,autos,1,1,active,ARFF,67.0,22.0,3.0,6.0,26.0,205.0,46.0,59.0,15.0,11.0
10,10,lymph,1,1,active,ARFF,81.0,8.0,2.0,4.0,19.0,148.0,0.0,0.0,3.0,16.0
11,11,balance-scale,1,1,active,ARFF,288.0,3.0,49.0,3.0,5.0,625.0,0.0,0.0,4.0,1.0


In [19]:
datalist[datalist.NumberOfInstances > 10000].sort_values(["NumberOfInstances"]).head(n=20)
""
datalist.query('name == "eeg-eye-state"')
""
datalist.query("NumberOfClasses > 50")

Unnamed: 0,did,name,NumberOfInstances,NumberOfFeatures,NumberOfClasses
1491,1491,one-hundred-plants-margin,1600.0,65.0,100.0
1492,1492,one-hundred-plants-shape,1600.0,65.0,100.0
1493,1493,one-hundred-plants-texture,1599.0,65.0,100.0
4552,4552,BachChoralHarmony,5665.0,17.0,102.0
41167,41167,dionis,416188.0,61.0,355.0
41169,41169,helena,65196.0,28.0,100.0
41960,41960,seattlecrime6,523590.0,8.0,144.0
41983,41983,CIFAR-100,60000.0,3073.0,100.0
42078,42078,beer_reviews,1586614.0,13.0,104.0
42087,42087,beer_reviews,1586614.0,13.0,104.0
