In [1]:
import uuid

import numpy as np
import pandas as pd
from trustyai.model import Dataset

# From Pandas dataframes

In [2]:
# Number of synthetic rows; shared by every column so the lengths always agree.
N_ROWS = 100

# No seed is set, so the generated values differ from run to run.
data = {
    'x1': np.random.uniform(low=100, high=200, size=N_ROWS),
    'x2': np.random.uniform(low=5000, high=10000, size=N_ROWS),
    'x3': [str(uuid.uuid4()) for _ in range(N_ROWS)],  # random UUID strings
    'x4': np.random.randint(low=0, high=42, size=N_ROWS),  # upper bound is exclusive
    'select': np.random.choice(a=[False, True], size=N_ROWS),
}
df = pd.DataFrame(data=data)

In [3]:
# Preview the first rows of the generated frame.
df.head()

Unnamed: 0,x1,x2,x3,x4,select
0,162.18128,8288.079075,a95fc8ff-c9d5-430e-b5f4-4bbe12d28ac2,32,False
1,147.418671,8161.865902,a8e2c568-dd5b-40dc-b0d9-b9aa02535e05,16,False
2,185.284693,5977.657132,531c88b6-c022-4625-b095-d29bb85a60d1,8,False
3,189.836512,8903.404609,e2598801-cf6b-4e64-80cf-9293aa2a004d,24,True
4,176.881828,8299.411857,ea1a87ef-109a-42dd-8727-5270288aa8df,2,False


If no columns are specified as outputs, by default the right-most column (in this case, `select`) will be considered the output.

In [4]:
# Wrap the frame without naming any output columns, so the default
# output selection (the right-most column) applies.
ds = Dataset.from_df(df)

In [5]:
# Number of inputs in the dataset (the frame was built with 100 rows).
len(ds.inputs)

100

In [6]:
# Features of the first input; the output column is not among them.
print(ds.inputs[0].features)

[Feature{name='x1', type=number, value=162.18128018074924}, Feature{name='x2', type=number, value=8288.079075388061}, Feature{name='x3', type=categorical, value=a95fc8ff-c9d5-430e-b5f4-4bbe12d28ac2}, Feature{name='x4', type=number, value=32}]


In [7]:
# Number of outputs in the dataset.
len(ds.outputs)

100

In [8]:
# Outputs attached to the first row.
print(ds.outputs[0].outputs)

[Output{value=false, type=boolean, score=1.0, name='select'}]


We can specify the outputs explicitly, though:

In [9]:
# Name the output columns explicitly instead of relying on the default.
output_columns = ['x1', 'x4']
ds = Dataset.from_df(df, outputs=output_columns)

In [10]:
# With x1 and x4 taken as outputs, the remaining columns make up the input features.
print(ds.inputs[0].features)

[Feature{name='x2', type=number, value=8288.079075388061}, Feature{name='x3', type=categorical, value=a95fc8ff-c9d5-430e-b5f4-4bbe12d28ac2}, Feature{name='select', type=boolean, value=false}]


In [11]:
# Both requested columns appear as outputs of the first row.
print(ds.outputs[0].outputs)

[Output{value=162.18128018074924, type=number, score=1.0, name='x1'}, Output{value=32.0, type=number, score=1.0, name='x4'}]


# From NumPy arrays

In [12]:
# Random matrix used as the NumPy example: 100 rows by 5 columns.
n_rows, n_cols = 100, 5
a = np.random.rand(n_rows, n_cols)

In [13]:
# Show only the first ten rows rather than the whole array.
a[:10]

array([[0.27953374, 0.14350214, 0.85835075, 0.18845806, 0.01206274],
       [0.53248526, 0.67195857, 0.71734904, 0.39929908, 0.66092277],
       [0.45276017, 0.0480453 , 0.58338174, 0.78245456, 0.93636845],
       [0.79390006, 0.32327136, 0.98300792, 0.13107499, 0.43482848],
       [0.02587618, 0.01923374, 0.16373235, 0.35224968, 0.14961464],
       [0.82164409, 0.76620166, 0.23224766, 0.57686832, 0.5354371 ],
       [0.27912304, 0.73473497, 0.91184353, 0.99083292, 0.93095475],
       [0.67366937, 0.92351402, 0.90195418, 0.58753937, 0.67496797],
       [0.02141828, 0.34273817, 0.61652239, 0.8376688 , 0.6660273 ],
       [0.20558989, 0.38528159, 0.14387614, 0.33153919, 0.97763219]])

In [14]:
# Build a dataset from the array without naming any output columns.
ds = Dataset.from_numpy(a)

In [15]:
# Number of inputs built from the array.
len(ds.inputs)

100

Input and output features will be named automatically. As with dataframes, if no output is specified, the right-most column will be selected:

In [16]:
# Input features of the first row.
print(ds.inputs[0].features)

[Feature{name='input-0', type=number, value=0.2795337386802256}, Feature{name='input-1', type=number, value=0.1435021446779371}, Feature{name='input-2', type=number, value=0.8583507494791435}, Feature{name='input-3', type=number, value=0.18845806309449065}]


In [17]:
# Output of the first row.
print(ds.outputs[0].outputs)

[Output{value=0.012062739012541845, type=number, score=1.0, name='output-0'}]


To specify the output columns with NumPy arrays, we use the column indices:

In [18]:
# Select the output columns by position.
output_indices = [0, 2]
ds = Dataset.from_numpy(a, outputs=output_indices)

In [19]:
# Input names are renumbered from zero over the columns left once the outputs are removed.
print(ds.inputs[0].features)

[Feature{name='input-0', type=number, value=0.1435021446779371}, Feature{name='input-1', type=number, value=0.18845806309449065}, Feature{name='input-2', type=number, value=0.012062739012541845}]


In [20]:
# Outputs of the first row, named output-0 and output-1.
print(ds.outputs[0].outputs)

[Output{value=0.2795337386802256, type=number, score=1.0, name='output-0'}, Output{value=0.8583507494791435, type=number, score=1.0, name='output-1'}]
