In [1]:
import numpy as np
import pandas as pd
import uuid

# From Pandas dataframes

In [2]:
N_ROWS = 100  # number of synthetic rows to generate
SEED = 42     # fixed seed so a fresh "Restart & Run All" rebuilds the same frame

# A dedicated, seeded generator keeps this cell deterministic without
# touching NumPy's global random state.
rng = np.random.default_rng(SEED)

data = {
    'x1': rng.uniform(low=100, high=200, size=N_ROWS),
    'x2': rng.uniform(low=5000, high=10000, size=N_ROWS),
    # Version-4 UUID strings built from seeded bytes; uuid.uuid4() would draw
    # from OS entropy and produce different identifiers on every run.
    'x3': [str(uuid.UUID(bytes=rng.bytes(16), version=4)) for _ in range(N_ROWS)],
    # `integers` excludes `high` by default, so values fall in [0, 42).
    'x4': rng.integers(low=0, high=42, size=N_ROWS),
    'select': rng.choice(a=[False, True], size=N_ROWS),
}
df = pd.DataFrame(data=data)

In [3]:
# Preview the first rows of the generated dataframe.
df.head()

Unnamed: 0,x1,x2,x3,x4,select
0,160.261305,6978.850214,9e6c22d5-7703-4cbf-87e3-6d94188ef07f,14,True
1,132.906274,6061.594338,173066d2-b9e1-4d00-a2e3-572762a227cc,6,True
2,106.956706,5566.16472,ea962608-cdde-481a-8a9c-92b160e1f4f1,3,False
3,140.831635,7149.308507,8c7ef383-6f8b-4f86-9c5c-13354de9ef50,2,False
4,159.180976,6562.940986,a322e3ad-c948-43eb-a2a1-7b0173116786,27,False


If no columns are specified as outputs, by default the right-most column (in this case, `select`) will be considered the output.

In [4]:
from trustyai.model import Dataset

# Build a Dataset from the dataframe. No `outputs` argument is passed, so the
# choice of output column is left to the library.
ds = Dataset.from_df(df)

In [5]:
# Count the input entries held by the dataset.
len(ds.inputs)

100

In [6]:
# Inspect the features of the first input entry.
print(ds.inputs[0].features)

[Feature{name='x1', type=number, value=160.26130450351235}, Feature{name='x2', type=number, value=6978.850214496881}, Feature{name='x3', type=categorical, value=9e6c22d5-7703-4cbf-87e3-6d94188ef07f}, Feature{name='x4', type=number, value=14}]


In [7]:
# Count the output entries held by the dataset.
len(ds.outputs)

100

In [8]:
# Inspect the outputs of the first output entry.
print(ds.outputs[0].outputs)

[Output{value=true, type=boolean, score=1.0, name='select'}]


We can specify the outputs explicitly, though:

In [9]:
# Rebuild the dataset, naming columns `x1` and `x4` as the outputs.
ds = Dataset.from_df(df, outputs=['x1', 'x4'])

In [10]:
# Features of the first input entry after choosing the outputs explicitly.
print(ds.inputs[0].features)

[Feature{name='x2', type=number, value=6978.850214496881}, Feature{name='x3', type=categorical, value=9e6c22d5-7703-4cbf-87e3-6d94188ef07f}, Feature{name='select', type=boolean, value=true}]


In [11]:
# Outputs of the first entry after choosing the outputs explicitly.
print(ds.outputs[0].outputs)

[Output{value=160.26130450351235, type=number, score=1.0, name='x1'}, Output{value=14.0, type=number, score=1.0, name='x4'}]


# From NumPy arrays

In [12]:
# Uniform samples in [0, 1) with shape (100, 5), drawn from a seeded generator
# so the array is identical on every fresh run.
a = np.random.default_rng(42).random((100, 5))

In [13]:
# Show the first ten rows of the array (all columns).
a[:10]

array([[0.71124837, 0.25497396, 0.0043879 , 0.01941601, 0.13462067],
       [0.49391799, 0.4434559 , 0.81988994, 0.0335407 , 0.63943506],
       [0.08233212, 0.57428066, 0.4512564 , 0.22443756, 0.43393187],
       [0.89279482, 0.06394515, 0.2991043 , 0.50414473, 0.11474676],
       [0.63873596, 0.20519217, 0.43393211, 0.30621764, 0.4230174 ],
       [0.87026573, 0.50252132, 0.52861229, 0.85258927, 0.1854916 ],
       [0.99222415, 0.66777924, 0.86628986, 0.20177231, 0.16301299],
       [0.21190206, 0.72516796, 0.3604707 , 0.02035639, 0.20955053],
       [0.20244529, 0.43631068, 0.03200247, 0.52953023, 0.62815561],
       [0.41459242, 0.25297997, 0.31081976, 0.82771245, 0.6524537 ]])

In [14]:
# Build a Dataset from the NumPy array; no `outputs` argument is passed.
ds = Dataset.from_numpy(a)

In [15]:
# Count the input entries held by the array-backed dataset.
len(ds.inputs)

100

Input and output features will be named automatically. As with dataframes, if no output is specified, the right-most column will be selected:

In [16]:
# Inspect the features of the first input entry.
print(ds.inputs[0].features)

[Feature{name='input-0', type=number, value=0.7112483688476773}, Feature{name='input-1', type=number, value=0.25497395568912773}, Feature{name='input-2', type=number, value=0.004387904765764783}, Feature{name='input-3', type=number, value=0.019416008661728656}]


In [17]:
# Inspect the outputs of the first output entry.
print(ds.outputs[0].outputs)

[Output{value=0.13462066527655192, type=number, score=1.0, name='output-0'}]


To specify the output columns with NumPy arrays, we use the column indices:

In [18]:
# Rebuild the dataset, selecting columns 0 and 2 of the array as the outputs.
ds = Dataset.from_numpy(a, outputs=[0, 2])

In [19]:
# Features of the first input entry after choosing the outputs by index.
print(ds.inputs[0].features)

[Feature{name='input-0', type=number, value=0.25497395568912773}, Feature{name='input-1', type=number, value=0.019416008661728656}, Feature{name='input-2', type=number, value=0.13462066527655192}]


In [20]:
# Outputs of the first entry after choosing the outputs by index.
print(ds.outputs[0].outputs)

[Output{value=0.7112483688476773, type=number, score=1.0, name='output-0'}, Output{value=0.004387904765764783, type=number, score=1.0, name='output-1'}]
