In [1]:
import trustyai

# Initialise TrustyAI; this runs before any other trustyai import in the notebook.
# NOTE(review): presumably this starts the Java backend (the reprs printed
# further down look like Java toString output) — confirm against the library docs.
trustyai.init()

In [2]:
import numpy as np
import pandas as pd
import uuid

# From Pandas dataframes

In [3]:
# Build a small synthetic frame with numeric, string and boolean columns.
# A seeded Generator is used (instead of the global, unseeded np.random state)
# so that "Restart Kernel -> Run All" regenerates exactly the same data.
SEED = 42
N_ROWS = 100
rng = np.random.default_rng(SEED)

data = {
    # two continuous numeric features on different scales
    'x1': rng.uniform(low=100, high=200, size=N_ROWS),
    'x2': rng.uniform(low=5000, high=10000, size=N_ROWS),
    # free-form string feature; the UUIDs are derived from the seeded generator
    # (rather than uuid.uuid4()) so this column is reproducible as well
    'x3': [str(uuid.UUID(bytes=rng.bytes(16), version=4)) for _ in range(N_ROWS)],
    # integer feature in [0, 42) -- `high` is exclusive, as with np.random.randint
    'x4': rng.integers(low=0, high=42, size=N_ROWS),
    # boolean column; it is the right-most one in the frame
    'select': rng.choice(a=[False, True], size=N_ROWS)
}
df = pd.DataFrame(data=data)

In [4]:
# Preview the first five rows of the synthetic frame.
df.head()

Unnamed: 0,x1,x2,x3,x4,select
0,180.336944,8511.929428,9c86c184-c012-40e6-9717-92fac3ded6b4,3,True
1,180.724603,7829.770876,32457b8c-5961-4d4a-8df4-9a3200b8f5f5,25,False
2,195.642489,9968.139201,d2c00720-cc19-440a-8307-822baca41d6b,8,True
3,143.899402,8825.876406,595ddbd0-95be-4614-95c5-61bee3e1709f,20,False
4,168.713244,7360.258384,54d6e7ab-5f63-475b-a478-76629212bc84,31,False


If no columns are specified as outputs, then by default the right-most column (in this case, `select`) will be considered the output.

In [5]:
from trustyai.model import Dataset

# No `outputs` argument is passed, so the right-most column (`select`)
# is treated as the output and the remaining columns as inputs.
ds = Dataset.from_df(df)

In [7]:
# Number of input entries: one per DataFrame row.
len(ds.inputs)

100

In [10]:
# Features of the first row: every column except the output column `select`.
print(ds.inputs[0].features)

[Feature{name='x1', type=number, value=180.33694409031028}, Feature{name='x2', type=number, value=8511.929427809619}, Feature{name='x3', type=categorical, value=9c86c184-c012-40e6-9717-92fac3ded6b4}, Feature{name='x4', type=number, value=3}]


In [11]:
# Number of output entries: matches the number of input entries.
len(ds.outputs)

100

In [14]:
# Outputs of the first row: only `select` with the default output selection.
print(ds.outputs[0].outputs)

[Output{value=true, type=boolean, score=1.0, name='select'}]


We can specify the outputs explicitly, though:

In [15]:
# Rebuild the dataset with `x1` and `x4` named explicitly as outputs.
# `ds` is rebound here, replacing the default-output dataset created earlier.
ds = Dataset.from_df(df, outputs=['x1', 'x4'])

In [16]:
# Inputs are now the remaining columns: `x2`, `x3` and `select`.
print(ds.inputs[0].features)

[Feature{name='x2', type=number, value=8511.929427809619}, Feature{name='x3', type=categorical, value=9c86c184-c012-40e6-9717-92fac3ded6b4}, Feature{name='select', type=boolean, value=true}]


In [17]:
# Two outputs per row, one for each requested column: `x1` and `x4`.
print(ds.outputs[0].outputs)

[Output{value=180.33694409031028, type=number, score=1.0, name='x1'}, Output{value=3.0, type=number, score=1.0, name='x4'}]


# From NumPy arrays

In [18]:
# 100 x 5 matrix of uniform samples in [0, 1).
# Drawn from a seeded Generator (not the global, unseeded np.random state)
# so that re-running the notebook reproduces the same array.
a = np.random.default_rng(seed=0).random((100, 5))

In [22]:
# Show the first ten rows (all columns) of the array.
a[:10]

array([[0.81313543, 0.37821429, 0.4235171 , 0.78807159, 0.1527972 ],
       [0.41059505, 0.1648843 , 0.12184703, 0.71026358, 0.00553422],
       [0.10407838, 0.45220526, 0.59695725, 0.56772907, 0.47156101],
       [0.84125378, 0.72438703, 0.6229128 , 0.06957149, 0.03252913],
       [0.07878667, 0.84110998, 0.23392872, 0.2289452 , 0.12636247],
       [0.16572858, 0.2801995 , 0.80852809, 0.33663819, 0.54473071],
       [0.40406926, 0.11531692, 0.38698205, 0.98688443, 0.10406886],
       [0.71084015, 0.43907801, 0.80731234, 0.94003446, 0.04215634],
       [0.30937055, 0.63099723, 0.05406696, 0.54085895, 0.47094727],
       [0.30250937, 0.60704153, 0.21670629, 0.31568341, 0.65088001]])

In [24]:
# No `outputs` argument is passed, so the last column of `a` becomes the output.
# `ds` is rebound here, replacing the DataFrame-based dataset.
ds = Dataset.from_numpy(a)

In [25]:
# Number of input entries: one per array row.
len(ds.inputs)

100

Input and output features will be named automatically. As with dataframes, if no output is specified, the right-most column will be selected:

In [27]:
# The first four columns are exposed as auto-named features `input-0` .. `input-3`.
print(ds.inputs[0].features)

[Feature{name='input-0', type=number, value=0.8131354304497258}, Feature{name='input-1', type=number, value=0.378214286063826}, Feature{name='input-2', type=number, value=0.4235171002244317}, Feature{name='input-3', type=number, value=0.788071585544701}]


In [28]:
# The last column is exposed as the single auto-named output `output-0`.
print(ds.outputs[0].outputs)

[Output{value=0.1527972048889169, type=number, score=1.0, name='output-0'}]


To specify the output columns for a NumPy array, pass their column indices. The remaining columns become the inputs, and both inputs and outputs are renumbered from zero:

In [29]:
# Columns 0 and 2 become the outputs; columns 1, 3 and 4 remain as inputs.
ds = Dataset.from_numpy(a, outputs=[0, 2])

In [30]:
# The three remaining columns are renumbered as `input-0` .. `input-2`.
print(ds.inputs[0].features)

[Feature{name='input-0', type=number, value=0.378214286063826}, Feature{name='input-1', type=number, value=0.788071585544701}, Feature{name='input-2', type=number, value=0.1527972048889169}]


In [31]:
# Columns 0 and 2 are exposed as `output-0` and `output-1`.
print(ds.outputs[0].outputs)

[Output{value=0.8131354304497258, type=number, score=1.0, name='output-0'}, Output{value=0.4235171002244317, type=number, score=1.0, name='output-1'}]
