In [1]:
import trustyai
trustyai.init()

import numpy as np
import pandas as pd
import uuid

# From Pandas dataframes

In [2]:
# Synthetic demo data: two float columns, one string (UUID) column,
# one integer column and one boolean column, all with N_ROWS entries.
N_ROWS = 100

# Keyword arguments are evaluated left to right, so the random draws happen
# in the same order as the column order of the resulting frame.
data = dict(
    x1=np.random.uniform(low=100, high=200, size=N_ROWS),
    x2=np.random.uniform(low=5000, high=10000, size=N_ROWS),
    x3=[str(uuid.uuid4()) for _ in range(N_ROWS)],
    x4=np.random.randint(low=0, high=42, size=N_ROWS),
    select=np.random.choice(a=[False, True], size=N_ROWS),
)
df = pd.DataFrame(data)

In [3]:
df.head()

Unnamed: 0,x1,x2,x3,x4,select
0,137.046187,8211.221755,94c165a3-2f96-4c0e-ba91-ff7389389cc6,8,True
1,128.00379,8740.583505,c6d48954-b812-4c66-8510-f369592d3c2d,19,False
2,133.66552,5012.144082,73fe8814-a486-46ba-a3fa-37545dbab13c,34,True
3,184.285242,9162.904404,302c1f95-bd2a-40ae-9484-8d5636657b48,10,True
4,198.094202,5211.204676,9df44360-f50b-4e4a-99dc-1fc2f52b590e,40,True


If no columns are specified as outputs, by default the right-most column (in this case, `select`) will be considered the output.

In [4]:
from trustyai.model import Dataset

# No `outputs` argument is passed, so the right-most column of `df`
# (`select`) is used as the output and the remaining columns as inputs.
ds = Dataset.from_df(df)

In [5]:
len(ds.inputs)

100

In [6]:
print(ds.inputs[0].features)

[Feature{name='x1', type=number, value=137.04618745682313}, Feature{name='x2', type=number, value=8211.221755064362}, Feature{name='x3', type=categorical, value=94c165a3-2f96-4c0e-ba91-ff7389389cc6}, Feature{name='x4', type=number, value=8}]


In [7]:
len(ds.outputs)

100

In [8]:
print(ds.outputs[0].outputs)

[Output{value=true, type=boolean, score=1.0, name='select'}]


We can specify the outputs explicitly, though:

In [9]:
# Rebinds `ds`: columns `x1` and `x4` are named explicitly as outputs;
# the other columns (`x2`, `x3`, `select`) are left as inputs.
ds = Dataset.from_df(df, outputs=['x1', 'x4'])

In [10]:
print(ds.inputs[0].features)

[Feature{name='x2', type=number, value=8211.221755064362}, Feature{name='x3', type=categorical, value=94c165a3-2f96-4c0e-ba91-ff7389389cc6}, Feature{name='select', type=boolean, value=true}]


In [11]:
print(ds.outputs[0].outputs)

[Output{value=137.04618745682313, type=number, score=1.0, name='x1'}, Output{value=8.0, type=number, score=1.0, name='x4'}]


# From NumPy arrays

In [12]:
# 100 x 5 matrix of floats drawn uniformly from [0, 1), used as raw array input.
a = np.random.random_sample(size=(100, 5))

In [13]:
a[0:10, :]

array([[0.06846266, 0.87747272, 0.57910691, 0.91964535, 0.99541014],
       [0.56332706, 0.9327922 , 0.61668083, 0.60503134, 0.0248979 ],
       [0.3340935 , 0.48836763, 0.23359505, 0.91591934, 0.16273877],
       [0.24497669, 0.56953452, 0.45697015, 0.47467802, 0.02795062],
       [0.40499701, 0.04143314, 0.47772322, 0.57541183, 0.9599715 ],
       [0.44671596, 0.89711565, 0.86155502, 0.88764994, 0.62095446],
       [0.30886789, 0.62280324, 0.76152708, 0.90988784, 0.61695801],
       [0.63194451, 0.32250385, 0.93648568, 0.32525244, 0.12135435],
       [0.85597653, 0.73393663, 0.92814985, 0.48883991, 0.92768902],
       [0.73404678, 0.27637148, 0.55047676, 0.79044853, 0.91104915]])

In [14]:
# Rebinds `ds` to a dataset built from the array `a`; no `outputs` argument
# is given here.
ds = Dataset.from_numpy(a)

In [15]:
len(ds.inputs)

100

Input and output features will be named automatically. As with dataframes, if no output is specified, the right-most column will be selected:

In [16]:
print(ds.inputs[0].features)

[Feature{name='input-0', type=number, value=0.0684626610994179}, Feature{name='input-1', type=number, value=0.8774727223062212}, Feature{name='input-2', type=number, value=0.5791069119842132}, Feature{name='input-3', type=number, value=0.919645347608763}]


In [17]:
print(ds.outputs[0].outputs)

[Output{value=0.9954101410470496, type=number, score=1.0, name='output-0'}]


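To specify the output columns with NumPy arrays, we use the column indices. The remaining columns become the inputs and are renumbered sequentially (`input-0`, `input-1`, …), so an input's name no longer matches its original column index: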

In [18]:
# Rebinds `ds`: columns 0 and 2 of `a` are selected as outputs by index.
ds = Dataset.from_numpy(a, outputs=[0, 2])

In [19]:
print(ds.inputs[0].features)

[Feature{name='input-0', type=number, value=0.8774727223062212}, Feature{name='input-1', type=number, value=0.919645347608763}, Feature{name='input-2', type=number, value=0.9954101410470496}]


In [20]:
print(ds.outputs[0].outputs)

[Output{value=0.0684626610994179, type=number, score=1.0, name='output-0'}, Output{value=0.5791069119842132, type=number, score=1.0, name='output-1'}]
