In [1]:
import trustyai

# Initialise TrustyAI before any of its classes are used. The message printed
# below ("JVM already started") indicates the call is JVM-related and that the
# JVM was already running when this cell was last executed.
trustyai.init()

JVM already started


In [2]:
import numpy as np
import pandas as pd
import uuid

# From Pandas dataframes

In [3]:
# Build a synthetic 100-row frame with a mix of column types:
#   x1, x2  -> floats drawn uniformly from [100, 200) and [5000, 10000)
#   x3      -> random UUID strings
#   x4      -> integers in [0, 42)
#   select  -> random booleans
# The draws are unseeded, so the values differ on every run.
data = dict(
    x1=np.random.uniform(100, 200, 100),
    x2=np.random.uniform(5000, 10000, 100),
    x3=[str(uuid.uuid4()) for _ in range(100)],
    x4=np.random.randint(0, 42, 100),
    select=np.random.choice([False, True], 100),
)
df = pd.DataFrame(data)

In [4]:
# Preview the first rows of the generated frame to check the column layout.
df.head()

Unnamed: 0,x1,x2,x3,x4,select
0,192.413302,8090.983306,75fb58d1-92bb-47e2-b8aa-b2d8ee743b5b,0,True
1,167.928437,6271.652751,d19338e3-d274-4e53-a1ea-aefc5bf476de,10,False
2,163.343285,8759.261635,3b65e7dc-4120-4946-b2aa-8bd4a1d7b7d4,7,True
3,199.613108,5665.047383,a84fdd81-4bae-487f-87e5-4014a4b9c22e,16,True
4,132.654515,9360.414304,34698397-8e53-4735-b30b-43ab6d7b62a5,19,False


If no columns are specified as outputs, by default the right-most column (in this case, `select`) will be considered the output.

In [5]:
from trustyai.model import Dataset

# No `outputs` argument is given, so the right-most column (`select`) is used
# as the output and the remaining columns become input features.
ds = Dataset.from_df(df)

In [6]:
# Number of input entries in the dataset; it matches the 100 rows of `df`.
len(ds.inputs)

100

In [7]:
# Features of the first entry: every column except the output (`select`),
# each shown with its name, type and value.
print(ds.inputs[0].features)

[Feature{name='x1', type=number, value=192.41330236121394}, Feature{name='x2', type=number, value=8090.983306133694}, Feature{name='x3', type=categorical, value=75fb58d1-92bb-47e2-b8aa-b2d8ee743b5b}, Feature{name='x4', type=number, value=0}]


In [8]:
# Number of output entries; the same count as the inputs.
len(ds.outputs)

100

In [9]:
# Outputs of the first entry: only `select`, the right-most column of `df`.
print(ds.outputs[0].outputs)

[Output{value=true, type=boolean, score=1.0, name='select'}]


We can specify the outputs explicitly, though:

In [10]:
# Explicitly mark `x1` and `x4` as outputs (selected by column name); the other
# columns (`x2`, `x3`, `select`) remain inputs. This rebinds `ds`.
ds = Dataset.from_df(df, outputs=['x1', 'x4'])

In [11]:
# Input features of the first entry no longer include `x1` and `x4`.
print(ds.inputs[0].features)

[Feature{name='x2', type=number, value=8090.983306133694}, Feature{name='x3', type=categorical, value=75fb58d1-92bb-47e2-b8aa-b2d8ee743b5b}, Feature{name='select', type=boolean, value=true}]


In [12]:
# Outputs of the first entry are now `x1` and `x4`, in the order requested.
print(ds.outputs[0].outputs)

[Output{value=192.41330236121394, type=number, score=1.0, name='x1'}, Output{value=0.0, type=number, score=1.0, name='x4'}]


# From NumPy arrays

In [13]:
# 100 x 5 matrix of uniform samples from [0, 1); unseeded, so values vary per run.
a = np.random.random_sample((100, 5))

In [14]:
# Peek at the first ten rows (all columns) of the array.
a[:10]

array([[0.04229334, 0.50552788, 0.29568041, 0.85032319, 0.57693649],
       [0.28946931, 0.34368104, 0.41614114, 0.33756295, 0.68570649],
       [0.21141559, 0.64277735, 0.12951354, 0.5342462 , 0.87732067],
       [0.97396276, 0.01426533, 0.23252593, 0.03908873, 0.97164902],
       [0.13274512, 0.97182715, 0.16416289, 0.22757015, 0.31894558],
       [0.95391329, 0.57900831, 0.9081446 , 0.11285292, 0.76943787],
       [0.49356812, 0.08099851, 0.5367956 , 0.82392353, 0.48304461],
       [0.58690005, 0.80255662, 0.49041435, 0.79749456, 0.21884047],
       [0.69975546, 0.28111782, 0.79717247, 0.10067143, 0.67130933],
       [0.3724151 , 0.33908526, 0.62070464, 0.41828346, 0.0625247 ]])

In [15]:
# No `outputs` argument is given: as with dataframes, the right-most column
# of the array is used as the output. This rebinds `ds`.
ds = Dataset.from_numpy(a)

In [16]:
# Number of input entries; it matches the 100 rows of `a`.
len(ds.inputs)

100

Input and output features will be named automatically. As with dataframes, if no output is specified, the right-most column will be selected:

In [17]:
# The first four columns become features auto-named `input-0` to `input-3`.
print(ds.inputs[0].features)

[Feature{name='input-0', type=number, value=0.04229333521445078}, Feature{name='input-1', type=number, value=0.5055278848740934}, Feature{name='input-2', type=number, value=0.29568041115831967}, Feature{name='input-3', type=number, value=0.8503231909426352}]


In [18]:
# The single output, auto-named `output-0`, holds the value from the last column.
print(ds.outputs[0].outputs)

[Output{value=0.5769364905115408, type=number, score=1.0, name='output-0'}]


To specify the output columns with NumPy arrays, we use the column indices. The remaining columns are kept as inputs and renumbered from `input-0`:

In [19]:
# With NumPy arrays, outputs are selected by column index: columns 0 and 2 here.
ds = Dataset.from_numpy(a, outputs=[0, 2])

In [20]:
# The remaining columns (1, 3 and 4) become the inputs, renumbered as
# `input-0` to `input-2`.
print(ds.inputs[0].features)

[Feature{name='input-0', type=number, value=0.5055278848740934}, Feature{name='input-1', type=number, value=0.8503231909426352}, Feature{name='input-2', type=number, value=0.5769364905115408}]


In [21]:
# Columns 0 and 2 are reported as `output-0` and `output-1` respectively.
print(ds.outputs[0].outputs)

[Output{value=0.04229333521445078, type=number, score=1.0, name='output-0'}, Output{value=0.29568041115831967, type=number, score=1.0, name='output-1'}]
