In [1]:
import numpy as np
import pandas as pd
import uuid

# From Pandas dataframes

In [2]:
# Fixed seed and row count so that Restart & Run All regenerates exactly the
# same synthetic table every time.
SEED = 42
N_ROWS = 100
rng = np.random.default_rng(SEED)

# Synthetic table with one column per kind of value used below:
# two floats, one string identifier, one integer and one boolean.
data = {
    'x1': rng.uniform(low=100, high=200, size=N_ROWS),
    'x2': rng.uniform(low=5000, high=10000, size=N_ROWS),
    # Version-4 UUIDs built from seeded bytes; uuid.uuid4() draws from OS
    # entropy and would produce different identifiers on every run.
    'x3': [str(uuid.UUID(bytes=rng.bytes(16), version=4)) for _ in range(N_ROWS)],
    'x4': rng.integers(low=0, high=42, size=N_ROWS),  # `high` is exclusive: 0..41
    'select': rng.choice(a=[False, True], size=N_ROWS),
}
df = pd.DataFrame(data=data)

In [3]:
# Preview the first five rows of the generated table.
df.head(n=5)

Unnamed: 0,x1,x2,x3,x4,select
0,199.691719,8423.634001,aa433471-f2f4-4210-beab-2a5fde4afc93,13,True
1,175.01551,5144.95498,8c8cfe67-7257-4f3d-9506-f4816c4e770e,41,True
2,181.826085,8659.891995,7bd8c06d-1064-4f13-b372-c65af8e30149,41,True
3,166.700817,7487.153923,2b0370a4-b41e-4b16-9c1f-a65f0a65464c,3,False
4,157.718382,5539.941095,6c245358-d261-4e15-8517-c3f09ecdac66,24,True


If no columns are specified as outputs, by default the right-most column (in this case, `select`) will be considered the output.

In [4]:
from trustyai.model import Dataset

# Wrap the dataframe in a TrustyAI Dataset. No `outputs` argument is passed,
# so the right-most column (`select`) is used as the output.
ds = Dataset.from_df(df)

In [5]:
# Count the inputs held by the dataset (one per row of `df`).
len(ds.inputs)

100

In [6]:
# Inspect the first input: its features are the non-output columns x1..x4.
print(ds.inputs[0].features)

[Feature{name='x1', type=number, value=199.69171905389746}, Feature{name='x2', type=number, value=8423.634000506001}, Feature{name='x3', type=categorical, value=aa433471-f2f4-4210-beab-2a5fde4afc93}, Feature{name='x4', type=number, value=13}]


In [7]:
# Count the outputs; there is one per input.
len(ds.outputs)

100

In [8]:
# Inspect the first output: a single value taken from the `select` column.
print(ds.outputs[0].outputs)

[Output{value=true, type=boolean, score=1.0, name='select'}]


We can specify the outputs explicitly, though:

In [9]:
# Rebuild the dataset, naming the output columns explicitly this time;
# the remaining columns become the inputs.
ds = Dataset.from_df(df, outputs=['x1', 'x4'])

In [10]:
# The first input now carries the remaining columns: x2, x3 and select.
print(ds.inputs[0].features)

[Feature{name='x2', type=number, value=8423.634000506001}, Feature{name='x3', type=categorical, value=aa433471-f2f4-4210-beab-2a5fde4afc93}, Feature{name='select', type=boolean, value=true}]


In [11]:
# The first output now holds two values, one for x1 and one for x4.
print(ds.outputs[0].outputs)

[Output{value=199.69171905389746, type=number, score=1.0, name='x1'}, Output{value=13.0, type=number, score=1.0, name='x4'}]


# From NumPy arrays

In [12]:
# 100x5 matrix of uniform samples in [0, 1), drawn from a seeded generator so
# the array (and every output derived from it) is identical on each run.
a = np.random.default_rng(seed=0).random((100, 5))

In [13]:
# Preview the first ten rows of the array (all columns).
a[:10]

array([[0.68468018, 0.60703705, 0.7619673 , 0.56649717, 0.72130264],
       [0.89003104, 0.84624854, 0.01211185, 0.271607  , 0.58048523],
       [0.32379929, 0.33718089, 0.41112117, 0.82583901, 0.65967431],
       [0.48934436, 0.28016091, 0.21326921, 0.66476719, 0.09109155],
       [0.02946721, 0.8990184 , 0.39921803, 0.7282812 , 0.18430694],
       [0.04172949, 0.39444183, 0.04355018, 0.88119431, 0.25622821],
       [0.76606467, 0.60040384, 0.07779518, 0.67480852, 0.7191328 ],
       [0.29319575, 0.84587892, 0.43774675, 0.23388257, 0.92403114],
       [0.96647025, 0.35831978, 0.8780017 , 0.18066205, 0.85001627],
       [0.93969352, 0.53751407, 0.35651024, 0.52651873, 0.53328941]])

In [14]:
# Build a Dataset straight from the array; with no `outputs` argument the
# right-most column becomes the output.
ds = Dataset.from_numpy(a)

In [15]:
# Count the inputs: one per row of the array.
len(ds.inputs)

100

Input and output features will be named automatically. As with dataframes, if no output is specified, the right-most column will be selected:

In [16]:
# Input features are given generated names (input-0, input-1, ...).
print(ds.inputs[0].features)

[Feature{name='input-0', type=number, value=0.6846801847772033}, Feature{name='input-1', type=number, value=0.6070370508287989}, Feature{name='input-2', type=number, value=0.7619673021775413}, Feature{name='input-3', type=number, value=0.5664971655888972}]


In [17]:
# The single output, given the generated name output-0, holds the last column's value.
print(ds.outputs[0].outputs)

[Output{value=0.7213026353546987, type=number, score=1.0, name='output-0'}]


To specify the output columns with NumPy arrays, we use the column indices:

In [18]:
# Select columns 0 and 2 (by index) as the outputs.
ds = Dataset.from_numpy(a, outputs=[0, 2])

In [19]:
# The remaining columns (1, 3 and 4) become the inputs and are renumbered
# input-0 .. input-2.
print(ds.inputs[0].features)

[Feature{name='input-0', type=number, value=0.6070370508287989}, Feature{name='input-1', type=number, value=0.5664971655888972}, Feature{name='input-2', type=number, value=0.7213026353546987}]


In [20]:
# Columns 0 and 2 appear as output-0 and output-1 respectively.
print(ds.outputs[0].outputs)

[Output{value=0.6846801847772033, type=number, score=1.0, name='output-0'}, Output{value=0.7619673021775413, type=number, score=1.0, name='output-1'}]
