In [1]:
import numpy as np
import pandas as pd
import uuid

# From Pandas dataframes

In [2]:
# Fixed seed so that "Restart Kernel -> Run All" regenerates the same synthetic data.
SEED = 42
# Number of rows in the example frame (shared by every column).
N_ROWS = 100
rng = np.random.default_rng(SEED)

data = {
    # Two continuous features on different scales.
    'x1': rng.uniform(low=100, high=200, size=N_ROWS),
    'x2': rng.uniform(low=5000, high=10000, size=N_ROWS),
    # String identifiers. uuid.uuid4() cannot be seeded, so the 16 random bytes
    # come from the seeded generator and are stamped as version-4 UUIDs.
    'x3': [str(uuid.UUID(bytes=rng.bytes(16), version=4)) for _ in range(N_ROWS)],
    # Integer feature drawn from [0, 42).
    'x4': rng.integers(low=0, high=42, size=N_ROWS),
    # Boolean column; it is the right-most column of the frame.
    'select': rng.choice(a=[False, True], size=N_ROWS)
}
df = pd.DataFrame(data=data)

In [3]:
# Preview only the first five rows instead of rendering the whole frame.
df.head(n=5)

Unnamed: 0,x1,x2,x3,x4,select
0,110.775961,8897.843314,769f28d7-df70-4907-8f49-f8a5efeadbb2,17,False
1,134.70697,5968.935158,4d22cc7d-db38-4e25-8be4-f18394fb3705,33,False
2,124.157443,6880.264047,8de2e39b-406c-48ce-a6cc-169ee8f73ef0,39,True
3,170.961902,8536.049163,5727068c-35c9-4cde-83f0-9b5f522373de,39,False
4,172.601273,7478.442753,76c9587b-e68f-45e8-a39d-ed9fab2b5609,11,False


If no columns are specified as outputs, then by default the right-most column (in this case, `select`) will be considered the output.

In [4]:
from trustyai.model import Dataset

# No `outputs` argument is passed, so the right-most column (`select`)
# is treated as the output and the remaining columns as inputs.
ds = Dataset.from_df(df)

In [5]:
# One input record per DataFrame row.
len(ds.inputs)

100

In [6]:
# Features of the first row: every column except the output column `select`.
print(ds.inputs[0].features)

[Feature{name='x1', type=number, value=110.77596090318106}, Feature{name='x2', type=number, value=8897.843313935722}, Feature{name='x3', type=categorical, value=769f28d7-df70-4907-8f49-f8a5efeadbb2}, Feature{name='x4', type=number, value=17}]


In [7]:
# Outputs line up with inputs: one output record per row as well.
len(ds.outputs)

100

In [8]:
# The first row's single output, taken from the `select` column.
print(ds.outputs[0].outputs)

[Output{value=false, type=boolean, score=1.0, name='select'}]


We can specify the outputs explicitly, though:

In [9]:
# Output columns are chosen by name; `x1` and `x4` become outputs and
# every other column (including `select`) stays an input.
ds = Dataset.from_df(df, outputs=['x1', 'x4'])

In [10]:
# Remaining input features for the first row: `x2`, `x3` and `select`.
print(ds.inputs[0].features)

[Feature{name='x2', type=number, value=8897.843313935722}, Feature{name='x3', type=categorical, value=769f28d7-df70-4907-8f49-f8a5efeadbb2}, Feature{name='select', type=boolean, value=false}]


In [11]:
# The first row now carries two outputs, `x1` and `x4`.
print(ds.outputs[0].outputs)

[Output{value=110.77596090318106, type=number, score=1.0, name='x1'}, Output{value=17.0, type=number, score=1.0, name='x4'}]


# From NumPy arrays

In [12]:
# Seeded generator so the array is identical on every "Restart Kernel -> Run All".
# Shape is (100, 5): 100 rows, five columns of uniform values in [0, 1).
a = np.random.default_rng(seed=0).random((100, 5))

In [13]:
# Peek at the first ten rows only, rather than dumping the whole array.
a[:10]

array([[0.4941837 , 0.54138924, 0.79744718, 0.75966474, 0.6663965 ],
       [0.61321356, 0.93126828, 0.33589273, 0.41798983, 0.77295471],
       [0.0741324 , 0.22773897, 0.8609951 , 0.14798847, 0.29693842],
       [0.51173195, 0.29453894, 0.06253411, 0.39614618, 0.11552075],
       [0.46702147, 0.11739192, 0.2025332 , 0.19551742, 0.86863743],
       [0.17245352, 0.17041101, 0.57627698, 0.68669061, 0.37271922],
       [0.33980101, 0.45839755, 0.32261154, 0.9704679 , 0.13569625],
       [0.19868834, 0.30223221, 0.40911422, 0.53045714, 0.14810708],
       [0.85424792, 0.53920094, 0.17639279, 0.0272658 , 0.59151213],
       [0.94555969, 0.67387324, 0.06552553, 0.02347923, 0.3843669 ]])

In [14]:
# No `outputs` argument is passed, so the last column of `a` becomes the output.
ds = Dataset.from_numpy(a)

In [15]:
# One input record per array row.
len(ds.inputs)

100

Input and output features will be named automatically. As with dataframes, if no output is specified, the right-most column will be selected:

In [16]:
# First row's inputs: the first four columns, auto-named `input-0` .. `input-3`.
print(ds.inputs[0].features)

[Feature{name='input-0', type=number, value=0.49418369920147376}, Feature{name='input-1', type=number, value=0.541389237195627}, Feature{name='input-2', type=number, value=0.79744718322189}, Feature{name='input-3', type=number, value=0.7596647445704917}]


In [17]:
# First row's single output: the last column's value, auto-named `output-0`.
print(ds.outputs[0].outputs)

[Output{value=0.6663965038979389, type=number, score=1.0, name='output-0'}]


To specify the output columns for NumPy arrays, we use the column indices:

In [18]:
# Output columns are chosen by integer column index (columns 0 and 2 here).
ds = Dataset.from_numpy(a, outputs=[0, 2])

In [19]:
# The remaining columns (1, 3 and 4) are renumbered from zero as
# `input-0` .. `input-2`, so feature names no longer match array column indices.
print(ds.inputs[0].features)

[Feature{name='input-0', type=number, value=0.541389237195627}, Feature{name='input-1', type=number, value=0.7596647445704917}, Feature{name='input-2', type=number, value=0.6663965038979389}]


In [20]:
# Columns 0 and 2 show up as `output-0` and `output-1` respectively.
print(ds.outputs[0].outputs)

[Output{value=0.49418369920147376, type=number, score=1.0, name='output-0'}, Output{value=0.79744718322189, type=number, score=1.0, name='output-1'}]
