# Exercise 1

In [1]:
import numpy as np
from si.io.csv_file import read_csv
from si.data.dataset import Dataset

1.1) In this exercise, we will use the iris dataset. Load
the "iris.csv" using the appropriate method for this file
type (use the new functions from the package).

In [2]:
# Location of the iris CSV, relative to the directory the notebook runs from.
path = "../datasets/iris/iris.csv"
# Read the file with both the `features` and `label` options switched on;
# later cells access the result through `dataset.X` and `dataset.y`.
dataset = read_csv(
    filename=path,
    features=True,
    label=True,
)

1.2) Select the penultimate independent variable.
What is the dimension of the resulting array?

In [3]:
# Index -2 on the second axis picks the second-to-last column of X
# for every row.
pen_ind_var = dataset.X[:, -2]
# Display the dimensions of the selected column as the cell result.
np.shape(pen_ind_var)

(150,)

1.3) Select the last 10 samples from the iris dataset.
What is the mean of the last 10 samples for each
independent variable/feature?

In [4]:
# Negative slicing on the first axis keeps the final ten rows (all columns).
last_10_samples = dataset.X[-10:]
# Averaging along axis 0 collapses the rows, leaving one mean per column.
mean = last_10_samples.mean(axis=0)
print(mean)

[6.45 3.03 5.33 2.17]


1.4) Select all samples from the dataset with values
less than or equal to 6 for all independent
variables/features. How many samples do you obtain?

In [5]:
# Compare every entry of X against 6, then require the comparison to hold
# across a whole row (axis=1) for that row to be selected.
mask = (dataset.X <= 6).all(axis=1)
# Boolean indexing keeps only the rows flagged True in the mask.
filtered_samples = dataset.X[mask]
# Number of rows that survived the filter.
filtered_samples.shape[0]

89

1.5) Select all samples with a class/label different
from 'Iris-setosa'. How many samples do you obtain?

In [6]:
# Flag the samples labelled 'Iris-setosa' and invert the flags, so the mask
# is True exactly where the label is something else.
mask = ~(dataset.y == 'Iris-setosa')
# Use the label-derived mask to select the matching rows of X.
filtered_samples = dataset.X[mask]
# Number of rows whose label differs from 'Iris-setosa'.
filtered_samples.shape[0]

100

## Optional

### Exercise 2 methods usage examples

#### Method dropna

Remove all samples containing at least one null value (NaN).

In [7]:
# Read the iris CSV again so the dropna example works on a freshly
# loaded `dataset`.
path = "../datasets/iris/iris.csv"
dataset = read_csv(
    filename=path,
    features=True,
    label=True,
)

In [8]:
# Overwrite the first entry of the first row with NaN to create one
# missing value.
dataset.X[0, 0] = np.nan
# Show the first three rows of X, then the shapes of X and y,
# one item per line.
print(dataset.X[:3], dataset.X.shape, dataset.y.shape, sep="\n")

[[nan 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]]
(150, 4)
(150,)


In [9]:
# Call dropna through the class, passing the dataset explicitly, and rebind
# `dataset` to whatever it returns.
# NOTE(review): if dropna is a regular instance method, `dataset.dropna()`
# would be the conventional spelling — confirm against the Dataset class.
dataset = Dataset.dropna(dataset)
# Print the first three rows of X and the shapes of X and y after the call.
for item in (dataset.X[:3], dataset.X.shape, dataset.y.shape):
    print(item)

[[4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]]
(149, 4)
(149,)


#### Method fillna

Replace all null values with a constant value (here, 5.0).

In [10]:
# Read the iris CSV again so the constant-fill example works on a freshly
# loaded `dataset`.
path = "../datasets/iris/iris.csv"
dataset = read_csv(
    filename=path,
    features=True,
    label=True,
)

In [11]:
# Put a NaN in row 0, column 0 so there is a missing value to fill.
dataset.X[0, 0] = np.nan
# Print the first three rows of X (all columns).
print(dataset.X[:3, :])

[[nan 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]]


In [12]:
# Call fillna through the class with the constant 5.0. The result is not
# assigned, so the print below relies on fillna having updated `dataset`
# itself.
Dataset.fillna(dataset, 5.0)
# Print the first three rows of X after the call.
print(dataset.X[:3, :])

[[5.  3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]]


Replace all null values with the mean of the corresponding feature.

In [13]:
# Read the iris CSV again so the mean-fill example works on a freshly
# loaded `dataset`.
path = "../datasets/iris/iris.csv"
dataset = read_csv(
    filename=path,
    features=True,
    label=True,
)

In [14]:
# Put a NaN in row 0, column 0 so there is a missing value to fill.
dataset.X[0, 0] = np.nan
# Print the first three rows of X (all columns).
print(dataset.X[:3, :])

[[nan 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]]


In [15]:
# Call fillna through the class with the 'mean' strategy; the result is not
# assigned, so the prints below read the updated `dataset` directly.
Dataset.fillna(dataset, 'mean')
# First line: NaN-ignoring mean of column 0, computed after the fill.
# Following lines: the first three rows of X, for comparison with that value.
print(np.nanmean(dataset.X[:, 0]), dataset.X[:3], sep="\n")

5.8483221476510066
[[5.84832215 3.5        1.4        0.2       ]
 [4.9        3.         1.4        0.2       ]
 [4.7        3.2        1.3        0.2       ]]


Replace all null values with the median of the corresponding feature.

In [16]:
# Read the iris CSV again so the median-fill example works on a freshly
# loaded `dataset`.
path = "../datasets/iris/iris.csv"
dataset = read_csv(
    filename=path,
    features=True,
    label=True,
)

In [17]:
# Put a NaN in row 0, column 0 so there is a missing value to fill.
dataset.X[0, 0] = np.nan
# Print the first three rows of X (all columns).
print(dataset.X[:3, :])

[[nan 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]]


In [18]:
# Call fillna through the class with the 'median' strategy; the result is
# not assigned, so the prints below read the updated `dataset` directly.
Dataset.fillna(dataset, 'median')
# First line: NaN-ignoring median of column 0, computed after the fill.
# Following lines: the first three rows of X, for comparison with that value.
print(np.nanmedian(dataset.X[:, 0]), dataset.X[:3], sep="\n")

5.8
[[5.8 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]]


#### Method remove_by_index

Remove the first sample (index 0) from the dataset.

In [19]:
# Read the iris CSV again so the remove_by_index example works on a freshly
# loaded `dataset`.
path = "../datasets/iris/iris.csv"
dataset = read_csv(
    filename=path,
    features=True,
    label=True,
)

In [20]:
# Baseline before removal: the first three rows of X, then the shapes of
# X and y, one item per line.
for item in (dataset.X[:3], dataset.X.shape, dataset.y.shape):
    print(item)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]]
(150, 4)
(150,)


In [21]:
# Call remove_by_index through the class with index 0. The result is not
# assigned, so the prints below rely on `dataset` itself having been updated.
Dataset.remove_by_index(dataset, 0)
# Same three items as the cell before the removal, for a side-by-side
# comparison: first three rows of X, shape of X, shape of y.
print(dataset.X[:3], dataset.X.shape, dataset.y.shape, sep="\n")

[[4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]]
(149, 4)
(149,)
