• 2.1) Add a method to the Dataset class that removes
all samples containing at least one null value (NaN).

In [3]:
import os

import numpy as np
import pandas as pd

class Dataset:
    """Feature matrix with an optional label vector and basic cleaning helpers.

    Attributes
    ----------
    X : numpy.ndarray
        2-D array (samples in rows, features in columns) holding the
        independent variables.
    y : numpy.ndarray or None
        One label per row of ``X``, or ``None`` when the data is unlabeled.

    Every cleaning method rebinds ``self.X`` / ``self.y`` to freshly built
    arrays and returns ``self`` so calls can be chained. The arrays handed to
    the constructor are never written to.
    """

    def __init__(self, X, y=None):
        """Store the feature matrix ``X`` and, optionally, the labels ``y``."""
        self.X = X  # Attribute that stores the independent variables
        self.y = y  # Attribute that stores the corresponding labels (may be None)

    def dropna(self):
        """Remove every sample (row) that contains at least one NaN.

        The matching entries of ``y`` are removed as well when labels exist.

        Returns
        -------
        Dataset
            ``self``, to allow method chaining.
        """
        # True for the rows in which no feature is NaN
        keep_rows = ~np.isnan(self.X).any(axis=1)

        # Boolean indexing builds new arrays, so the originals stay untouched
        self.X = self.X[keep_rows]
        if self.y is not None:
            self.y = self.y[keep_rows]

        return self

    def fillna(self, value):
        """Replace the NaN entries of each feature (column).

        Parameters
        ----------
        value : float or str
            ``"mean"`` or ``"median"`` fills each column with that statistic
            computed over the column's non-NaN entries; any other value is
            used directly as the replacement.

        Returns
        -------
        Dataset
            ``self``, to allow method chaining.
        """
        nan_mask = np.isnan(self.X)
        if not nan_mask.any():
            # Nothing to fill: leave X exactly as it is
            return self

        # Fill a copy instead of writing into self.X: that array belongs to the
        # caller and may be read-only (e.g. a view of a pandas DataFrame).
        filled = self.X.copy()

        # Only the columns that actually contain NaN need a fill value
        for col in np.flatnonzero(nan_mask.any(axis=0)):
            column = self.X[:, col]
            if isinstance(value, str) and value == "mean":
                fill_value = np.nanmean(column)
            elif isinstance(value, str) and value == "median":
                fill_value = np.nanmedian(column)
            else:
                fill_value = value

            filled[:, col] = np.where(nan_mask[:, col], fill_value, column)

        self.X = filled
        return self

    def remove_by_index(self, index):
        """Remove the sample at position ``index`` from ``X`` (and from ``y``).

        Parameters
        ----------
        index : int
            Zero-based row position; negative positions are rejected.

        Returns
        -------
        Dataset
            ``self``, to allow method chaining.

        Raises
        ------
        IndexError
            If ``index`` is negative or not smaller than the number of samples.
        """
        n_samples = self.X.shape[0]
        if index < 0 or index >= n_samples:
            raise IndexError("Index out of bounds")

        # Mask that keeps every row except the one being removed
        keep_rows = np.ones(n_samples, dtype=bool)
        keep_rows[index] = False

        self.X = self.X[keep_rows]
        if self.y is not None:
            self.y = self.y[keep_rows]

        return self

In [4]:
# Location of the iris CSV. Set the IRIS_CSV environment variable to run this
# notebook on another machine; the fallback is the original local path.
path = os.environ.get(
    "IRIS_CSV",
    r"C:\Users\tiago\OneDrive\Documentos\GitHub\si\datasets\iris\iris.csv",
)
iris_df = pd.read_csv(path)

# Convert independent variables (X) and label vector (y)
X = iris_df.iloc[:, :-1].to_numpy()  # All columns except the last (independent variables)
y = iris_df.iloc[:, -1].to_numpy()   # The last column contains the classes (labels)

# Creating the Dataset object
dataset = Dataset(X, y)

# Removing samples with NaN (dropna returns the same object, so the result is
# read back from `dataset` below)
dataset.dropna()

print("X after dropna:", dataset.X)
print("y after dropna:", dataset.y)

X after dropna: [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.1 1.5 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1

• 2.2) Add a method to the Dataset class that
replaces all null values (NaN) with a given value, or with
the mean or median of the corresponding feature/variable.

In [5]:
# Convert independent variables (X) and label vector (y).
# copy=True is required here: without it to_numpy() may hand back a view of
# iris_df's own data, so the NaN writes below would either corrupt iris_df for
# every later cell or fail on a read-only array (pandas Copy-on-Write).
X = iris_df.iloc[:, :-1].to_numpy(copy=True)  # All columns except the last (independent variables)
y = iris_df.iloc[:, -1].to_numpy()            # The last column contains the classes (labels)

# Adding NaN values for the example
X[0, 0] = np.nan  # Introducing a NaN for testing
X[2, 1] = np.nan  # Introducing another NaN for testing

# Creating the Dataset object
dataset = Dataset(X, y)

# Filling NaNs with the mean of the columns
dataset.fillna("mean")

print("X after fillna with mean:", dataset.X)

X after fillna with mean: [[5.84832215 3.5        1.4        0.2       ]
 [4.9        3.         1.4        0.2       ]
 [4.7        3.05302013 1.3        0.2       ]
 [4.6        3.1        1.5        0.2       ]
 [5.         3.6        1.4        0.2       ]
 [5.4        3.9        1.7        0.4       ]
 [4.6        3.4        1.4        0.3       ]
 [5.         3.4        1.5        0.2       ]
 [4.4        2.9        1.4        0.2       ]
 [4.9        3.1        1.5        0.1       ]
 [5.4        3.7        1.5        0.2       ]
 [4.8        3.4        1.6        0.2       ]
 [4.8        3.         1.4        0.1       ]
 [4.3        3.         1.1        0.1       ]
 [5.8        4.         1.2        0.2       ]
 [5.7        4.4        1.5        0.4       ]
 [5.4        3.9        1.3        0.4       ]
 [5.1        3.5        1.4        0.3       ]
 [5.7        3.8        1.7        0.3       ]
 [5.1        3.8        1.5        0.3       ]
 [5.4        3.4        1.7       

• 2.3) Add a method to the Dataset class that
removes a sample by its index. 

In [6]:
# Remove the sample at index 5 (for example) from the dataset filled in the previous cell.
# NOTE: remove_by_index rebinds dataset.X / dataset.y and returns the same object,
# so re-running this cell removes one more sample each time.
dataset.remove_by_index(5)

# Checking the result: the row that used to sit at position 5 should be gone
print("X after removing index 5:", dataset.X)
print("y after removing index 5:", dataset.y)

X after removing index 5: [[5.84832215 3.5        1.4        0.2       ]
 [4.9        3.         1.4        0.2       ]
 [4.7        3.05302013 1.3        0.2       ]
 [4.6        3.1        1.5        0.2       ]
 [5.         3.6        1.4        0.2       ]
 [4.6        3.4        1.4        0.3       ]
 [5.         3.4        1.5        0.2       ]
 [4.4        2.9        1.4        0.2       ]
 [4.9        3.1        1.5        0.1       ]
 [5.4        3.7        1.5        0.2       ]
 [4.8        3.4        1.6        0.2       ]
 [4.8        3.         1.4        0.1       ]
 [4.3        3.         1.1        0.1       ]
 [5.8        4.         1.2        0.2       ]
 [5.7        4.4        1.5        0.4       ]
 [5.4        3.9        1.3        0.4       ]
 [5.1        3.5        1.4        0.3       ]
 [5.7        3.8        1.7        0.3       ]
 [5.1        3.8        1.5        0.3       ]
 [5.4        3.4        1.7        0.2       ]
 [5.1        3.7        1.5       