In [128]:
# Lookup table translating each species label into its one-hot vector
species_one_hot = {
    'Iris-setosa': [1, 0, 0],
    'Iris-versicolor': [0, 1, 0],
    'Iris-virginica': [0, 0, 1]
}

# Rows of the form [measurement_1, ..., measurement_n, one_hot_1, one_hot_2, one_hot_3]
processed_dataset = []

# Location of the raw comma-separated dataset
file_path = '../Datasets/iris/iris.data'

# Load the raw lines into memory
with open(file_path, 'r') as file:
    lines = list(file)

# Turn every raw line into a numeric row
for line in lines:
    # The last field is the species label; everything before it is a measurement
    *measurements, species = line.strip().split(',')

    # Parse the measurements before looking at the label, so a malformed number still raises
    numerical_values = list(map(float, measurements))

    # None when the label is not one of the known species
    species_encoding = species_one_hot.get(species)

    # Lines with an unknown label (this includes blank lines) are skipped
    if species_encoding is None:
        continue

    processed_dataset.append([*numerical_values, *species_encoding])

# Display the first few processed rows
processed_dataset[:10]

[[5.1, 3.5, 1.4, 0.2, 1, 0, 0],
 [4.9, 3.0, 1.4, 0.2, 1, 0, 0],
 [4.7, 3.2, 1.3, 0.2, 1, 0, 0],
 [4.6, 3.1, 1.5, 0.2, 1, 0, 0],
 [5.0, 3.6, 1.4, 0.2, 1, 0, 0],
 [5.4, 3.9, 1.7, 0.4, 1, 0, 0],
 [4.6, 3.4, 1.4, 0.3, 1, 0, 0],
 [5.0, 3.4, 1.5, 0.2, 1, 0, 0],
 [4.4, 2.9, 1.4, 0.2, 1, 0, 0],
 [4.9, 3.1, 1.5, 0.1, 1, 0, 0]]

In [16]:
# Data Preprocessing - Encoding Algorithm

# TODO: Make the index of the target variable configurable instead of requiring the last column.

# NOTE: The target variable (class) is assumed to be the last value of every row.

# Rows after type conversion, with the target replaced by its class number
processed_dataset = []

# Rows after every categorical value has been replaced by its numeric code
encoded_dataset = []

# Continuous (floating-point) features
# Key = column index of the feature
# Value = dictionary of the distinct values seen in that column (each value maps to itself)
continuous_typed_features = {}

# Integer features
# Key = column index of the feature
# Value = dictionary of the distinct values seen in that column (each value maps to itself)
integer_typed_features = {}

# Categorical features
# Key = column index of the feature
# Value = dictionary mapping each distinct value to its order of first appearance (1, 2, 3, ...)
categorical_features = {}

# Target variables (classes) of the dataset
# Key = target value
# Value = order of first appearance (1, 2, 3, ...)
targets = {}

# Encoding map of the categorical values
# Key = numeric code
# Value = categorical value
encoding_values = {}

# Location of the raw comma-separated dataset
file_path = 'test-data.csv'

# Load the raw lines into memory
with open(file_path, 'r') as file:
    lines = list(file)

# Check whether the value is numeric
def is_numeric(value) -> bool:
    """Report whether ``value`` can be converted to a number by ``float()``.

    Args:
        value: The object to test, typically a raw string field of the dataset.

    Returns:
        True if ``float(value)`` succeeds (integers and floating-point
        numbers alike), False if the conversion is rejected.
    """
    try:
        float(value)
    # Only conversion failures mean "not numeric"; anything else
    # (e.g. KeyboardInterrupt) must propagate instead of being swallowed
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Process each line in the dataset:
# convert every feature to int / float / str, record its distinct values per column,
# and replace the target (last field) by its class number
for line in lines:
    # Skip empty and whitespace-only lines
    if not line.strip():
        continue

    # Remove newline characters and split by commas
    values = line.strip().split(',')

    # Converted values of the current row
    instance = []

    # Every field except the last one is a feature
    for index, value in enumerate(values[:-1]):
        if is_numeric(value):
            # Prefer int; fall back to float for anything int() rejects.
            # This covers values with a decimal point as well as numeric strings
            # without one ('1e5', 'inf', 'nan'), which previously crashed int().
            try:
                number = int(value)
                registry = integer_typed_features
            except ValueError:
                number = float(value)
                registry = continuous_typed_features

            instance.append(number)

            # Remember the value under its column index (each value maps to itself)
            registry.setdefault(index, {}).setdefault(number, number)
        else:
            # Non-numeric values are kept as-is and encoded later
            instance.append(value)

            # Number the distinct values of this column by order of first appearance
            column_values = categorical_features.setdefault(index, {})
            column_values.setdefault(value, len(column_values) + 1)

    # The last field is the target: number the classes by order of first appearance
    target = values[-1]
    targets.setdefault(target, len(targets) + 1)

    # Append the class number of the target to the row
    instance.append(targets[target])

    # Add the row to the processed dataset
    processed_dataset.append(instance)

# Display the processed rows followed by every lookup table built so far
# Each section is a heading, one entry per line, and a trailing blank line
for heading, entries in (
    ('Processed dataset:', processed_dataset),
    ('Categorical features:', categorical_features.items()),
    ('Integer typed features:', integer_typed_features.items()),
    ('Continuous typed features:', continuous_typed_features.items()),
    ('Target variables:', targets.items()),
):
    print(heading)
    for entry in entries:
        print(entry)
    print()

# One-hot encode the target variable (class) of the dataset
def encode_target_variable(targets_count, target_value) -> list:
    """Build the one-hot vector of a class number.

    Examples:
        3 classes: 1 -> [1,0,0], 2 -> [0,1,0], 3 -> [0,0,1]
        2 classes: 1 -> [1,0],   2 -> [0,1]

    Args:
        targets_count: Number of classes (an integer), i.e. the length of the vector.
        target_value: 1-based class number of the instance.

    Returns:
        A list of ``targets_count`` zeros with a single 1 at position
        ``target_value``; all zeros if ``target_value`` is outside 1..targets_count.
    """
    return [
        1 if target_value == position else 0
        for position in range(1, targets_count + 1)
    ]

# Go through all columns of each row and store the encoding values
# Codes are handed out in order of first appearance while scanning column 0 from top
# to bottom, then column 1, and so on

# Values that already own a code (set membership instead of scanning a list every time)
already_encoded = set(encoding_values.values())

# Width of the widest row; 0 for an empty dataset, so nothing is scanned and nothing fails
column_count = max((len(record) for record in processed_dataset), default=0)

for col in range(column_count):
    for record in processed_dataset:
        # Shorter rows simply have no value in this column
        if col >= len(record):
            continue

        # Get the value
        value = record[col]

        # Only categorical (non-numeric) values that were not seen before get a new code
        if not is_numeric(value) and value not in already_encoded:
            already_encoded.add(value)
            # Assign the value to the incremented key value
            encoding_values[len(encoding_values) + 1] = value

print('Encoding map of categorical features:')
for i in encoding_values.items():
    print(i)
print()

# Replace the trailing class number of every row by its one-hot vector
class_count = len(targets)
for position, record in enumerate(processed_dataset):
    features, class_number = record[:-1], record[-1]
    processed_dataset[position] = features + encode_target_variable(class_count, class_number)

# Display the processed rows after target encoding
print('Processed dataset after target variable encoding:')
for record in processed_dataset:
    print(record)
print()

# Reverse lookup in a dictionary
def get_key_of_by_value(dict, value):
    """Return the first key of ``dict`` whose value equals ``value``.

    Keys are checked in the dictionary's iteration order.
    Returns None when no value matches.
    """
    matching_keys = (key for key, candidate in dict.items() if candidate == value)
    return next(matching_keys, None)

# Combine all variables according to their indexes:
# numeric features keep their value, categorical features are replaced by their code,
# and everything else (the one-hot target columns) is copied unchanged

# Reverse of the encoding map: categorical value -> code (the first code wins)
code_by_value = {}
for code, label in encoding_values.items():
    code_by_value.setdefault(label, code)

# Numeric registries in lookup priority order
numeric_registries = (integer_typed_features, continuous_typed_features)

for record in processed_dataset:
    # Encoded values of the current row
    instance = []

    for col, cell in enumerate(record):
        # Fallback: keep the cell unchanged. This covers the target columns and, for
        # columns holding mixed types, cells unknown to every registry. Such cells used
        # to be dropped silently, which shifted the rest of the row.
        encoded = cell

        for registry in numeric_registries:
            known_values = registry.get(col)
            if known_values is not None and cell in known_values:
                encoded = known_values[cell]
                break
        else:
            # Not a known numeric value of this column: try the categorical values
            known_labels = categorical_features.get(col)
            if known_labels is not None and cell in known_labels:
                # None if the value has no code in the encoding map
                encoded = code_by_value.get(cell)

        instance.append(encoded)

    # Add the instance to new dataset list
    encoded_dataset.append(instance)

# Display the encoded rows
print('Encoded dataset:')
for i in encoded_dataset:
    print(i)
print()

Processed dataset:
['black', 4.7, 170, 'yes', 2, 'android', 1]
['ash grey', 7.2, 195, 'no', 4, 'android', 2]
['blue', 6.8, 205, 'no', 3, 'ios', 3]
['white', 5.7, 200, 'no', 3, 'android', 1]
['ash grey', 5.2, 295, 'no', 5, 'android', 2]
['red', 8.8, 245, 'yes', 2, 'ios', 3]

Categorical features:
(0, {'black': 1, 'ash grey': 2, 'blue': 3, 'white': 4, 'red': 5})
(3, {'yes': 1, 'no': 2})
(5, {'android': 1, 'ios': 2})

Integer typed features:
(2, {170: 170, 195: 195, 205: 205, 200: 200, 295: 295, 245: 245})
(4, {2: 2, 4: 4, 3: 3, 5: 5})

Continuous typed features:
(1, {4.7: 4.7, 7.2: 7.2, 6.8: 6.8, 5.7: 5.7, 5.2: 5.2, 8.8: 8.8})

Target variables:
('samsung', 1)
('huawei', 2)
('apple', 3)

Encoding map of categorical features:
(1, 'black')
(2, 'ash grey')
(3, 'blue')
(4, 'white')
(5, 'red')
(6, 'yes')
(7, 'no')
(8, 'android')
(9, 'ios')

Processed dataset after target variable encoding:
['black', 4.7, 170, 'yes', 2, 'android', 1, 0, 0]
['ash grey', 7.2, 195, 'no', 4, 'android', 0, 1, 0]
['

In [None]:
# targets
('samsung', 1)
('huawei', 2)
('apple', 3)

# categorical
(0, {'black': 1, 'ash grey': 2, 'blue': 3, 'white': 4, 'red': 5})
(3, {'yes': 1, 'no': 2})
(5, {'android': 1, 'ios': 2})

# integer
(2, {170: 170, 195: 195, 205: 205, 200: 200, 295: 295, 245: 245})
(4, {2: 2, 4: 4, 3: 3, 5: 5})

# continuous
(1, {4.7: 4.7, 7.2: 7.2, 6.8: 6.8, 5.7: 5.7, 5.2: 5.2, 8.8: 8.8})

# encoding map of categorical features
(1, 'black')
(2, 'ash grey')
(3, 'blue')
(4, 'white')
(5, 'red')
(6, 'yes')
(7, 'no')
(8, 'android')
(9, 'ios')

# processed dataset
['black', 4.7, 170, 'yes', 2, 'android', 1]
['ash grey', 7.2, 195, 'no', 4, 'android', 2]
['blue', 6.8, 205, 'no', 3, 'ios', 3]
['white', 5.7, 200, 'no', 3, 'android', 1]
['ash grey', 5.2, 295, 'no', 5, 'android', 2]
['red', 8.8, 245, 'yes', 2, 'ios', 3]

# processed dataset after target encoding
['black', 4.7, 170, 'yes', 2, 'android', 1, 0, 0]
['ash grey', 7.2, 195, 'no', 4, 'android', 0, 1, 0]
['blue', 6.8, 205, 'no', 3, 'ios', 0, 0, 1]
['white', 5.7, 200, 'no', 3, 'android', 1, 0, 0]
['ash grey', 5.2, 295, 'no', 5, 'android', 0, 1, 0]
['red', 8.8, 245, 'yes', 2, 'ios', 0, 0, 1]

# encoded dataset
[1, 4.7, 170, 6, 2, 8, 1, 0, 0]
[2, 7.2, 195, 7, 4, 8, 0, 1, 0]
[3, 6.8, 205, 7, 3, 9, 0, 0, 1]
[4, 5.7, 200, 7, 3, 8, 1, 0, 0]
[2, 5.2, 295, 7, 5, 8, 0, 1, 0]
[5, 8.8, 245, 6, 2, 9, 0, 0, 1]