In [128]:
# Lookup table: species label -> one-hot vector
species_one_hot = {
    'Iris-setosa': [1, 0, 0],
    'Iris-versicolor': [0, 1, 0],
    'Iris-virginica': [0, 0, 1],
}

# Rows of the processed dataset are collected here
processed_dataset = []

# Location of the dataset file
file_path = '../Datasets/iris/iris.data'

# Load every line of the file
with open(file_path, 'r') as dataset_file:
    lines = dataset_file.readlines()

# Turn each raw line into "numeric features + one-hot species"
for raw_line in lines:
    # Drop the newline, split on commas; the last field is the species label
    *feature_fields, label = raw_line.strip().split(',')

    # Numeric features are converted before the label is looked up
    features = list(map(float, feature_fields))

    # Lines whose label is not in the table (e.g. blank lines) are skipped
    one_hot = species_one_hot.get(label)
    if one_hot is None:
        continue

    processed_dataset.append(features + one_hot)

# Display the first few processed rows
processed_dataset[:10]

[[5.1, 3.5, 1.4, 0.2, 1, 0, 0],
 [4.9, 3.0, 1.4, 0.2, 1, 0, 0],
 [4.7, 3.2, 1.3, 0.2, 1, 0, 0],
 [4.6, 3.1, 1.5, 0.2, 1, 0, 0],
 [5.0, 3.6, 1.4, 0.2, 1, 0, 0],
 [5.4, 3.9, 1.7, 0.4, 1, 0, 0],
 [4.6, 3.4, 1.4, 0.3, 1, 0, 0],
 [5.0, 3.4, 1.5, 0.2, 1, 0, 0],
 [4.4, 2.9, 1.4, 0.2, 1, 0, 0],
 [4.9, 3.1, 1.5, 0.1, 1, 0, 0]]

In [126]:
# Data Preprocessing - Encoding Algorithm
#
# TODO 1: Combine all variables according to their indexes.
# TODO 2: Test on the iris dataset, afterwards on the test data (file path: test-data.txt).
# TODO 3: Make the target variable index configurable instead of constrained to the end.
#
# NOTE: The target variable (class) is assumed to be the last value of every row.

# Rows of the processed dataset
processed_dataset = []

# Continuous (floating-point) feature values
#   key   = feature value
#   value = column index at which that value was most recently stored
continuous_typed_features = {}

# Integer feature values
#   key   = feature value
#   value = column index at which that value was most recently stored
integer_typed_features = {}

# Categorical feature values, grouped per column
#   key   = column index where the feature resides
#   value = {feature value: 1-based order of first occurrence in that column}
categorical_features = {}

# Target values (classes) of the dataset
#   key   = target value
#   value = 1-based order of first occurrence (1, 2, 3, ...)
targets = {}

# Location of the dataset file
file_path = '../Datasets/iris/iris.data'

# Load every line of the file (line endings are kept)
with open(file_path, 'r') as dataset_file:
    lines = list(dataset_file)

# Check whether the value is numeric
def is_numeric(value) -> bool:
    """Return True if ``value`` can be converted with ``float()``.

    Args:
        value: Any object; typically a raw string field or an already
            parsed int/float.

    Returns:
        True when ``float(value)`` succeeds, False when the conversion is
        rejected (non-numeric text, ``None``, unsupported types, or an
        integer too large for a float).
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not numeric". KeyboardInterrupt,
        # SystemExit and unrelated errors are deliberately not swallowed.
        return False
    return True

# Process each line in the dataset
for line in lines:
    # Skip blank lines. strip() is used instead of isspace() because
    # isspace() is False for an empty string.
    if not line.strip():
        continue

    # Remove newline characters and split by commas
    values = line.strip().split(',')

    # The row is built in the ORIGINAL column order, so position `index` in
    # the row always matches the keys of `categorical_features`. Grouping
    # values by type would shift columns and break the per-column lookup
    # done by the encoding pass that follows this loop.
    encoded_row = []

    # Every field except the last one is a feature
    for index, value in enumerate(values[:-1]):
        if is_numeric(value):
            # Prefer int; anything int() rejects but float() accepts
            # ("5.1", "1e5", "nan", ...) is treated as a continuous value.
            try:
                number = int(value)
            except ValueError:
                number = float(value)
                continuous_typed_features[number] = index # ? Is it essential to store numeric numbers in a dictionary ?
            else:
                integer_typed_features[number] = index # ? Is it essential to store numeric numbers in a dictionary ?
            encoded_row.append(number)
        else:
            # Register the categorical value for future encoding: each new
            # value of a column receives the next 1-based code.
            column_codes = categorical_features.setdefault(index, {})
            if value not in column_codes:
                column_codes[value] = len(column_codes) + 1
            # The raw string is kept in the row for now; it is replaced by
            # its code in the encoding pass after this loop.
            encoded_row.append(value)

    # Process the target value for future encoding:
    # new target values receive the next 1-based code
    target_label = values[-1]
    if target_label not in targets:
        targets[target_label] = len(targets) + 1

    # ! The target code must be the last element of the row !
    encoded_row.append(targets[target_label])

    processed_dataset.append(encoded_row)

# One-hot encoder for the target variable (class) of the dataset
def encode_target_variable(targets_count, target_value) -> list:
    """Build the one-hot ("binary logic map") list for a target code.

    Examples for 3 classes:        Examples for 2 classes:
        target_value=1 -> [1,0,0]      target_value=1 -> [1,0]
        target_value=2 -> [0,1,0]      target_value=2 -> [0,1]
        target_value=3 -> [0,0,1]

    Args:
        targets_count: Number of distinct classes; length of the result.
        target_value: 1-based code of the class to mark with a 1.

    Returns:
        A list of ``targets_count`` zeros/ones with a 1 at position
        ``target_value`` (all zeros if no position matches).
    """
    return [
        1 if target_value == position else 0
        for position in range(1, targets_count + 1)
    ]

# Encode the categorical features
for record in processed_dataset:
    # Visit every column except the trailing target value
    for col, cell in enumerate(record[:-1]):
        if is_numeric(cell):
            continue
        # Replace the raw category with the code registered for this column
        record[col] = categorical_features[col][cell]

# Encode the target values according to the binary logic map
class_count = len(targets)
for position, record in enumerate(processed_dataset):
    one_hot_target = encode_target_variable(class_count, record[-1])
    processed_dataset[position] = record[:-1] + one_hot_target

# ! Combine all variables according to their indexes !

# Display the processed rows
processed_dataset[:]

[[5.1, 3.5, 1.4, 0.2, 1, 0, 0],
 [4.9, 3.0, 1.4, 0.2, 1, 0, 0],
 [4.7, 3.2, 1.3, 0.2, 1, 0, 0],
 [4.6, 3.1, 1.5, 0.2, 1, 0, 0],
 [5.0, 3.6, 1.4, 0.2, 1, 0, 0],
 [5.4, 3.9, 1.7, 0.4, 1, 0, 0],
 [4.6, 3.4, 1.4, 0.3, 1, 0, 0],
 [5.0, 3.4, 1.5, 0.2, 1, 0, 0],
 [4.4, 2.9, 1.4, 0.2, 1, 0, 0],
 [4.9, 3.1, 1.5, 0.1, 1, 0, 0],
 [5.4, 3.7, 1.5, 0.2, 1, 0, 0],
 [4.8, 3.4, 1.6, 0.2, 1, 0, 0],
 [4.8, 3.0, 1.4, 0.1, 1, 0, 0],
 [4.3, 3.0, 1.1, 0.1, 1, 0, 0],
 [5.8, 4.0, 1.2, 0.2, 1, 0, 0],
 [5.7, 4.4, 1.5, 0.4, 1, 0, 0],
 [5.4, 3.9, 1.3, 0.4, 1, 0, 0],
 [5.1, 3.5, 1.4, 0.3, 1, 0, 0],
 [5.7, 3.8, 1.7, 0.3, 1, 0, 0],
 [5.1, 3.8, 1.5, 0.3, 1, 0, 0],
 [5.4, 3.4, 1.7, 0.2, 1, 0, 0],
 [5.1, 3.7, 1.5, 0.4, 1, 0, 0],
 [4.6, 3.6, 1.0, 0.2, 1, 0, 0],
 [5.1, 3.3, 1.7, 0.5, 1, 0, 0],
 [4.8, 3.4, 1.9, 0.2, 1, 0, 0],
 [5.0, 3.0, 1.6, 0.2, 1, 0, 0],
 [5.0, 3.4, 1.6, 0.4, 1, 0, 0],
 [5.2, 3.5, 1.5, 0.2, 1, 0, 0],
 [5.2, 3.4, 1.4, 0.2, 1, 0, 0],
 [4.7, 3.2, 1.6, 0.2, 1, 0, 0],
 [4.8, 3.1, 1.6, 0.2, 1, 0, 0],
 [5.4, 3

In [121]:
# Nested mapping: integer keys on the outside, label -> number dicts inside
x = {
  1: {
    'category 1 - feature sample 1': 1,
    'category 1 - feature sample 2': 2,
  },
  3: {
    'category 2 - feature sample 1': 1,
    'category 2 - feature sample 2': 2,
    'category 2 - feature sample 3': 3,
  },
}

# Iterating a dict directly yields its keys
for label in x[1]:
  print(label)

# An empty dict has zero keys
y = {}

print(len(y))

category 1 - feature sample 1
category 1 - feature sample 2
0
