In [33]:
# Synthetic Minority Over-sampling TEchnique - SMOTE Algorithm

# The SMOTE algorithm can be described as follows:
# 1. Making random choices
#   - Take difference between a sample (could be chosen randomly) and its k nearest neighbour,
#     where k is user-defined number.
#   - Choose randomly one sample among the k nearest neighbors.
#   - Multiply the difference by a random number between 0 and 1.
#   - Add this difference to the sample to generate a new synthetic example in feature space.
#   - Continue on with next nearest neighbour up to user-defined number, basically,
#     up to the number of new samples to be generated.
#
# 2. Taking the nearest neighbor
#   - Take the difference between the feature vector (sample) under consideration and 
#     its nearest neighbor.
#   - Multiply this difference by a random number between 0 and 1.
#   - Add it to the feature vector under consideration.
#   - Perform these steps until the required new samples are generated.
#     (The formulas to find the required samples are given below.)

# The formulas used in my modified SMOTE algorithm are as follows:
# Imbalance rate = (Minority class samples / Majority class samples) * 100
# Percentage value = [(Majority class samples / Minority class samples) - 1] * 100
# New samples to be generated = [(Percentage value) * |Minority class samples|] / 100

# ? How does my algorithm work?
# - Calculate the minimum and maximum values of each attribute (column) of the minority class.
# - Attribute of the new synthetic example = random(0-1) * (max - min) + min
# - Repeat this until the required number of samples has been generated.

# ! Caution: The dataset is assumed to have no missing values and to consist of numeric values only,
# !          and the imbalance is evaluated with respect to a single minority class.

from random import random

# Rows of the dataset after type casting
processed_dataset = []

# Final dataset produced by the over-sampling step
over_sampled_dataset = []

# Rows exactly as split from the file (every field still a string)
raw_dataset = []

# Column names taken from the header line of the file
labels = []

# Number of samples per class: {class label: count}
class_count = {}

# Samples that belong to the minority class
minority_class = []

# Samples that belong to the majority class
majority_class = []

# Per-attribute value range of the minority class:
# {column index: {'min': ..., 'max': ...}}
min_max_values = {}

# Location of the dataset file
file_path = '../Custom Datasets/imbalanced-test.csv'

# Load every line of the dataset file into memory
with open(file_path, 'r') as file:
    lines = file.readlines()

# The header is the first line that is not blank; it is removed from `lines`
# so that only sample rows (and blank lines) remain afterwards
for line_number, text in enumerate(lines):
    if text.isspace():
        continue
    labels = lines.pop(line_number).strip().split(',')
    break

print(f"Labels:\n{labels}")
print()

# Split each remaining non-blank line into its comma-separated fields
raw_dataset.extend(
    line.strip().split(",") for line in lines if not line.isspace()
)

print("Raw dataset:")
for sample in raw_dataset:
    print(sample)
print()

# Function for detecting numeric typed values
def is_numeric(value) -> bool:
    """Return True when ``float(value)`` succeeds, False otherwise.

    Anything ``float()`` accepts counts as numeric, which includes integer
    and decimal text as well as special spellings such as ``"nan"`` and
    ``"inf"``.

    Only the errors ``float()`` raises for unsuitable input are treated as
    "not numeric"; unrelated exceptions (for example ``KeyboardInterrupt``)
    are allowed to propagate instead of being silently swallowed.
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        # The value cannot be represented as a float
        return False
    # The value is either float or integer
    return True

# Function for detecting float typed values
def is_float(string) -> bool:
    """Return True when *string* is numeric but not an integer literal.

    The value is first offered to ``int()``; if that succeeds it is an
    integer and the result is False.  Otherwise the value must be accepted
    by ``float()`` to count as a float, so non-numeric input such as
    ``"abc"`` or ``None`` yields False rather than True.
    """
    try:
        int(string)
    except (TypeError, ValueError):
        # Not an integer literal; check below whether it is a float at all
        pass
    else:
        return False

    try:
        float(string)
    except (TypeError, ValueError, OverflowError):
        # Neither an integer nor a float
        return False
    return True

# Process the raw data by type casting: numeric text becomes int or float,
# anything else (e.g. a textual class label) is kept as a string
for raw_row in raw_dataset:
    instance = []
    for col in range(len(labels)):
        value = raw_row[col]
        if not is_numeric(value):
            instance.append(value)
        elif is_float(value):
            instance.append(float(value))
        else:
            instance.append(int(value))
    processed_dataset.append(instance)

print("Processed dataset:")
for sample in processed_dataset:
    print(sample)
print()

# Count the samples of each class.
# The class label is read as a single value from the 'Class' column; it must
# not be iterated, otherwise a multi-character label would be counted once per
# character and a numeric label would raise a TypeError.
class_index = labels.index('Class')
for sample in processed_dataset:
    cl = sample[class_index]
    class_count[cl] = class_count.get(cl, 0) + 1

print("Class counts:")
for cl in class_count.keys():
    print(f"{cl}:\t{class_count.get(cl)}")
print()

# Look up the key that is stored with a given value in a dictionary
def get_key_by_value(dict, value):
    """Return the first key of *dict* whose value equals *value*.

    Keys are examined in the dictionary's iteration order.  None is returned
    when no entry holds *value*.
    """
    matching_keys = (key for key, stored in dict.items() if stored == value)
    return next(matching_keys, None)

# Get sample counts of each class
if not class_count:
    raise ValueError("No samples were counted; the dataset is empty")
majority_class_samples = max(class_count.values())
minority_class_samples = min(class_count.values())

# Get labels of each class.
# When several classes share the same count, the minority label is taken from
# a class other than the majority one so that both classes are kept; with a
# single class both labels are the same.
majority_class_label = get_key_by_value(class_count, majority_class_samples)
minority_class_label = next(
    (
        label
        for label, count in class_count.items()
        if count == minority_class_samples and label != majority_class_label
    ),
    majority_class_label,
)

# Percentage value = [(Majority class samples / Minority class samples) - 1] * 100
percentage_value = ((majority_class_samples / minority_class_samples) - 1) * 100

# New samples to be generated = [(Percentage value) * |Minority class samples|] / 100
# Algebraically this is (majority - minority).  It is computed on the integer
# counts because truncating the floating-point form with int() can lose a
# sample (e.g. 7 vs 5 evaluates to 1.9999999999999998 and truncates to 1).
new_sample_count = majority_class_samples - minority_class_samples

print(f"Percentage value: {percentage_value}\nNew sample count: {new_sample_count}\n")

# Separate minority and majority classes.
# Samples of any other class end up in neither list.
class_index = labels.index('Class')
for sample in processed_dataset:
    if sample[class_index] == minority_class_label:
        minority_class.append(sample)
    elif sample[class_index] == majority_class_label:
        majority_class.append(sample)

print("Minority class:")
for min_sample in minority_class:
    print(min_sample)
print()

print("Majority class:")
for max_sample in majority_class:
    print(max_sample)
print()

# Function to track the smallest and largest value seen in each attribute column
def update_minority_min_max(dataset):
    """Fold the samples of *dataset* into the global ``min_max_values``.

    For every column except the 'Class' column (located through the global
    ``labels``), the entry ``min_max_values[col]`` is created from the first
    value seen and afterwards widened whenever a smaller or larger value
    appears.  Existing entries are kept, so repeated calls accumulate.
    """
    for sample in dataset:
        for col, attribute in enumerate(sample):
            # The class label is not an attribute
            if col == labels.index('Class'):
                continue

            bounds = min_max_values.get(col)
            if bounds is None:
                # First value seen for this column: it is both min and max
                min_max_values[col] = {'min': attribute, 'max': attribute}
            elif bounds['min'] > attribute:
                bounds['min'] = attribute
            elif bounds['max'] < attribute:
                bounds['max'] = attribute

# Compute the value range of every attribute from the minority samples
update_minority_min_max(minority_class)

# Report the range per attribute; the last label is skipped as the class column
print("Min-Max values of minority class atrributes:")
for attr in range(len(labels) - 1):
    print(f"{attr}. Attribute:\tMin:{min_max_values[attr]['min']}\tMax:{min_max_values[attr]['max']}")
print()

# Function to draw one attribute value for a new synthetic sample
def generate_synthetic_attribute(attribute_index):
    """Return a random value inside the recorded range of one attribute.

    The range is read from the global ``min_max_values[attribute_index]``.
    The value is ``min + random() * (max - min)``, rounded to two decimal
    places.
    """
    bounds = min_max_values[attribute_index]
    low, high = bounds['min'], bounds['max']
    span = high - low
    return round(low + span * random(), 2)

# Generate synthetic minority samples until the required number is reached.
# The loop is driven by new_sample_count itself, not by the current size of the
# minority class, so more synthetic samples than original minority samples can
# be produced when the imbalance requires it.
# The attribute ranges are not re-scanned after each new sample: synthetic
# values are drawn from within the existing ranges, so a re-scan would only
# pick up two-decimal rounding overshoot and widen the ranges.
class_index = labels.index('Class')
while new_sample_count > 0:
    # Every column gets a random value from its range, except the class
    # column, which receives the minority label
    new_synthetic_sample = [
        minority_class_label if col == class_index else generate_synthetic_attribute(col)
        for col in range(len(labels))
    ]
    minority_class.append(new_synthetic_sample)
    new_sample_count -= 1

print("Minority class after over-sampling:")
for sample in minority_class:
    print(sample)
print()

# Combine the classes
over_sampled_dataset = minority_class + majority_class

print("Dataset after over-sampling:")
for sample in over_sampled_dataset:
    print(sample)
print()

Labels:
['A1', 'A2', 'Class']

Raw dataset:
['4', '15', 'X']
['5', '14', 'X']
['6', '13', 'X']
['5.6', '12', 'X']
['15', '3', 'Y']
['13', '2', 'Y']
['16', '4', 'Y']
['14', '3.5', 'Y']
['15', '2.7', 'Y']
['14.5', '3.6', 'Y']
['16.5', '4.3', 'Y']
['17', '3.2', 'Y']

Processed dataset:
[4, 15, 'X']
[5, 14, 'X']
[6, 13, 'X']
[5.6, 12, 'X']
[15, 3, 'Y']
[13, 2, 'Y']
[16, 4, 'Y']
[14, 3.5, 'Y']
[15, 2.7, 'Y']
[14.5, 3.6, 'Y']
[16.5, 4.3, 'Y']
[17, 3.2, 'Y']

Class counts:
X:	4
Y:	8

Percentage value: 100.0
New sample count: 4

Minority class:
[4, 15, 'X']
[5, 14, 'X']
[6, 13, 'X']
[5.6, 12, 'X']

Majority class:
[15, 3, 'Y']
[13, 2, 'Y']
[16, 4, 'Y']
[14, 3.5, 'Y']
[15, 2.7, 'Y']
[14.5, 3.6, 'Y']
[16.5, 4.3, 'Y']
[17, 3.2, 'Y']

Min-Max values of minority class atrributes:
0. Attribute:	Min:4	Max:6
1. Attribute:	Min:12	Max:15

Minority class after over-sampling:
[4, 15, 'X']
[5, 14, 'X']
[6, 13, 'X']
[5.6, 12, 'X']
[5.96, 14.98, 'X']
[5.37, 12.47, 'X']
[5.82, 12.08, 'X']
[4.73, 14.39, 'X']

