IML 2025 Assignment 1

Jiwoo Hong MTNR1
Yeongjae Park m12446610
Yoonhyeok Lee m12448710

# Definition of input files

In [107]:
#These filename need to be defined:

#  current dataset directory : data/alldigits.csv
#  amount of data: 6000
#  structure of each datum: [28x28 pixels with value 0x00 ~ 0xFF, 1 label with value 0 ~ 9]

datafile_name = 'data/alldigits.csv'

# Your sections ...

# Calc1: Read data

In [108]:
import numpy as np


all_digits = np.loadtxt(datafile_name, delimiter=',', skiprows=1)



# Preprocessing: Data split

Defined Oracle to perform uncertainty sampling and keep track of data that the label has been used for training/model selection

In [None]:
import math

DATASET_SIZE = 6000  #  precomputated value

class Oracle:
    def __init__(self, data, train_ratio, validate_ratio, test_ratio):
        self.data = data

        assert train_ratio + validate_ratio + test_ratio == 1.0, "Ratios must sum to 1"
        self.train_ratio = train_ratio
        self.validate_ratio = validate_ratio
        self.test_ratio = test_ratio

        self.train_data = []
        self.validate_data = []
        self.test_data = []

        self.train_used_markers = []
        self.validate_used_markers = []

    def split_data(self, shuffle=True):
        x = self.data[:, :-1]
        y = self.data[:, -1]

        if shuffle:
            indices = np.arange(DATASET_SIZE)
            np.random.shuffle(indices)
            x = x[indices]
            y = y[indices]

        train_size = int(DATASET_SIZE * self.train_ratio)
        validate_size = int(DATASET_SIZE * self.validate_ratio)
        test_size = DATASET_SIZE - train_size - validate_size

        self.train_data = (x[:train_size], y[:train_size])
        self.validate_data = (x[train_size:train_size + validate_size], y[train_size:train_size + validate_size])
        self.test_data = (x[train_size + validate_size:], y[train_size + validate_size:])

        # Markers for used data
        self.train_used_markers = [False] * train_size
        self.validate_used_markers = [False] * validate_size

        return self.train_data, self.validate_data, self.test_data

    def retrieve_uncertain_training_data(self, probabilty_matrix, num_samples=1):
        # Calculate the uncertainty for each data point
        uncertainty = np.zeros(probabilty_matrix.shape[0])
        for i in range(probabilty_matrix.shape[0]):
            uncertainty[i] = self.entropy(probabilty_matrix[i])

        # Sort the data points by uncertainty
        uncertain_indices = np.argsort(uncertainty)[::-1]
        uncertain_data = []
        uncertain_labels = []
        for i in range(num_samples):
            index = uncertain_indices[i]
            uncertain_data.append(self.train_data[0][index])
            uncertain_labels.append(self.train_data[1][index])
            self.train_used_markers[index] = True

        return np.array(uncertain_data), np.array(uncertain_labels)

    def retreive_initial_training_data(self):
        # get each data of all labels
        initial_data = []
        initial_labels = []
        for label in range(10):
            label_indices = np.where(self.train_data[1] == label)[0]
            if len(label_indices) > 0:
                initial_index = label_indices[0]
                initial_data.append(self.train_data[0][initial_index])
                initial_labels.append(self.train_data[1][initial_index])
                self.train_used_markers[initial_index] = True

        return np.array(initial_data), np.array(initial_labels)

    def entropy(self, probabilty_vector):
        return -np.sum(probabilty_vector * np.log(probabilty_vector + 1e-10))  # Adding a small value to avoid log(0)

    def count_used_data(self):
        train_used_count = sum(self.train_used_markers)
        validate_used_count = sum(self.validate_used_markers)

        return train_used_count, validate_used_count


# Explore

#### Import Classifiers and Utils

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

from random import choices

#### Initialize Oracle

In [111]:

oracle = Oracle(
    data=all_digits,
    train_ratio=0.7,
    validate_ratio=0.1,
    test_ratio=0.2
)

train_data, validate_data, test_data = oracle.split_data(shuffle=True)
initial_data, initial_labels = oracle.retreive_initial_training_data()
count_of_initial_data = len(initial_data)

#### Validation results for each classifier

In [112]:
val_acc = {
    'SVC': [],
    'LogisticRegression': [],
    'GradientBoostingClassifier': [],
    'RandomForestClassifier': [],
    'GaussianNB': [],
    'DecisionTreeClassifier': []
}

validation_iterations = 10

Use Active Learning using uncentainty sampling

# Calc2: Naive Bayes

In [None]:
TRAIN_ITERATIONS = 800  #  precomputated value
SAMPLE_BATCH_SIZE = 10

X_train = initial_data
y_train = initial_labels

X_test = test_data[0]
y_test = test_data[1]

pbar = tqdm(range((TRAIN_ITERATIONS - count_of_initial_data) // SAMPLE_BATCH_SIZE), desc="training iterations")

gnb = GaussianNB().partial_fit(X_train, y_train, classes=np.unique(y_train))

for i in pbar:

    # Get the probabilty predictions for the training data
    gnb_pred_proba = gnb.predict_proba(train_data[0])

    # Retrieve uncertain data
    uncertain_data, uncertain_labels = oracle.retrieve_uncertain_training_data(gnb_pred_proba, num_samples=SAMPLE_BATCH_SIZE)

    # Add the uncertain data to the training set
    X_train = uncertain_data
    y_train = uncertain_labels

    # Train the gaussian Naive Bayes classifier
    gnb.partial_fit(X_train, y_train)

y_pred_test = gnb.predict(test_data[0])
accuracy = accuracy_score(test_data[1], y_pred_test)
print(f"Test accuracy: {accuracy * 100:.2f}%")

training iterations: 100%|██████████| 79/79 [00:26<00:00,  2.93it/s]

Test accuracy: 48.75%





# Calc2: SVM

In [None]:
TRAIN_ITERATIONS = 800  #  precomputated value
SAMPLE_BATCH_SIZE = 10

X_train = initial_data
y_train = initial_labels

X_test = test_data[0]
y_test = test_data[1]


pbar = tqdm(range((TRAIN_ITERATIONS - count_of_initial_data) // SAMPLE_BATCH_SIZE), desc="training iterations")

for i in pbar:
    # Train the SVM model
    svm = SVC(kernel='linear', C=1, probability=True).fit(X_train, y_train)

    # Get the probability predictions for the training data
    y_pred_proba = svm.predict_proba(train_data[0])

    # Retrieve uncertain data
    uncertain_data, uncertain_label = oracle.retrieve_uncertain_training_data(y_pred_proba, num_samples=SAMPLE_BATCH_SIZE)

    # Add the uncertain data to the training set
    X_train = np.vstack((X_train, uncertain_data))
    y_train = np.hstack((y_train, uncertain_label))


y_pred_test = svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test accuracy: {accuracy * 100:.2f}%")

training iterations: 100%|██████████| 79/79 [00:28<00:00,  2.77it/s]

Test accuracy: 88.67%





# Calc2: LogisticRegression

In [None]:
TRAIN_ITERATIONS = 800  #  precomputated value
SAMPLE_BATCH_SIZE = 10

X_train = initial_data
y_train = initial_labels

X_test = test_data[0]
y_test = test_data[1]

pbar = tqdm(range((TRAIN_ITERATIONS - count_of_initial_data) // SAMPLE_BATCH_SIZE), desc="training iterations")

for i in pbar:
    # Train the Logistic Regression model
    lr = LogisticRegression().fit(X_train, y_train)

    # Get the probability predictions for the training data
    y_pred_proba = lr.predict_proba(train_data[0])

    # Retrieve uncertain data
    uncertain_data, uncertain_label = oracle.retrieve_uncertain_training_data(y_pred_proba, num_samples=SAMPLE_BATCH_SIZE)

    # Add the uncertain data to the training set
    X_train = np.vstack((X_train, uncertain_data))
    y_train = np.hstack((y_train, uncertain_label))


y_pred_test = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test accuracy: {accuracy * 100:.2f}%")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Test accuracy: 87.58%





# Calc2: Gradient Boosting

In [None]:
TRAIN_ITERATIONS = 800  #  precomputated value
SAMPLE_BATCH_SIZE = 10

X_train = initial_data
y_train = initial_labels

X_test = test_data[0]
y_test = test_data[1]

pbar = tqdm(range((TRAIN_ITERATIONS - count_of_initial_data) // SAMPLE_BATCH_SIZE), desc="training iterations")

for i in pbar:
    # Train the Gradient Boosting model
    gb = GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        subsample=0.8,
        max_features='sqrt',
        min_samples_leaf=3
    ).fit(X_train, y_train)

    # Get the probability predictions for the training data
    y_pred_proba = gb.predict_proba(train_data[0])

    # Retrieve uncertain data
    uncertain_data, uncertain_label = oracle.retrieve_uncertain_training_data(y_pred_proba, num_samples=SAMPLE_BATCH_SIZE)

    # Add the uncertain data to the training set
    X_train = np.vstack((X_train, uncertain_data))
    y_train = np.hstack((y_train, uncertain_label))


y_pred_test = gb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test accuracy: {accuracy * 100:.2f}%")

training iterations: 100%|██████████| 79/79 [07:19<00:00,  5.56s/it]

Test accuracy: 92.50%





# Calc2: Random Forest

In [None]:
TRAIN_ITERATIONS = 800  #  precomputated value
SAMPLE_BATCH_SIZE = 10

X_train = initial_data
y_train = initial_labels

X_test = test_data[0]
y_test = test_data[1]

pbar = tqdm(range((TRAIN_ITERATIONS - count_of_initial_data) // SAMPLE_BATCH_SIZE), desc="training iterations")

for i in pbar:
    # Train the Gradient Boosting model
    rf = RandomForestClassifier().fit(X_train, y_train)

    # Get the probability predictions for the training data
    y_pred_proba = rf.predict_proba(train_data[0])

    # Retrieve uncertain data
    uncertain_data, uncertain_label = oracle.retrieve_uncertain_training_data(y_pred_proba, num_samples=SAMPLE_BATCH_SIZE)

    # Add the uncertain data to the training set
    X_train = np.vstack((X_train, uncertain_data))
    y_train = np.hstack((y_train, uncertain_label))


y_pred_test = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test accuracy: {accuracy * 100:.2f}%")

training iterations: 100%|██████████| 79/79 [00:46<00:00,  1.69it/s]

Test accuracy: 92.25%





# Calc2: Decision Tree

In [None]:
TRAIN_ITERATIONS = 800  #  precomputated value
SAMPLE_BATCH_SIZE = 10

X_train = initial_data
y_train = initial_labels

X_test = test_data[0]
y_test = test_data[1]

pbar = tqdm(range((TRAIN_ITERATIONS - count_of_initial_data) // SAMPLE_BATCH_SIZE), desc="training iterations")

for i in pbar:
    # Train the Gradient Boosting model
    dt = DecisionTreeClassifier(
        criterion='gini',
        max_depth=10,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features='sqrt',
    ).fit(X_train, y_train)

    # Get the probability predictions for the training data
    y_pred_proba = dt.predict_proba(train_data[0])

    # Retrieve uncertain data
    uncertain_data, uncertain_label = oracle.retrieve_uncertain_training_data(y_pred_proba, num_samples=SAMPLE_BATCH_SIZE)

    # Add the uncertain data to the training set
    X_train = np.vstack((X_train, uncertain_data))
    y_train = np.hstack((y_train, uncertain_label))


y_pred_test = dt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test accuracy: {accuracy * 100:.2f}%")

training iterations: 100%|██████████| 79/79 [00:02<00:00, 28.55it/s]

Test accuracy: 54.67%





In [119]:
"""
import matplotlib.pyplot as plt

px = digits_train_x[:,400]
px.sort()
plt.plot(px)
"""

'\nimport matplotlib.pyplot as plt\n\npx = digits_train_x[:,400]\npx.sort()\nplt.plot(px)\n'

In [120]:
"""
import math

#  variable parameters - can be adjusted
RATIO_TRAIN_DATA = 0.4

RATIO_VALIDATE_DATA = 0.1
RATIO_TEST_DATA = 0.5

#  constants - must be fixed
DATASET_SIZE = 6000  #  precomputated value


if RATIO_TRAIN_DATA + RATIO_VALIDATE_DATA + RATIO_TEST_DATA != 1.0:
    raise AssertionError("Sum of ratios must be 1.0")


all_digits_by_labels = [[] for _ in range(10)]

for dg in all_digits:
    dg = [dg[:-1], int(dg[-1])]
    all_digits_by_labels[dg[1]].append(dg)
for one_labeled_digits in all_digits_by_labels:
    np.random.shuffle(one_labeled_digits)

digits_train, digits_validate, digits_test = [], [], []


for i in range(10):
    target_labeled_set = all_digits_by_labels[i]
    idx_train_end = math.floor(len(target_labeled_set) * RATIO_TRAIN_DATA)
    idx_validate_end = idx_train_end + math.floor(len(target_labeled_set) * RATIO_VALIDATE_DATA)
    idx_test_end = idx_validate_end + math.floor(len(target_labeled_set) * RATIO_TEST_DATA)

    digits_train += target_labeled_set[:idx_train_end]
    digits_validate += target_labeled_set[idx_train_end:idx_validate_end]
    digits_test += target_labeled_set[idx_validate_end:idx_test_end]
    digits_train += target_labeled_set[idx_test_end:]  #  TODO: how can we distribute the remained data equally?

for digits in [digits_train, digits_validate, digits_test]:
    np.random.shuffle(digits)


digits_train_x, digits_train_y = np.array([d[0] for d in digits_train]), np.array([d[1] for d in digits_train])
digits_validate_x, digits_validate_y = np.array([d[0] for d in digits_validate]), np.array([d[1] for d in digits_validate])
digits_test_x, digits_test_y = np.array([d[0] for d in digits_test]), np.array([d[1] for d in digits_test])
"""



'\nimport math\n\n#  variable parameters - can be adjusted\nRATIO_TRAIN_DATA = 0.4\n\nRATIO_VALIDATE_DATA = 0.1\nRATIO_TEST_DATA = 0.5\n\n#  constants - must be fixed\nDATASET_SIZE = 6000  #  precomputated value\n\n\nif RATIO_TRAIN_DATA + RATIO_VALIDATE_DATA + RATIO_TEST_DATA != 1.0:\n    raise AssertionError("Sum of ratios must be 1.0")\n\n\nall_digits_by_labels = [[] for _ in range(10)]\n\nfor dg in all_digits:\n    dg = [dg[:-1], int(dg[-1])]\n    all_digits_by_labels[dg[1]].append(dg)\nfor one_labeled_digits in all_digits_by_labels:\n    np.random.shuffle(one_labeled_digits)\n\ndigits_train, digits_validate, digits_test = [], [], []\n\n\nfor i in range(10):\n    target_labeled_set = all_digits_by_labels[i]\n    idx_train_end = math.floor(len(target_labeled_set) * RATIO_TRAIN_DATA)\n    idx_validate_end = idx_train_end + math.floor(len(target_labeled_set) * RATIO_VALIDATE_DATA)\n    idx_test_end = idx_validate_end + math.floor(len(target_labeled_set) * RATIO_TEST_DATA)\n\n    digits

# Model Selection

Use Active Testing to validate which model is better on val set

In [None]:
class Surrogate:
    def __init__(self, test_data, surrogate_init_data_ratio=0.1):
        self.test_size = len(test_data)
        self.LRmodel = LogisticRegression()
        self.X_data = test_data[0]
        self.Y_data = test_data[1]
        self.test_used_markers = [False] * self.test_size

        initial_data_size = int(self.test_size * surrogate_init_data_ratio)
        self.tested_X = self.X_data[:initial_data_size]
        self.tested_Y = self.Y_data[:initial_data_size]

        for i in range(initial_data_size):
            self.test_used_markers[i] = True

        self.LRmodel.fit(self.tested_X, self.tested_Y)

    def acquire_data(self, num_samples=1, update_surr_immediate=True):
        lr_pred_proba = self.LRmodel.predict_proba(self.X_data)
        uncertainty = np.zeros(lr_pred_proba.shape[0])
        for i in range(lr_pred_proba.shape[0]):
            uncertainty[i] = self.entropy(lr_pred_proba[i])

        uncertain_indices = np.argsort(uncertainty)[::-1]
        uncertain_data = []
        uncertain_labels = []
        new_update_data = []
        new_update_labels = []

        for i in range(self.test_size):
            index = uncertain_indices[i]
            uncertain_data.append(self.X_data[index])
            uncertain_labels.append(self.Y_data[index])

            if update_surr_immediate and self.test_used_markers[index] is False:
                new_update_data.append(self.X_data[index])
                new_update_labels.append(self.Y_data[index])
                self.test_used_markers[index] = True

            if len(uncertain_data) == num_samples:
                break

        if update_surr_immediate:
            self.X_data = np.vstack((self.X_data, new_update_data))
            self.Y_data = np.hstack((self.Y_data, new_update_labels))
            self.LRmodel.fit(self.X_data, self.Y_data)

        return np.array(uncertain_data), np.array(uncertain_labels)


    def entropy(self, probabilty_vector):
        return -np.sum(probabilty_vector * np.log(probabilty_vector + 1e-10))  # Adding a small value to avoid log(0)





# ...

In [121]:
#This classifier needs to be defined:
digits_SVM = gb