In [4]:
import os
import numpy as np
import pandas as pd
from scipy.special import logsumexp
from sklearn.decomposition import PCA
import gzip

from my_gmm import GaussianMixtureModel

In [5]:
# Directory holding the gzip-compressed MNIST IDX files; with '' the file
# names below stay plain relative paths.
data_path = ''

# Paths of the train/test image and label archives, built in one pass.
images_train_path, labels_train_path, images_test_path, labels_test_path = (
    os.path.join(data_path, archive_name)
    for archive_name in (
        'train-images-idx3-ubyte.gz',
        'train-labels-idx1-ubyte.gz',
        't10k-images-idx3-ubyte.gz',
        't10k-labels-idx1-ubyte.gz',
    )
)


def get_mnist_data_as_dataframe(images_path, labels_path, shuffle=False, image_size=28,
                                random_state=None):
    """Load a gzip-compressed MNIST image/label file pair into one DataFrame.

    Parameters
    ----------
    images_path : str or path-like
        Gzip-compressed image file. Its first 16 bytes are skipped as header;
        every following byte is read as one uint8 pixel.
    labels_path : str or path-like
        Gzip-compressed label file. Its first 8 bytes are skipped as header;
        every following byte is read as one uint8 label.
    shuffle : bool, default False
        When True, the rows are returned in random order with a fresh
        0..n-1 index.
    image_size : int, default 28
        Side length of the square images; each row gets
        ``image_size * image_size`` pixel columns.
    random_state : int, numpy random generator or None, default None
        Forwarded to ``DataFrame.sample`` when ``shuffle`` is True. Pass a
        fixed value to obtain the same row order on every run; ``None``
        keeps the previous (unseeded) behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per image: float32 pixel columns named ``0 .. image_size**2 - 1``
        holding the raw byte values, plus an int64 ``'label'`` column.

    Raises
    ------
    ValueError
        If the pixel data cannot be reshaped into whole images, or if the
        number of images differs from the number of labels.
    """
    pixels_per_image = image_size * image_size

    # Read the image file; only the raw bytes are needed once it is closed.
    with gzip.open(images_path, 'rb') as f_images:
        # Skip the 16-byte header, which is metadata rather than pixel data.
        f_images.read(16)
        buf_images = f_images.read()

    # One row per image, pixels converted from uint8 to float32.
    images = np.frombuffer(buf_images, dtype=np.uint8).astype(np.float32)
    images = images.reshape(-1, pixels_per_image)

    # Read the label file (8-byte header, then one byte per label).
    with gzip.open(labels_path, 'rb') as f_labels:
        f_labels.read(8)
        buf_labels = f_labels.read()
    labels = np.frombuffer(buf_labels, dtype=np.uint8).astype(np.int64)

    # Fail early with a clear message instead of pandas' generic length error.
    if len(labels) != len(images):
        raise ValueError(
            f"Image/label count mismatch: {len(images)} images in {images_path!r} "
            f"but {len(labels)} labels in {labels_path!r}"
        )

    # Pixel columns first, then the label column.
    df_images = pd.DataFrame(images)
    df_images['label'] = labels

    # Optionally shuffle the rows (reproducibly when random_state is given).
    if shuffle:
        df_images = df_images.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return df_images


# Load the training and test splits; the rows of each split are shuffled.
mnist_train_df = get_mnist_data_as_dataframe(
    images_path=images_train_path, labels_path=labels_train_path, shuffle=True)
mnist_test_df = get_mnist_data_as_dataframe(
    images_path=images_test_path, labels_path=labels_test_path, shuffle=True)

# Split each frame into a label vector and a pixel matrix; dividing the raw
# byte values by 255 rescales the pixels to the [0, 1] range.
y_train = mnist_train_df['label'].to_numpy()
X_train = mnist_train_df.drop(columns='label').to_numpy() / 255.0

y_test = mnist_test_df['label'].to_numpy()
X_test = mnist_test_df.drop(columns='label').to_numpy() / 255.0

In [16]:
# Sanity check: feature-matrix and label-vector shapes of the training split.
print(X_train.shape, y_train.shape, sep='\n')

(60000, 784)
(60000,)


In [17]:
# Sanity check: feature-matrix and label-vector shapes of the test split.
print(X_test.shape, y_test.shape, sep='\n')

(10000, 784)
(10000,)


In [6]:
# Run GMM on the original data (raw pixel features)
# Gaussian Mixture Model (GMM) with EM algorithm: one mixture is fitted per digit class
gm_num = 10  # number of Gaussian Models in each GMM
GMMs = {}  # maps digit -> GaussianMixtureModel fitted on that digit's training rows
log_likelihood_loss = {}  # NOTE(review): never filled or read in this cell
for digit in range(10):  # assuming there are 10 digits (0-9)
    print('Fitting GMM to digit', digit)
    X_digit_train = X_train[y_train == digit]  # training rows labelled with this digit
    GMMs[digit] = GaussianMixtureModel(num_components=gm_num)
    GMMs[digit].fit(X_digit_train)
    print('GMM parameters computed for digit =', digit)

# Predictions
# Row d of class_probab_list holds the score of class d for every test sample;
# the predicted digit is the row with the largest score.
# NOTE(review): GaussianMixtureModel comes from my_gmm (not shown). The score
# taken here is element `digit` of predict_proba's return value. If
# predict_proba returns per-component responsibilities, this picks component
# number `digit` inside digit's own mixture rather than the likelihood of the
# sample under the whole mixture, and it only indexes safely because gm_num
# equals the number of classes. Confirm against my_gmm; a per-sample
# log-likelihood would be the usual class score.
class_probab_list = np.zeros((10, len(X_test)))
for digit in range(10):
    class_probab_list[digit] = GMMs[digit].predict_proba(X_test)[digit]

predictions = np.argmax(class_probab_list, axis=0)
accuracy = np.mean(predictions == y_test)  # fraction of test samples predicted correctly
print("Accuracy:", accuracy)

Fitting GMM to digit 0
EM for GMM converged after  35 iteration, with loss:  -3043751.3928405656
GMM parameters computed for digit = 0
Fitting GMM to digit 1
EM for GMM converged after  23 iteration, with loss:  -3980924.7846314334
GMM parameters computed for digit = 1
Fitting GMM to digit 2
EM for GMM converged after  15 iteration, with loss:  -2956922.774048416
GMM parameters computed for digit = 2
Fitting GMM to digit 3
EM for GMM converged after  12 iteration, with loss:  -3128223.390950055
GMM parameters computed for digit = 3
Fitting GMM to digit 4
EM for GMM converged after  33 iteration, with loss:  -3061590.937292721
GMM parameters computed for digit = 4
Fitting GMM to digit 5
EM for GMM converged after  14 iteration, with loss:  -2771131.0710311267
GMM parameters computed for digit = 5
Fitting GMM to digit 6
EM for GMM converged after  34 iteration, with loss:  -3113249.49307268
GMM parameters computed for digit = 6
Fitting GMM to digit 7
EM for GMM converged after  24 iterat

In [7]:
from sklearn.metrics import davies_bouldin_score

# Davies-Bouldin index of the test images when grouped by their predicted digit
davies_bouldin_idx = davies_bouldin_score(X=X_test, labels=predictions)
print("Davies-Bouldin Index:", davies_bouldin_idx)

Davies-Bouldin Index: 3.672608096618955


In [12]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.mixture import GaussianMixture as GMM

# Define the number of effective features
# NOTE(review): not referenced in this cell; kept because other cells may read it.
effective_features = 25

# Accuracy reached for each PCA dimensionality, and the dimensionalities tried
accuracies = []
num_dimensions = []

# Fit PCA once, at the largest dimensionality of the sweep. Principal
# components are nested: the first `dim` columns of this projection are the
# `dim`-component projection, so each iteration only needs a column slice
# instead of a full re-fit on X_train.
max_dim = X_train.shape[1] - 1  # 783 for the 784-pixel MNIST images
pca = PCA(n_components=max_dim)
X_train_pca_full = pca.fit_transform(X_train)
X_test_pca_full = pca.transform(X_test)

# Loop over different number of dimensions
for dim in range(1, max_dim + 1):
    # Keep only the first `dim` principal components
    X_train_pca = X_train_pca_full[:, :dim]
    X_test_pca = X_test_pca_full[:, :dim]

    # Fit one GMM per digit on that digit's reduced training samples.
    # A fixed random_state makes the accuracy curve reproducible across runs.
    GMMs = {}
    for digit in range(10):
        X_digit_train = X_train_pca[y_train == digit]
        GMMs[digit] = GMM(n_components=gm_num, random_state=0)
        GMMs[digit].fit(X_digit_train)

    # Predictions: row d holds the per-sample log-likelihood under digit d's GMM
    class_probab_list = np.zeros((10, len(X_test_pca)))
    for digit in range(10):
        class_probab_list[digit] = GMMs[digit].score_samples(X_test_pca)

    # Assign each test sample to the digit whose GMM scores it highest
    predictions = np.argmax(class_probab_list, axis=0)
    accuracy = accuracy_score(y_test, predictions)

    # Store accuracy and number of dimensions
    accuracies.append(accuracy)
    num_dimensions.append(dim)

# Plot accuracy vs. number of dimensions
fig, ax = plt.subplots()
ax.plot(num_dimensions, accuracies)
ax.set_xlabel('Number of Dimensions')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy vs. Number of Dimensions after PCA')
ax.grid(True)
plt.show()

