In [3]:
import os
import gzip
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Directory holding the MNIST archives; with '' the joined paths below are bare
# file names, i.e. relative to the current working directory.
data_path = ''

# Paths to the gzip archives of the training split (images, labels).
images_train_path = os.path.join(data_path, 'train-images-idx3-ubyte.gz')
labels_train_path = os.path.join(data_path, 'train-labels-idx1-ubyte.gz')

# Paths to the gzip archives of the test split (images, labels).
images_test_path = os.path.join(data_path, 't10k-images-idx3-ubyte.gz')
labels_test_path = os.path.join(data_path, 't10k-labels-idx1-ubyte.gz')


def get_mnist_data_as_dataframe(images_path, labels_path, shuffle=False, image_size=28,
                                random_state=None):
    """Load a gzip-compressed image/label archive pair into one DataFrame.

    Parameters
    ----------
    images_path : str
        Path to the gzip image archive. Its first 16 bytes are treated as a
        header and skipped; the rest is read as unsigned 8-bit pixels.
    labels_path : str
        Path to the gzip label archive. Its first 8 bytes are treated as a
        header and skipped; the rest is read as unsigned 8-bit labels.
    shuffle : bool, default False
        If True, return the rows in random order with a fresh 0..n-1 index.
    image_size : int, default 28
        Side length of the square images; each row holds
        ``image_size * image_size`` pixel values.
    random_state : int, numpy Generator/RandomState or None, default None
        Forwarded to ``DataFrame.sample`` so that a shuffled result can be
        reproduced. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per image: ``image_size * image_size`` float32 pixel columns
        (named 0..k-1) followed by an int64 ``'label'`` column.

    Raises
    ------
    ValueError
        If the pixel payload cannot be split into whole images, or the number
        of labels differs from the number of images.
    """
    # Read the image archive: skip the 16-byte header, keep only the pixels.
    with gzip.open(images_path, 'rb') as f_images:
        f_images.read(16)
        buf_images = f_images.read()

    # uint8 pixels -> float32, one flattened image per row.
    images = np.frombuffer(buf_images, dtype=np.uint8).astype(np.float32)
    images = images.reshape(-1, image_size * image_size)

    # Read the label archive: skip the 8-byte header, keep only the labels.
    with gzip.open(labels_path, 'rb') as f_labels:
        f_labels.read(8)
        buf_labels = f_labels.read()
    labels = np.frombuffer(buf_labels, dtype=np.uint8).astype(np.int64)

    # Fail early with a readable message instead of a pandas alignment error.
    if len(labels) != len(images):
        raise ValueError(
            f"Image/label count mismatch: {len(images)} images in {images_path!r} "
            f"but {len(labels)} labels in {labels_path!r}"
        )

    # Pixel columns first, then the label column.
    df_images = pd.DataFrame(images)
    df_images['label'] = labels

    # Optional row shuffle; reset_index gives the result a clean 0..n-1 index.
    if shuffle:
        df_images = df_images.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return df_images

# Load both splits with get_mnist_data_as_dataframe.
# Training DataFrame; shuffle=True randomises the row order.
mnist_train_df = get_mnist_data_as_dataframe(images_train_path, labels_train_path, shuffle=True)

# Test DataFrame; shuffle=True randomises the row order.
mnist_test_df = get_mnist_data_as_dataframe(images_test_path, labels_test_path, shuffle=True)

In [5]:
# (n_rows, n_columns) of the training DataFrame.
mnist_train_df.shape

(60000, 785)

In [6]:
# Per-column summary statistics of the training DataFrame.
mnist_train_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,label
count,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,...,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.088867,0.045633,0.019283,0.015117,0.002,0.0,0.0,0.0,0.0,4.453933
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.956189,2.839845,1.68677,1.678283,0.3466,0.0,0.0,0.0,0.0,2.88927
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254.0,253.0,253.0,254.0,62.0,0.0,0.0,0.0,0.0,9.0


In [7]:
# Preview the first rows of the training DataFrame.
mnist_train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


In [8]:
mnist_train_df.isna().any().any() # True if the training DataFrame contains any NaN value

False

In [None]:
# (n_rows, n_columns) of the test DataFrame.
mnist_test_df.shape

In [None]:
# Per-column summary statistics of the test DataFrame.
mnist_test_df.describe()

In [None]:
# Preview the first rows of the test DataFrame.
mnist_test_df.head()

In [None]:
mnist_test_df.isnull().any().any() # True if any column of the test DataFrame contains a NaN/null value

In [14]:
# Split the training DataFrame into pixel features (X_train) and the target
# column (y_train), then preview the features.
X_train = mnist_train_df.drop(columns='label')
y_train = mnist_train_df['label']
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Show 20 randomly chosen training images together with their labels.
random_indices = np.random.randint(0, len(X_train), size=20)

fig, axis = plt.subplots(2, 10, figsize=(20, 5))
for i, ax in zip(random_indices, axis.flat):
    # Reshape the flattened row to 28x28 once and reuse it for drawing.
    grid_data = X_train.iloc[i].values.reshape(28, 28)
    ax.imshow(grid_data, interpolation='none', cmap='gray')

    # Positional lookup, matching the .iloc access used for the image row, so
    # image and label stay paired regardless of the index labels.
    number = y_train.iloc[i]
    ax.set(title=f"Number {number}")

In [None]:
# Histogram of how many training images carry each label.
plt.figure(figsize=(8, 6))
counts, _, _ = plt.hist(
    y_train,
    bins=range(11),
    align='left',
    rwidth=0.8,
    color='lightblue',
    edgecolor='black',
)
plt.title('Histogram of Labels In Training Set')
plt.xlabel('Label')
plt.ylabel('Frequency')
plt.xticks(range(10))

# Write each bar's count just above its top.
for i in range(len(counts)):
    count = counts[i]
    plt.text(i, count + 0.5, str(int(count)),
             fontsize=10, color='black', ha='center', va='bottom')

plt.grid(axis='y', alpha=0.5)
plt.show()

In [None]:
# Split the test DataFrame into pixel features (X_test) and the target column (y_test).
X_test = mnist_test_df.drop(columns='label')
y_test = mnist_test_df['label']

In [None]:
# Histogram of how many test images carry each label.
plt.figure(figsize=(8, 6))
counts, _, _ = plt.hist(
    y_test,
    bins=range(11),
    align='left',
    rwidth=0.8,
    color='lightblue',
    edgecolor='black',
)
plt.title('Histogram of Labels In Testing Set')
plt.xlabel('Label')
plt.ylabel('Frequency')
plt.xticks(range(10))

# Write each bar's count just above its top.
for i in range(len(counts)):
    count = counts[i]
    plt.text(i, count + 0.5, str(int(count)),
             fontsize=10, color='black', ha='center', va='bottom')

plt.grid(axis='y', alpha=0.5)
plt.show()

In [None]:
# Show 20 randomly chosen test images together with their labels.
random_indices = np.random.randint(0, len(X_test), size=20)

fig, axis = plt.subplots(2, 10, figsize=(20, 5))
for i, ax in zip(random_indices, axis.flat):
    # Reshape the flattened row to 28x28 once and reuse it for drawing.
    grid_data = X_test.iloc[i].values.reshape(28, 28)
    ax.imshow(grid_data, interpolation='none', cmap='gray')

    # Positional lookup, matching the .iloc access used for the image row, so
    # image and label stay paired regardless of the index labels.
    number = y_test.iloc[i]
    ax.set(title=f"Number {number}")

In [None]:
# Concatenate the train and test labels into one label array for the whole dataset.
y = np.concatenate((y_train, y_test))
y.shape

In [None]:
# Histogram of how many images fall in each class over the whole MNIST dataset.
plt.figure(figsize=(8, 6))
counts, _, _ = plt.hist(
    y,
    bins=range(11),
    align='left',
    rwidth=0.8,
    color='lightblue',
    edgecolor='black',
)
plt.title('Histogram of Labels In MNIST')
plt.xlabel('Label')
plt.ylabel('Frequency')
plt.xticks(range(10))

# Write each bar's count just above its top.
for i in range(len(counts)):
    count = counts[i]
    plt.text(i, count + 0.5, str(int(count)),
             fontsize=10, color='black', ha='center', va='bottom')

plt.grid(axis='y', alpha=0.5)
plt.show()

In [None]:
# Stack X_train and X_test row-wise into a single array X.
X = np.concatenate((X_train, X_test))
X.shape

In [None]:
# Show 20 randomly chosen images from the combined MNIST array with their labels.
random_indices = np.random.randint(0, len(X), size=20)

fig, axis = plt.subplots(2, 10, figsize=(20, 5))
for i, ax in zip(random_indices, axis.flat):
    # Each row of X is a flattened 28x28 image.
    grid_data = X[i].reshape(28, 28)
    ax.imshow(grid_data, cmap='gray', interpolation='none')

    number = y[i]
    ax.set_title(f"Number {number}")

In [None]:
# Print the mean of each row of X (axis=1 averages across the columns).
print(np.mean(X, axis=1))

In [None]:
# MyPCA is imported from the project-local my_pca module, whose source is not
# part of this notebook.
from my_pca import MyPCA
# Fit a 2-component MyPCA on X.
my_pca_2d = MyPCA(n_components=2)
my_pca_2d.fit(X)

# Keep the fitted eigenvalues and eigenvectors under short names.
# NOTE(review): their ordering and layout (rows vs. columns) are defined by MyPCA — confirm there.
values_2d, vectors_2d = my_pca_2d.eigenvalues_, my_pca_2d.eigenvectors_

# Print the first and the last ten entries of the eigenvalue array.
print(f'First 10 eigenvalues: {values_2d[:10]}')
print(f'\n\nLast 10 eigenvalues: {values_2d[-10:]}')

In [None]:
# Print the components and explained-variance ratios exposed by the fitted 2-component MyPCA.
print('Components:\n', my_pca_2d.components_)
print('Explained variance ratio:\n', my_pca_2d.explained_variance_ratio_)

In [None]:
# Shape of the components array of the 2-component MyPCA.
my_pca_2d.components_.shape

In [None]:
# Cumulative explained variance as reported by the 2-component MyPCA.
cum_explained_variance = my_pca_2d.cum_explained_variance_
print('Cumulative explained variance:\n', cum_explained_variance)

In [None]:
# Number of columns (features) in X.
d = X.shape[1]
d

In [None]:
# Rescale the values in explained_variance_ratio_all so they sum to 1 ...
percentage_var_explained = np.divide(
    my_pca_2d.explained_variance_ratio_all,
    np.sum(my_pca_2d.explained_variance_ratio_all),
)

# ... and accumulate them into a running total.
cum_var_explained = np.cumsum(percentage_var_explained)

In [None]:
# Display the cumulative normalised explained variance.
cum_var_explained

In [None]:
# Plot the PCA spectrum: cumulative normalised explained variance against its
# position in the array.
plt.figure(1, figsize=(6, 4))

# Clear figure 1 in case it already holds a previous drawing.
plt.clf()
plt.plot(cum_var_explained, linewidth=2)
plt.axis('tight')
plt.grid()
plt.xlabel('n_components')
plt.ylabel('% explained variance')
plt.show()

In [None]:
# Stem plot of the eigenvalues stored in values_2d.
# NOTE(review): the slice bound is d + 1 while d is the number of columns of X;
# slicing clamps to the array length, so confirm whether [:d] was intended.
plt.figure(figsize=(10, 8))
plt.stem(values_2d[:d+1])
plt.xlabel('Eigen value index')
plt.ylabel('Eigen value')
plt.show()

In [None]:
# Project X with the fitted 2-component MyPCA and report the resulting shape.
X_pca_2d = my_pca_2d.transform(X)  # Apply dimensionality reduction to X.
print('Transformed data shape:', X_pca_2d.shape)

In [None]:
# Display the first and second columns of the 2-D projection.
X_pca_2d[:, 0], X_pca_2d[:, 1]

In [16]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Reference run with scikit-learn: 50-component PCA on the standardised training pixels.
# This cell deliberately uses its own variable names. Reusing the notebook-level
# `X` or `cum_explained_variance` here would silently replace the combined
# dataset and the MyPCA result that other cells read.
X_train_np = X_train.to_numpy()
X_std = StandardScaler().fit_transform(X_train_np)

pca = PCA(n_components=50).fit(X_std)

print('Components:\n', pca.components_)
print('Explained variance ratio:\n', pca.explained_variance_ratio_)

sk_cum_explained_variance = np.cumsum(pca.explained_variance_ratio_)
# The last entry is the share of variance captured by all fitted components.
print('Cumulative explained variance:\n', sk_cum_explained_variance[-1])

X_pca = pca.transform(X_std)  # Apply dimensionality reduction to the standardised data.
print('Transformed data shape:', X_pca.shape)

Components:
 [[-2.2390335e-08 -4.4697447e-11 -1.9923056e-11 ... -0.0000000e+00
  -0.0000000e+00 -0.0000000e+00]
 [ 1.5466993e-08 -1.3784145e-08  5.0210569e-10 ... -0.0000000e+00
  -0.0000000e+00 -0.0000000e+00]
 [ 7.4152126e-08  4.5150816e-09 -7.5528965e-11 ... -0.0000000e+00
  -0.0000000e+00 -0.0000000e+00]
 ...
 [ 1.5216401e-08  5.5324372e-09  8.7840437e-09 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]
 [-8.5356193e-09  4.6953104e-09  3.3190766e-08 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]
 [ 1.0828573e-08  1.7619438e-08 -9.3258623e-10 ... -0.0000000e+00
  -0.0000000e+00 -0.0000000e+00]]
Explained variance ratio:
 [0.05646716 0.04078269 0.03739382 0.02885119 0.0252111  0.0219427
 0.01923346 0.017458   0.01535094 0.01401721 0.01341743 0.01203743
 0.01114571 0.01089924 0.0102865  0.00994486 0.00936383 0.00921046
 0.00893437 0.00869912 0.00827362 0.00803412 0.0076484  0.00741769
 0.00715273 0.00691806 0.00684108 0.00656618 0.00631592 0.00612864
 0.0059586  0.0058757  0.0

In [None]:
# Display the first and second columns of the scikit-learn PCA projection.
X_pca[:, 0], X_pca[:, 1]

In [None]:
# 2-D scatter of the MyPCA projection, one point per row, coloured by the label array y.
plt.figure(figsize=(10, 8))


plt.scatter(X_pca_2d[:, 0], X_pca_2d[:, 1], c=y, edgecolor="none")
plt.xlabel("PC 1")
plt.ylabel("PC 2")
# NOTE(review): confirm that the notebook-level `cum_explained_variance` holds
# the 2-component MyPCA result when this cell runs; the title reads entry [1].
plt.title('2 components, captures {:.4f}% of total variation'.format(
    cum_explained_variance[1]*100))
plt.colorbar()
plt.show()

In [None]:
# Fit a 3-component MyPCA on X.
pca_3d = MyPCA(3)
pca_3d.fit(X)

# Keep the fitted eigenvalues and eigenvectors under short names.
values_3d, vectors_3d = pca_3d.eigenvalues_, pca_3d.eigenvectors_

# Print the first and the last ten entries of the eigenvalue array.
print(f'First 10 eigenvalues: {values_3d[:10]}')
print(f'\n\nLast 10 eigenvalues: {values_3d[-10:]}')

X_pca_3d = pca_3d.transform(X)  # Apply dimensionality reduction to X.
print('Transformed data shape:', X_pca_3d.shape)

In [None]:
# Display the three columns of the 3-D projection.
X_pca_3d[:, 0], X_pca_3d[:, 1], X_pca_3d[:,2]

In [None]:
# Cumulative explained variance as reported by the 3-component MyPCA.
cum_explained_variance_3d = pca_3d.cum_explained_variance_
cum_explained_variance_3d

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Create a new, larger figure.
fig = plt.figure(figsize=(10, 8))

# Add a 3-D subplot.
ax = fig.add_subplot(111, projection='3d')


# 3-D scatter of the three projected coordinates, coloured by the label array y.
# plt.get_cmap is used instead of plt.cm.get_cmap, which is deprecated and no
# longer available in newer Matplotlib releases; 10 requests a 10-entry colormap.
scatter = ax.scatter(X_pca_3d[:, 0], X_pca_3d[:, 1], X_pca_3d[:, 2], c=y, edgecolor="none",
                     alpha=1, cmap=plt.get_cmap("viridis", 10))


# Label the axes.
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.set_zlabel('PC 3')

# Title reports entry [2] of the cumulative explained variance, as a percentage.
ax.set_title('3 components, captures {:.2f}% of total variation'.format(
    cum_explained_variance_3d[2].round(4)*100))

# Add a colorbar for the label colours.
plt.colorbar(scatter, ax=ax, label='Label')

# Render the figure.
plt.show()

In [None]:
# Column-wise mean of X.
mean_X = np.mean(X, axis=0)

# Column-wise standard deviation of X; zero entries are replaced in place by a
# tiny positive value (meant to avoid errors when standardising).
std_X = np.std(X, axis=0)
np.putmask(std_X, std_X == 0, 1e-16)

# Map the 2-D projection back to feature space using the first two rows of vectors_2d.
# NOTE(review): assumes MyPCA works on standardised data and stores eigenvectors row-wise — confirm in my_pca.
X_reconstructed_2d = X_pca_2d.dot(vectors_2d[:2])

# Undo the standardisation to return to the original value scale.
data_reconstructed_2d = mean_X + (X_reconstructed_2d * std_X)

In [None]:
# Show 10 randomly chosen images from X; random_indices is kept at notebook
# level so other cells can draw the same rows.
random_indices = np.random.randint(0, len(X), size=10)

fig, axis = plt.subplots(1, 10, figsize=(20, 5))
for i, ax in zip(random_indices, axis.flat):
    # Each row of X is a flattened 28x28 image.
    grid_data = X[i].reshape(28, 28)
    ax.imshow(grid_data, cmap='binary', interpolation='none')

    number = y[i]
    ax.set_title(f"Number {number}")

In [None]:
# Show the 10 images at random_indices as reconstructed from the 2-D space.
fig, axis = plt.subplots(1, 10, figsize=(20, 5))
for i, ax in zip(random_indices, axis.flat):
    grid_data = data_reconstructed_2d[i].reshape(28, 28)
    ax.imshow(grid_data, cmap='binary', interpolation='none')

    number = y[i]
    ax.set_title(f"Number {number}")

In [None]:
# Map the 3-D projection back to feature space with the first three rows of
# vectors_3d, then undo the standardisation with std_X and mean_X.
X_reconstructed_3d = X_pca_3d.dot(vectors_3d[:3])
data_reconstructed_3d = mean_X + (X_reconstructed_3d * std_X)

# Show the 10 images at random_indices as reconstructed from the 3-D space.
fig, axis = plt.subplots(1, 10, figsize=(20, 5))
for i, ax in zip(random_indices, axis.flat):
    grid_data = data_reconstructed_3d[i].reshape(28, 28)
    ax.imshow(grid_data, cmap='binary', interpolation='none')

    number = y[i]
    ax.set_title(f"Number {number}")

In [None]:
# Fit a 100-component MyPCA on X and project X in one call.
pca_100d = MyPCA(100)
X_pca_100d = pca_100d.fit_transform(X)
# Entry [99] of the cumulative explained variance, printed as a percentage.
# The label previously read "Raito Varicane" (typo).
print(f"Ratio Variance: {pca_100d.cum_explained_variance_[99] * 100:.4f} %")

In [None]:
# Map the 100-D projection back to feature space with the first 100 rows of
# the fitted eigenvector array.
X_reconstructed_100d = np.dot(X_pca_100d, pca_100d.eigenvectors_[:100])
# Undo the standardisation. This must use the 100-D reconstruction; the earlier
# code rescaled X_reconstructed_3d here, so the images shown were the 3-D ones.
data_reconstructed_100d = (X_reconstructed_100d * std_X) + mean_X
# One 28x28 image per row.
reconstructed_images_100d = np.reshape(data_reconstructed_100d, (-1, 28, 28))

# Show the 10 images at random_indices as reconstructed from the 100-D space.
fig, axis = plt.subplots(1, 10, figsize=(20, 5))
for i, ax in zip(random_indices, axis.flat):
    grid_data = reconstructed_images_100d[i]
    ax.imshow(grid_data,
              interpolation='none', cmap='binary')

    number = y[i]
    ax.set(title=f"Number {number}")