In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
from scipy.ndimage import gaussian_filter, median_filter
from sklearn.decomposition import PCA

## PREPROCESSING

In [51]:
# Load the noisy images; the rest of the notebook treats each row as one
# flattened square image (see show_images).
X = pd.read_csv("src/noisy_images.csv")

In [46]:
# Preview the first rows to check that the pixel columns loaded as expected.
X.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,-176.321245,-52.74385,143.604939,-53.567749,80.289381,20.686379,-197.423519,-229.401206,-221.900009,64.194187,...,34.40707,1.950735,-25.095565,133.684095,-21.664094,-94.305438,-55.987821,-89.929231,40.394774,-214.75448
1,-158.421239,16.371695,62.810879,263.533916,-193.92032,-25.366668,107.062706,125.403427,83.536343,-55.71364,...,-5.419961,-123.58403,20.240434,-25.699206,-128.54593,52.525885,-54.214887,-133.842624,-30.141215,210.408665
2,-290.34375,81.5865,12.615232,146.567851,111.233602,-188.989259,-101.464605,-107.015195,-13.069827,-245.921093,...,-139.909318,-85.214133,167.495617,62.402411,-144.40297,152.26395,-4.687051,-59.270131,-93.1936,188.229794
3,-208.84059,136.190431,38.552191,-67.825346,24.316303,176.103673,31.581298,-163.582673,29.777077,-110.969396,...,-131.385192,40.329733,-10.111639,163.497435,41.010287,-21.408008,328.274235,-15.341672,121.570863,151.757537
4,-328.876288,-42.8629,174.651874,-228.833439,71.909654,-97.206392,48.048853,-34.071313,3.820465,137.80773,...,-135.713199,-71.396796,155.237981,-141.860908,155.657335,166.60976,-52.911774,267.150703,-36.749672,131.913772


## DISPLAY

In [20]:
def show_images(data: pd.DataFrame, title="Result", col_wrap=10):
    """Display every row of ``data`` as a square image in a faceted grid.

    Args:
        data: Frame whose rows are flattened square images. The side length
            is the truncated square root of the number of columns.
        title: Title shown above the figure.
        col_wrap: Number of images per row of the grid.
    """
    n_pixels = data.shape[1]
    edge = int(np.sqrt(n_pixels))

    # One (edge x edge) slice per row; axis 0 becomes the facet dimension.
    image_stack = data.values.reshape(-1, edge, edge)

    figure = px.imshow(
        image_stack,
        binary_string=True,
        facet_col=0,
        facet_col_wrap=col_wrap,
        title=title,
    )
    figure.show()

## DENOISE

### FILTERS

In [21]:
def apply_gaussian_filter(images, side_length, sigma=1):
    """Gaussian-smooth each flattened image and return the flattened results.

    Args:
        images: DataFrame with one flattened ``side_length x side_length``
            image per row.
        side_length: Edge length used to reshape each row into a 2-D image.
        sigma: Standard deviation handed to ``scipy.ndimage.gaussian_filter``.

    Returns:
        A list with one flattened, filtered array per input row.
    """
    return [
        gaussian_filter(flat.reshape(side_length, side_length), sigma=sigma).flatten()
        for flat in images.values
    ]


def apply_median_filter(images, side_length, size=3):
    """Median-filter each flattened image and return the flattened results.

    Args:
        images: DataFrame with one flattened ``side_length x side_length``
            image per row.
        side_length: Edge length used to reshape each row into a 2-D image.
        size: Window size handed to ``scipy.ndimage.median_filter``.

    Returns:
        A list with one flattened, filtered array per input row.
    """
    return [
        median_filter(flat.reshape(side_length, side_length), size=size).flatten()
        for flat in images.values
    ]

def apply_filters(images: pd.DataFrame, side_length: int, sigma: int = 1, size: int = 3) -> pd.DataFrame:
    """Run the median filter, then the Gaussian filter, over every image row.

    Args:
        images: DataFrame with one flattened square image per row.
        side_length: Edge length of each image.
        sigma: Gaussian standard deviation for the second pass.
        size: Median window size for the first pass.

    Returns:
        A new DataFrame holding the filtered, flattened images.
    """
    median_rows = apply_median_filter(images, side_length, size=size)
    median_frame = pd.DataFrame(median_rows)

    smoothed_rows = apply_gaussian_filter(median_frame, side_length, sigma=sigma)
    return pd.DataFrame(smoothed_rows)

### PCA + APPLICATION

In [56]:
# number of images per row in the facet grid (forwarded to show_images as col_wrap)
col_wrap = 10

# show the noisy input images before any denoising
show_images(X, "Base", col_wrap=col_wrap)

def denoise(images: pd.DataFrame,
            pca_components: float = .6,
            gaussian_sigma: int = 1,
            median_size: int = 1,
            filters: bool = True):
    """Denoise images by PCA reconstruction, optionally followed by filtering.

    Args:
        images: DataFrame with one flattened square image per row.
        pca_components: Passed to scikit-learn's ``PCA`` as its first
            positional argument.
        gaussian_sigma: ``sigma`` forwarded to ``apply_filters``.
        median_size: ``size`` forwarded to ``apply_filters``.
        filters: When true, the reconstructed images are passed through
            ``apply_filters`` before being returned.

    Returns:
        DataFrame of reconstructed (and optionally filtered) images.
    """
    edge = int(np.sqrt(images.shape[1]))

    # Project onto the retained components, then map back to pixel space.
    reducer = PCA(pca_components)
    scores = reducer.fit_transform(images.values)
    restored = pd.DataFrame(reducer.inverse_transform(scores))

    if not filters:
        return restored

    return apply_filters(restored,
                         edge,
                         sigma=gaussian_sigma,
                         size=median_size)

# Denoise with the default settings and keep the output for the export step.
result = denoise(X)

# Display the denoised images with the same grid layout as the base images.
show_images(result, "Result", col_wrap=col_wrap)

## EXPORT

In [7]:
# Re-attach the original column labels and index to the denoised values.
df_reconstructed = pd.DataFrame(result.values, columns=X.columns, index=X.index)

# to_csv does not create missing folders, so make sure the target directory
# exists; otherwise a fresh checkout fails here on Restart & Run All.
Path("output").mkdir(parents=True, exist_ok=True)
df_reconstructed.to_csv("output/denoised_images.csv", index=False)

## REIMPORT

In [8]:
# Read the exported file back to check that the round trip preserved the images.
result_df = pd.read_csv("output/denoised_images.csv")

# The 28 x 28 shape is hard-coded here; it matches the 784 pixel columns
# (pixel0..pixel783) shown in the preview of the input data.
fig_noise = px.imshow(result_df.values.reshape(-1, 28, 28),
                      binary_string=True,
                      facet_col=0,
                      facet_col_wrap=10,
                      title="Result")
fig_noise.show()

Ok, faisons une PCA manuelle maintenant !

## MANUAL

In [94]:
class MyPCA:
    """Principal component analysis on standardized data, using NumPy only.

    The data is standardized per column (zero mean, unit standard deviation),
    the covariance matrix of the standardized data is eigen-decomposed, and the
    eigenvectors with the largest eigenvalues are kept as projection axes.

    Args:
        n_components: How many components to keep.
            * ``None``: keep all of them.
            * ``int``: keep that many leading components.
            * ``float`` strictly between 0 and 1: keep the smallest number of
              leading components whose cumulative explained-variance ratio
              exceeds that fraction.
    """

    def __init__(self, n_components: int = None) -> None:
        self._n_components = n_components
        self._covariance_matrix = None
        self._eigen_values = None
        self._eigen_vectors = None
        self._sorted_eigen_values = None
        # Initialised here so "not fitted yet" can be detected reliably.
        self._selected_eigen_vectors = None
        self._X_mean = None
        self._X_std_dev = None

    def standardize_data(self, X):
        """Center each column and scale it to unit standard deviation.

        Returns:
            Tuple ``(X_standardized, mean, std_dev)``. For columns whose
            standard deviation is exactly zero, the returned ``std_dev`` entry
            is 1 so that the division (and its inverse) leave them finite.
        """
        mean = np.mean(X, axis=0)
        std_dev = np.std(X, axis=0)
        # A constant column would otherwise yield 0/0 = NaN and poison the
        # covariance matrix; dividing by 1 keeps it at zero after centering.
        std_dev = np.where(std_dev == 0, 1.0, std_dev)
        X_standardized = (X - mean) / std_dev

        return X_standardized, mean, std_dev

    def calculate_covariance_matrix(self, X):
        """Return the feature-by-feature covariance matrix (columns are variables)."""
        # atleast_2d: np.cov returns a 0-d array for a single feature, which
        # the eigen-decomposition cannot handle.
        return np.atleast_2d(np.cov(X, rowvar=False))

    def get_eig(self):
        """Return ``(eigenvalues, eigenvectors)`` of the stored covariance matrix."""
        # A covariance matrix is symmetric, so use the symmetric solver: it
        # guarantees real eigenvalues and orthonormal eigenvectors, whereas the
        # general solver can return complex pairs for rank-deficient input.
        return np.linalg.eigh(self._covariance_matrix)

    def _get_sorted_eigen_vectors(self, sorted_indices):
        """Reorder eigenvector columns according to ``sorted_indices``."""
        return self._eigen_vectors[:, sorted_indices]

    def get_sorted_indices(self):
        """Return the indices that sort the eigenvalues in descending order."""
        return np.argsort(self._eigen_values)[::-1]

    def _get_sorted_eigen_values(self, sorted_indices):
        """Reorder eigenvalues according to ``sorted_indices``."""
        return self._eigen_values[sorted_indices]

    def _resolve_n_components(self, n_features):
        """Translate the configured ``n_components`` into a column count."""
        requested = self._n_components
        if requested is None:
            return n_features

        if isinstance(requested, (float, np.floating)) and 0 < requested < 1:
            cumulative = np.cumsum(self.get_explained_variance_ratio())
            # Smallest count whose cumulative ratio exceeds the fraction.
            count = int(np.searchsorted(cumulative, requested, side="right")) + 1
            return min(count, n_features)

        # Integers (and anything else) are used as the slice bound unchanged.
        return requested

    def fit_transform(self, X):
        """Fit the decomposition on ``X`` and return its projection.

        Args:
            X: 2-D array with one sample per row and one feature per column.

        Returns:
            Array of shape ``(n_samples, n_selected_components)``. The sign of
            each component is arbitrary, as for any eigen-decomposition.
        """
        X_standardized, self._X_mean, self._X_std_dev = self.standardize_data(X)

        self._covariance_matrix = self.calculate_covariance_matrix(X_standardized)

        self._eigen_values, self._eigen_vectors = self.get_eig()

        sorted_indices = self.get_sorted_indices()
        sorted_eigen_vectors = self._get_sorted_eigen_vectors(sorted_indices)
        self._sorted_eigen_values = self._get_sorted_eigen_values(sorted_indices)

        # Resolved per fit and kept local, so the configured value is never
        # overwritten and a later fit on differently shaped data still works.
        n_selected = self._resolve_n_components(sorted_eigen_vectors.shape[1])
        self._selected_eigen_vectors = sorted_eigen_vectors[:, :n_selected]

        return np.dot(X_standardized, self._selected_eigen_vectors)

    def get_explained_variance_ratio(self):
        """Return each eigenvalue's share of the total, in descending order."""
        return self._sorted_eigen_values / np.sum(self._sorted_eigen_values)

    def inverse_transform(self, X_pca):
        """Map projected data back to the original feature space.

        Raises:
            AttributeError: If called before ``fit_transform``.
        """
        if self._selected_eigen_vectors is None:
            raise AttributeError(
                "MyPCA is not fitted yet; call fit_transform() before inverse_transform()."
            )

        X_original = np.dot(X_pca, self._selected_eigen_vectors.T)
        # Undo the standardization applied during fitting.
        X_original = (X_original * self._X_std_dev) + self._X_mean
        X_original = np.real(X_original)  # drop any imaginary part of complex input

        return X_original


def apply_filters(images, side_length, sigma: int = None, size: int = None):
    """Optionally median-filter, then optionally Gaussian-smooth, each image.

    Args:
        images: DataFrame with one flattened ``side_length x side_length``
            image per row.
        side_length: Edge length used to reshape each row into a 2-D image.
        sigma: Gaussian standard deviation; the Gaussian pass is skipped when
            this is falsy (``None`` or 0).
        size: Median window size; the median pass is skipped when this is
            falsy (``None`` or 0).

    Returns:
        DataFrame with one flattened, filtered image per row.
    """
    def _filter_one(flat):
        picture = flat.reshape(side_length, side_length)
        if size:
            picture = median_filter(picture, size=size)
        if sigma:
            picture = gaussian_filter(picture, sigma=sigma)
        return picture.flatten()

    return pd.DataFrame([_filter_one(flat) for flat in images.values])

def denoise(images: pd.DataFrame,
            pca_components: float = .6,
            gaussian_sigma: int = 1,
            median_size: int = 1,
            filters: bool = True):
    """Denoise images by optional filtering followed by a MyPCA reconstruction.

    Args:
        images: DataFrame with one flattened square image per row.
        pca_components: Forwarded unchanged to ``MyPCA(n_components=...)``.
        gaussian_sigma: ``sigma`` forwarded to ``apply_filters``.
        median_size: ``size`` forwarded to ``apply_filters``.
        filters: When true, ``apply_filters`` runs on the images before the
            PCA step.

    Returns:
        DataFrame of reconstructed images.

    Raises:
        ValueError: If the reconstruction is a complex-valued array.
    """
    edge = int(np.sqrt(images.shape[1]))

    # Filtering comes first here; PCA then reconstructs the filtered images.
    prepared = (
        apply_filters(images, edge, sigma=gaussian_sigma, size=median_size)
        if filters
        else images
    )

    model = MyPCA(n_components=pca_components)
    scores = model.fit_transform(prepared.values)
    restored = model.inverse_transform(scores)

    if np.iscomplexobj(restored):
        raise ValueError("Complex values")

    return pd.DataFrame(restored)


In [110]:
# number of images per row in the facet grid
col_wrap = 10

# Show the noisy input images for comparison.
show_images(X, "Base", col_wrap=col_wrap)

# Denoise with the manual PCA, keeping 20 components; filter settings are passed explicitly.
result = denoise(X, pca_components=20, gaussian_sigma=1, median_size=1)

# Display the denoised images with the same grid layout.
show_images(result, "Result", col_wrap=col_wrap)