# Plots

In [1]:
import os
import warnings
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [2]:
from tqdm import tqdm
from datetime import datetime
from sklearn.decomposition import PCA
from typing import List, Iterator, Dict

In [3]:
from scope.compressors import get_compressor
from scope.matrix import MatrixFactory as MatrixFactory
from scope.samples.sample_generator import generate_samples

In [4]:
# Fix the NumPy global seed so any NumPy-based randomness is repeatable.
np.random.seed(42)

# Suppress UserWarning and FutureWarning messages in the notebook output.
warnings.simplefilter('ignore', category=UserWarning)
warnings.simplefilter('ignore', category=FutureWarning)

# Many figures are created in a loop: disable the "too many open figures"
# warning and turn interactive mode off.
plt.rcParams.update({'figure.max_open_warning': 0})
plt.ioff()

In [5]:
# Input dataset: data/dataset/<FILE_NAME>.csv
FILE_NAME: str = 'clintox'
FILE_PATH: str = os.path.join('data', 'dataset', FILE_NAME + '.csv')

# Columns read from the dataset CSV.
SMILES_COLUMN: str = 'smiles'
LABEL_COLUMN: str = 'CT_TOX'

# Output locations; generated images go under <RESULTS_PATH>/Images.
RESULTS_PATH: str = os.path.join('data', 'results')
IMAGES_RESULTS_PATH: str = os.path.join(RESULTS_PATH, 'Images')

In [6]:
# Values shared by every parameter combination.
STR_SEPARATOR: str = '\t'
MIN_SAMPLES: int = 50

# Axes of the parameter grid; the Cartesian product of these four lists
# defines the combinations that are evaluated.
COMPRESSION_DISTANCES_TO_EVALUATE: List[str] = ['ncd', 'cdm', 'clm', 'mse']
COMPRESSORS_FUNCTIONS_TO_EVALUATE: List[str] = ['gzip', 'bz2', 'lz77', 'zstandard']
BEST_SIGMA: List[bool] = [True, False]
USE_MATCHING_METHOD: List[bool] = [True, False]

In [7]:
def generate_combinations() -> Iterator[Dict]:
    """
    Lazily yield the parameter combinations, one dictionary at a time.

    The combinations are the Cartesian product of the module-level lists of
    compression distances, compressors, best-sigma flags and matching-method
    flags. The string separator and the sample count are the same in every
    combination. Being a generator, it avoids holding all combinations in
    memory at once.

    Yields:
        Dict: One parameter combination with the keys ``str_separator``,
        ``min_samples``, ``compression_distance``, ``compressor``,
        ``best_sigma`` and ``use_matching_method``.
    """
    varying_keys = (
        'compression_distance',
        'compressor',
        'best_sigma',
        'use_matching_method',
    )
    parameter_grid = itertools.product(
        COMPRESSION_DISTANCES_TO_EVALUATE,
        COMPRESSORS_FUNCTIONS_TO_EVALUATE,
        BEST_SIGMA,
        USE_MATCHING_METHOD,
    )

    for grid_point in parameter_grid:
        # Constant entries first, then the varying ones, to keep the key order.
        combination = {
            'str_separator': STR_SEPARATOR,
            'min_samples': MIN_SAMPLES,
        }
        combination.update(zip(varying_keys, grid_point))
        yield combination

In [8]:
# Load the dataset and extract the SMILES column (inputs) and the label
# column (targets). ``to_numpy()`` is used instead of ``.values`` because it
# always returns a NumPy array, matching the annotations, whereas ``.values``
# can return a pandas ExtensionArray for extension dtypes.
dataframe: pd.DataFrame = pd.read_csv(FILE_PATH)
X: np.ndarray = dataframe[SMILES_COLUMN].to_numpy()
Y: np.ndarray = dataframe[LABEL_COLUMN].to_numpy()

In [9]:
# Create one output folder per plot type. ``os.makedirs`` also creates the
# missing parents (RESULTS_PATH and IMAGES_RESULTS_PATH), and ``exist_ok=True``
# makes the call safe to repeat without a separate existence check.
for plot_directory in ("compression_matrix", "pca_2d", "pca_3d"):
    os.makedirs(os.path.join(IMAGES_RESULTS_PATH, plot_directory), exist_ok=True)

In [10]:
# Materialise the generator so the combinations have a known length and can
# be iterated more than once.
combinations_list: List[Dict] = [*generate_combinations()]

## Compression Matrix Plots

In [11]:
def get_compression_plot(combination: dict, matrix_result_test: dict, y_target_test: float, figname: str, save: bool=False, show: bool=True) -> None:
    """
    Draw the matrices of ``matrix_result_test`` as heat maps on a 2x2 grid.

    Entries whose key contains ``'ScOPEC_'`` are drawn as reference panels
    (green frame and caption); entries whose key contains ``'ScOPES_'`` are
    drawn as query-sample panels (red frame and caption). The grid has four
    panels, so only the first four entries are drawn. A colorbar is added for
    the last image drawn.

    Args:
        combination: Parameter dictionary. The keys ``min_samples``,
            ``compressor``, ``compression_distance``, ``str_separator``,
            ``best_sigma`` and ``use_matching_method`` are read to build the
            caption at the bottom of the figure.
        matrix_result_test: Mapping from matrix name to the data passed to
            ``Axes.imshow``.
        y_target_test: True class of the query sample, shown in the title.
        figname: File name (without extension) used when saving. A timestamp
            is used when it is empty.
        save: When true, write ``<figname>.png`` into the
            ``compression_matrix`` folder under ``IMAGES_RESULTS_PATH``.
        show: When true, display the figure with ``plt.show()``.
    """
    # Honour the caller's file name; fall back to a timestamp only if empty.
    if not figname:
        figname = datetime.now().strftime("%Y%m%d_%H%M%S")

    fig, axes = plt.subplots(ncols=2, nrows=2, figsize=(14, 12), dpi=300)
    fig.suptitle(f"ScOPE Compression Matrix Visualization\nQuery Sample Classification (True Class: {y_target_test})",
                 fontsize=14, fontweight='bold', y=0.98)

    # Last image drawn; stays None when no entry matched a known prefix.
    img = None

    for index, key in enumerate(matrix_result_test):
        if index >= axes.size:
            # No panel left on the grid: ignore any remaining entries.
            break

        # Row-major panel order: (0, 0), (0, 1), (1, 0), (1, 1).
        ax = axes.flat[index]
        is_reference = 'ScOPEC_' in key

        if is_reference:
            class_name = key.replace('ScOPEC_', '')

            ax.text(0.5, -0.1, f'Reference Samples (Known Property of {class_name.replace("_", " ").title()})',
                    transform=ax.transAxes, ha='center', va='top',
                    fontsize=13, style='italic', color='darkgreen',
                    bbox=dict(boxstyle="round,pad=0.3", facecolor='lightgreen', alpha=0.3))

            img = ax.imshow(matrix_result_test[key], cmap="plasma", aspect='auto')

        elif 'ScOPES_' in key:
            # y is in axes-height units. NOTE(review): the large offset
            # presumably compensates for a very flat query panel - confirm.
            ax.text(0.5, -10, 'Query Sample (Unknown Property)',
                    transform=ax.transAxes, ha='center', va='top',
                    fontsize=13, style='italic', color='darkred',
                    bbox=dict(boxstyle="round,pad=0.3", facecolor='lightcoral', alpha=0.3))

            img = ax.imshow(matrix_result_test[key], cmap="plasma")

        # Heat maps carry no meaningful axis ticks.
        ax.set_xticks([])
        ax.set_yticks([])

        frame_color = 'darkgreen' if is_reference else 'darkred'
        for spine in ax.spines.values():
            spine.set_linewidth(2)
            spine.set_color(frame_color)

    if img is not None:
        # Dedicated axes on the right edge so the colorbar does not shrink a panel.
        cbar_ax = fig.add_axes([0.92, 0.25, 0.02, 0.5])
        cbar = fig.colorbar(img, cax=cbar_ax)
        cbar.set_label('Compression Distance\n(Information Content Shared)',
                       rotation=90, fontsize=13, labelpad=15)

    method_info = (
        f"Samples: {combination['min_samples']} | "
        f"Method: {combination['compressor'].title()} | "
        f"Distance: {combination['compression_distance'].upper()} | "
        f"Separator: {repr(combination['str_separator'])} | "
        f"Best Sigma: {combination['best_sigma']} | "
        f"Matching: {combination['use_matching_method']}")

    # Placed below the figure area; bbox_inches='tight' keeps it in the saved file.
    fig.text(0.5, -0.05, method_info, ha='center', va='bottom',
             fontsize=10, style='italic', color='gray',
             bbox=dict(boxstyle="round,pad=0.5", facecolor='lightgray', alpha=0.5))

    fig.subplots_adjust(left=0.07, right=0.90, top=0.9, bottom=0.08, wspace=0.3, hspace=0.4)

    if save:
        fig.savefig(
            os.path.join(IMAGES_RESULTS_PATH, "compression_matrix", f"{figname}.png"),
            dpi=300,
            bbox_inches='tight'
        )

    if show:
        plt.show()

    # Release the figure; many are created in a loop.
    plt.close(fig)

## PCA

### 2D

In [12]:
def get_pca2d_plot(combination: dict, matrix_result_test: dict, y_target_test: float, figname: str, save: bool=False, show: bool=True) -> None:
    """
    Plot a 2-component PCA projection of the matrices, grouped by class.

    Matrix names containing ``'class_'`` are grouped by the part of the name
    starting at ``'class_'``; the rows of each group are stacked in dictionary
    order and a PCA is fitted separately on each group. Within a group, the
    last row is plotted as the query sample and all other rows as reference
    points, whose mean in PCA space is plotted as the class prototype. For
    ``class_0`` and ``class_1`` a dashed line joins the sample to the
    prototype, annotated with their Euclidean distance.

    NOTE(review): treating the last stacked row as the query sample assumes
    the query matrix is the last entry of each class in
    ``matrix_result_test`` - confirm against the matrix producer.
    NOTE(review): since each class gets its own PCA fit, the coordinates of
    different classes do not share a projection - confirm this is intended.

    Args:
        combination: Parameter dictionary. The keys ``min_samples``,
            ``compressor``, ``compression_distance``, ``str_separator``,
            ``best_sigma`` and ``use_matching_method`` are read to build the
            caption at the bottom of the figure.
        matrix_result_test: Mapping from matrix name to an iterable of rows.
            Must produce the groups ``class_0`` and ``class_1``.
        y_target_test: True class of the query sample, shown in the title.
        figname: File name (without extension) used when saving. A timestamp
            is used when it is empty.
        save: When true, write ``<figname>.png`` into the ``pca_2d`` folder
            under ``IMAGES_RESULTS_PATH``.
        show: When true, display the figure with ``plt.show()``.

    Raises:
        KeyError: If no matrix belongs to ``class_0`` or to ``class_1``.
    """
    # Honour the caller's file name; fall back to a timestamp only if empty.
    if not figname:
        figname = datetime.now().strftime("%Y%m%d_%H%M%S")

    pca = PCA(n_components=2, whiten=False)

    fig = plt.figure(figsize=(14, 8), dpi=300)
    ax = fig.add_subplot(111)
    colors = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D', '#8E44AD', '#27AE60']

    # Collect the rows of every matrix under its class suffix ('class_<id>').
    grouped_classes = {}
    for matrix_name, values in matrix_result_test.items():
        suffix_start = matrix_name.find('class_')
        if suffix_start == -1:
            continue
        grouped_classes.setdefault(matrix_name[suffix_start:], []).extend(values)

    # One independent PCA fit per class.
    pca_results = {
        class_name: pca.fit_transform(np.vstack(arrays))
        for class_name, arrays in grouped_classes.items()
    }

    prototypes = {}
    for idx, (class_name, pca_transformed) in enumerate(pca_results.items()):
        reference_points = pca_transformed[:-1, :]  # every row except the last
        prototype = reference_points.mean(axis=0)
        prototypes[class_name] = prototype
        class_color = colors[idx % len(colors)]

        # Faint reference points.
        ax.scatter(
            reference_points[:, 0],
            reference_points[:, 1],
            alpha=0.3,
            s=40,
            color=class_color,
            label=None
        )

        # Prototype (centroid of the reference points).
        ax.scatter(
            prototype[0],
            prototype[1],
            label=f'{class_name.replace("_", " ").title()} (Prototype)',
            alpha=0.9,
            s=120,
            color=class_color,
            marker='o',
            edgecolors='black',
            linewidth=1.5
        )

    # Query sample projected with each class's PCA (last row of each group).
    query_classes = ('class_0', 'class_1')
    samples = {class_name: pca_results[class_name][-1, :] for class_name in query_classes}

    for idx, class_name in enumerate(query_classes):
        sample = samples[class_name]
        ax.scatter(
            sample[0],
            sample[1],
            label=f"Sample based on class {idx}",
            color=colors[idx],
            marker='X',
            s=200,
            alpha=1,
            edgecolors='black',
            linewidth=2
        )

    # Dashed line from each prototype to its sample, labelled with the distance.
    for idx, (class_name, line_alpha) in enumerate(zip(query_classes, (0.5, 0.6))):
        sample = samples[class_name]
        prototype = prototypes[class_name]

        ax.plot(
            [prototype[0], sample[0]],
            [prototype[1], sample[1]],
            color=colors[idx],
            linestyle='--',
            linewidth=2,
            alpha=line_alpha
        )
        distance = np.linalg.norm(sample - prototype)
        midpoint = (sample + prototype) / 2
        ax.text(
            midpoint[0], midpoint[1],
            f"{distance:.2f}",
            fontsize=10,
            color=colors[idx],
            fontweight='bold',
            bbox=dict(facecolor='white', edgecolor=colors[idx], boxstyle='round,pad=0.3', alpha=0.8)
        )

    ax.set_title(f"ScOPE PCA Visualization\nQuery Sample Classification (True Class: {y_target_test})",
                 fontsize=14, fontweight='bold', pad=20)

    ax.set_xlabel("Principal Component 1", fontsize=14)
    ax.set_ylabel("Principal Component 2", fontsize=14)

    # Legend outside the axes, on the right.
    ax.legend(loc="center left", bbox_to_anchor=(1, 0.5),
              frameon=True, fancybox=True, shadow=True, fontsize=11)

    ax.grid(True, alpha=0.3, linestyle='--')
    ax.set_facecolor('#fafafa')

    method_info = (
        f"Samples: {combination['min_samples']} | "
        f"Method: {combination['compressor'].title()} | "
        f"Distance: {combination['compression_distance'].upper()} | "
        f"Separator: {repr(combination['str_separator'])} | "
        f"Best Sigma: {combination['best_sigma']} | "
        f"Matching: {combination['use_matching_method']}"
    )

    fig.text(0.5, 0.02, method_info, ha='center', va='bottom',
             fontsize=10, style='italic', color='gray',
             bbox=dict(boxstyle="round,pad=0.3", facecolor='lightgray', alpha=0.7))

    fig.tight_layout()
    # Leave room at the bottom for the caption.
    fig.subplots_adjust(bottom=0.15)

    if save:
        fig.savefig(
            os.path.join(IMAGES_RESULTS_PATH, "pca_2d", f"{figname}.png"),
            dpi=300,
            bbox_inches='tight'
        )

    if show:
        plt.show()

    # Close this specific figure; many are created in a loop.
    plt.close(fig)

### 3D

In [13]:
def get_pca3d_plot(combination: dict, matrix_result_test: dict, y_target_test: float, figname: str, save: bool=False, show: bool=True) -> None:
    """
    Plot a 3-component PCA projection of the matrices, grouped by class.

    Matrix names containing ``'class_'`` are grouped by the part of the name
    starting at ``'class_'``; the rows of each group are stacked in dictionary
    order and a PCA is fitted separately on each group. Within a group, the
    last row is plotted as the query sample and all other rows as reference
    points, whose mean in PCA space is plotted as the class prototype. For
    ``class_0`` and ``class_1`` a dashed line joins the sample to the
    prototype, annotated with their Euclidean distance.

    NOTE(review): treating the last stacked row as the query sample assumes
    the query matrix is the last entry of each class in
    ``matrix_result_test`` - confirm against the matrix producer.

    Args:
        combination: Parameter dictionary. The keys ``min_samples``,
            ``compressor``, ``compression_distance``, ``str_separator``,
            ``best_sigma`` and ``use_matching_method`` are read to build the
            caption at the bottom of the figure.
        matrix_result_test: Mapping from matrix name to an iterable of rows.
            Must produce the groups ``class_0`` and ``class_1``.
        y_target_test: True class of the query sample, shown in the title.
        figname: File name (without extension) used when saving.
        save: When true, write ``<figname>.png`` into the ``pca_3d`` folder
            under ``IMAGES_RESULTS_PATH``.
        show: When true, display the figure with ``plt.show()``.
    """
    reducer = PCA(n_components=3, whiten=False)

    fig = plt.figure(figsize=(16, 12), dpi=300)
    ax = fig.add_subplot(111, projection='3d')

    palette = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D', '#8E44AD', '#27AE60']

    # Collect the rows of every matrix under its class suffix ('class_<id>').
    rows_by_class = {}
    for matrix_name, rows in matrix_result_test.items():
        _, marker, class_id = matrix_name.partition('class_')
        if not marker:
            continue
        rows_by_class.setdefault(marker + class_id, []).extend(rows)

    # One independent PCA fit per class.
    projected = {
        class_key: reducer.fit_transform(np.vstack(class_rows))
        for class_key, class_rows in rows_by_class.items()
    }

    centroids = {}
    for position, (class_key, points) in enumerate(projected.items()):
        references = points[:-1, :]  # every row except the last
        centroid = references.mean(axis=0)
        centroids[class_key] = centroid
        class_color = palette[position % len(palette)]

        # Faint reference points.
        ax.scatter(
            references[:, 0],
            references[:, 1],
            references[:, 2],
            alpha=0.3,
            s=40,
            color=class_color,
            label=None
        )

        # Prototype (centroid of the reference points).
        ax.scatter(
            centroid[0],
            centroid[1],
            centroid[2],
            label=f'{class_key.replace("_", " ").title()} (Prototype)',
            alpha=0.9,
            s=80,
            color=class_color,
            marker='o',
            edgecolors='black',
            linewidth=1.5
        )

    # Query sample projected with each class's PCA (last row of each group).
    query_keys = ('class_0', 'class_1')
    queries = {class_key: projected[class_key][-1, :] for class_key in query_keys}

    for position, class_key in enumerate(query_keys):
        query = queries[class_key]
        ax.scatter(
            query[0],
            query[1],
            query[2],
            label=f"Sample based on class {position}",
            color=palette[position],
            marker='X',
            s=200,
            alpha=1,
            edgecolors='black',
            linewidth=2
        )

    # Dashed line from each prototype to its sample, labelled with the distance.
    for position, (class_key, line_alpha) in enumerate(zip(query_keys, (0.5, 0.6))):
        query = queries[class_key]
        centroid = centroids[class_key]

        ax.plot(
            [centroid[0], query[0]],
            [centroid[1], query[1]],
            [centroid[2], query[2]],
            color=palette[position],
            linestyle='--',
            linewidth=2,
            alpha=line_alpha
        )
        separation = np.linalg.norm(query - centroid)
        halfway = (query + centroid) / 2
        ax.text(
            halfway[0], halfway[1], halfway[2],
            f"{separation:.2f}",
            fontsize=10,
            color=palette[position],
            fontweight='bold',
            bbox=dict(facecolor='white', edgecolor=palette[position], boxstyle='round,pad=0.3', alpha=0.8)
        )

    ax.set_title(f"ScOPE PCA 3D Visualization\nQuery Sample Classification (True Class: {y_target_test})",
                 fontsize=14, fontweight='bold', pad=20)

    ax.set_xlabel("Principal Component 1")
    ax.set_ylabel("Principal Component 2")
    ax.set_zlabel("Principal Component 3")

    # Legend outside the axes, on the right.
    ax.legend(loc="center left", bbox_to_anchor=(1, 0.5),
              frameon=True, fancybox=True, shadow=True, fontsize=11)

    ax.grid(True, alpha=0.3)

    # Unfilled, faint gray panes on all three axes.
    for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
        axis.pane.fill = False
        axis.pane.set_edgecolor('gray')
        axis.pane.set_alpha(0.1)

    caption = " | ".join([
        f"Samples: {combination['min_samples']}",
        f"Method: {combination['compressor'].title()}",
        f"Distance: {combination['compression_distance'].upper()}",
        f"Separator: {repr(combination['str_separator'])}",
        f"Best Sigma: {combination['best_sigma']}",
        f"Matching: {combination['use_matching_method']}",
    ])

    fig.text(0.5, 0.02, caption, ha='center', va='bottom',
             fontsize=10, style='italic', color='gray',
             bbox=dict(boxstyle="round,pad=0.3", facecolor='lightgray', alpha=0.7))

    fig.tight_layout()

    if save:
        output_path = os.path.join(IMAGES_RESULTS_PATH, "pca_3d", f"{figname}.png")
        plt.savefig(output_path, dpi=300, bbox_inches='tight')

    if show:
        plt.show()

    # Release the figure; many are created in a loop.
    plt.close(fig)

## Generate Plots

In [None]:
# Build the three plot types for every parameter combination and save them
# to disk without displaying them. Only the first sample produced by
# generate_samples is processed.
show = False
save = True

first_sample_only = itertools.islice(
    generate_samples(data=X, labels=Y, num_samples=MIN_SAMPLES), 1
)

for x_test, y_target_test, kw_samples_test in first_sample_only:
    for combination in tqdm(combinations_list, desc="Generating Plots"):

        # Matrix builder configured for this combination.
        matrix_test: MatrixFactory = MatrixFactory(
            compressor_module=get_compressor(name=combination['compressor']),
            name_distance_function=combination['compression_distance'],
            str_separator=combination['str_separator']
        )

        matrix_result_test: dict = matrix_test(x_test, kw_samples_test, get_best_sigma=combination['best_sigma'])

        # One shared timestamp names the three images of this combination.
        figname = datetime.now().strftime("%Y%m%d_%H%M%S")

        for plot_function in (get_compression_plot, get_pca2d_plot, get_pca3d_plot):
            plot_function(combination, matrix_result_test, y_target_test, figname=figname, show=show, save=save)

Generating Plots:   0%|          | 0/64 [00:00<?, ?it/s]

Generating Plots:  86%|████████▌ | 55/64 [15:49<28:20, 188.90s/it]