In [2]:
import numpy as np
from scipy.spatial.distance import pdist, squareform
import pandas as pd
import os
import ot
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from IPython.display import display, Math

def rbf_kernel(X, Y=None, gamma=1.0):
    """Compute the RBF (Gaussian) kernel between X and Y."""
    if Y is None:
        Y = X
    dist = np.sum(X**2, axis=1)[:, np.newaxis] + np.sum(Y**2, axis=1)[np.newaxis, :] - 2 * np.dot(X, Y.T)
    return np.exp(-gamma * dist)

def compute_mmd(X, Y, gamma=1.0):
    """Compute Maximum Mean Discrepancy (MMD) between two sets of samples."""
    K_XX = rbf_kernel(X, X, gamma=gamma)
    K_YY = rbf_kernel(Y, Y, gamma=gamma)
    K_XY = rbf_kernel(X, Y, gamma=gamma)
    mmd = np.mean(K_XX) + np.mean(K_YY) - 2 * np.mean(K_XY)
    return mmd

def compute_wasserstein_nd(X, Y):
    """Compute the Wasserstein distance between two multi-dimensional distributions."""
    M = ot.dist(X, Y, metric='euclidean')
    n = X.shape[0]
    m = Y.shape[0]
    p = np.ones(n) / n  # Uniform distribution for X
    q = np.ones(m) / m  # Uniform distribution for Y
    dist = ot.emd2(p, q, M)
    return dist

def compute_proxy_a_distance(X, Y):
    """Compute Proxy-A-Distance (PAD) between two distributions using a Random Forest classifier."""
    # Combine datasets
    X_combined = np.vstack([X, Y])
    y_combined = np.hstack([np.zeros(X.shape[0]), np.ones(Y.shape[0])])  # 0 for X, 1 for Y
    
    # Train a Random Forest classifier and compute cross-validated accuracy
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    error_rates = 1 - cross_val_score(clf, X_combined, y_combined, cv=5, scoring='accuracy')
    avg_error_rate = np.mean(error_rates)
    
    # Compute PAD based on classification error rate
    pad_value = 2 * (1 - 2 * avg_error_rate)
    return pad_value

def dataset_feature_vectors(eval_path, columns_to_concat):
    df = pd.read_parquet(eval_path)
#     print(df)
    df = df.dropna(subset=columns_to_concat)
    return np.vstack(df[columns_to_concat].apply(lambda row: np.array(row.values.tolist()), axis=1))

# Configuration
columns_to_concat = [
    'cell_params.a', 
    'cell_params.b',
    'cell_params.c',
    'cell_params.alpha', 
    'cell_params.beta', 
    'cell_params.gamma', 
    'cell_params.implied_vol',
    'cell_params.gen_vol',
    'seq_len',
]
eval_paths = [
    '../nomodel/crystal_1k/nmax8_lmax5/crystal_train_1000.eval',
    '../cross-contamination/deciferdataset_experiment/boundarymasking/boundary_masking_100.eval',
    '../cross-contamination/deciferdataset_experiment/no_boundarymasking/no_boundary_masking_100.eval',
]
eval_names = [os.path.basename(path) for path in eval_paths]

# Extract feature vectors
feature_vectors = []
for path in eval_paths:
    vec = dataset_feature_vectors(path, columns_to_concat)
    feature_vectors.append(vec)

# Initialize results table
results = []

# Compute pairwise MMD, Wasserstein, and PAD
for i in range(len(eval_names)):
    for j in range(i + 1, len(eval_names)):
        mmd_value = compute_mmd(feature_vectors[i], feature_vectors[j], gamma=1.0)
        dist_nd = compute_wasserstein_nd(feature_vectors[i], feature_vectors[j])
        pad_value = compute_proxy_a_distance(feature_vectors[i], feature_vectors[j])
        
        # Store results in a list
        results.append({
            "Dataset 1": eval_names[i].split(".")[0],
            "Dataset 2": eval_names[j].split(".")[0],
            "MMD": mmd_value,
            "Wasserstein Distance": dist_nd,
            "Proxy-A-Distance": pad_value
        })

# Convert results to a DataFrame and display the table
results_df = pd.DataFrame(results)

print(results_df.to_latex())

from IPython.display import display, Latex

# Create LaTeX-like string to display
table_str = r"""
\begin{aligned}
& \text{Table: MMD, Wasserstein, and Proxy-A-Distance between datasets}\\
&\begin{array}{|c|c|c|c|c|}
\hline
\text{Dataset 1} & \text{Dataset 2} & \text{MMD}\;\downarrow & \text{WD}\;\downarrow & \text{PAD-RF}\;\uparrow \\
\hline
"""

# Add rows from DataFrame to the LaTeX string
for _, row in results_df.iterrows():
    table_str += f"\\text{{{row['Dataset 1']}}} & \\text{{{row['Dataset 2']}}} & {row['MMD']:.3f} & {row['Wasserstein Distance']:.2f} & {row['Proxy-A-Distance']:.2f} \\\\\n"
    table_str += r"\hline" + "\n"

# Close the table
table_str += r"\end{array}\end{aligned}"

# Display the LaTeX-like table
display(Latex(table_str))

\begin{tabular}{lllrrr}
\toprule
 & Dataset 1 & Dataset 2 & MMD & Wasserstein Distance & Proxy-A-Distance \\
\midrule
0 & crystal_train_1000 & boundary_masking_100 & 0.011466 & 633.618072 & 1.631831 \\
1 & crystal_train_1000 & no_boundary_masking_100 & 0.011954 & 583.466831 & 1.697086 \\
2 & boundary_masking_100 & no_boundary_masking_100 & 0.021374 & 127.511372 & 0.447503 \\
\bottomrule
\end{tabular}



<IPython.core.display.Latex object>

In [3]:
import subprocess
import os

def latex_to_png(latex_code, output_filename='output.png', dpi=300):
    # Step 1: Write LaTeX code to a .tex file
    tex_filename = 'latex_input.tex'
    with open(tex_filename, 'w') as f:
        # Ensure content is wrapped properly in a full LaTeX document
        f.write(r"""
        \documentclass{standalone}
        \usepackage{amsmath, booktabs} % Add more packages if necessary
        \begin{document}
        """ + latex_code + r"""
        \end{document}
        """)

    try:
        # Step 2: Compile the .tex file to a DVI file using `latex`
        subprocess.run(['latex', tex_filename], check=True)

        # Step 3: Convert the .dvi file to PNG using `dvipng`
        dvi_filename = tex_filename.replace('.tex', '.dvi')
        subprocess.run(['dvipng', '-D', str(dpi), '-T', 'tight', '-o', output_filename, dvi_filename], check=True)

        print(f"PNG image successfully created: {output_filename}")

    except subprocess.CalledProcessError as e:
        print(f"An error occurred: {e}")
    finally:
        # Optional: Clean up the intermediate files (DVI, AUX, LOG)
        for ext in ['aux', 'log', 'dvi', 'tex']:
            if os.path.exists(f'latex_input.{ext}'):
                os.remove(f'latex_input.{ext}')

# Example usage with a table or other complex LaTeX content
latex_code = r"""
\begin{tabular}{lccc}
    \toprule
    & \textbf{Metric 1} & \textbf{Metric 2} & \textbf{Metric 3} \\
    \midrule
    \text{Dataset 1} & 0.123 & 0.456 & 0.789 \\
    \text{Dataset 2} & 0.987 & 0.654 & 0.321 \\
    \bottomrule
\end{tabular}
"""

latex_to_png(latex_code, output_filename='table_example.png')


This is pdfTeX, Version 3.14159265-2.6-1.40.20 (TeX Live 2019/Debian) (preloaded format=latex)
 restricted \write18 enabled.
entering extended mode
(./latex_input.tex
LaTeX2e <2020-02-02> patch level 2
L3 programming layer <2020-02-14>
(/usr/share/texlive/texmf-dist/tex/latex/standalone/standalone.cls
Document Class: standalone 2018/03/26 v1.3a Class to compile TeX sub-files stan
dalone
(/usr/share/texlive/texmf-dist/tex/latex/tools/shellesc.sty)
(/usr/share/texlive/texmf-dist/tex/generic/iftex/ifluatex.sty
(/usr/share/texlive/texmf-dist/tex/generic/iftex/iftex.sty))
(/usr/share/texlive/texmf-dist/tex/latex/xkeyval/xkeyval.sty
(/usr/share/texlive/texmf-dist/tex/generic/xkeyval/xkeyval.tex
(/usr/share/texlive/texmf-dist/tex/generic/xkeyval/xkvutils.tex
(/usr/share/texlive/texmf-dist/tex/generic/xkeyval/keyval.tex))))
(/usr/share/texlive/texmf-dist/tex/latex/standalone/standalone.cfg)
(/usr/share/texlive/texmf-dist/tex/latex/base/article.cls
Document Class: article 2019/12/20 v1.4l Stand

