In [1]:
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt

def load_data_flexible(data_path):
    """
    Load data flexibly - handles individual JSON files, JSON arrays, and JSON Lines format
    """
    json_data = []
    json_files = glob.glob(os.path.join(data_path, '*.json'))
    
    print(f"Found {len(json_files)} JSON files")
    
    for file in json_files:
        with open(file, 'r') as f:
            try:
                # Try to load as regular JSON first (single object or array)
                data = json.load(f)
                if isinstance(data, list):
                    # If it's an array of results, extend the main list
                    json_data.extend(data)
                else:
                    # Single result
                    json_data.append(data)
            except json.JSONDecodeError:
                # If that fails, try JSON Lines format (one JSON object per line)
                f.seek(0)  # Reset file pointer
                line_count = 0
                for line in f:
                    line = line.strip()
                    if line:  # Skip empty lines
                        try:
                            data = json.loads(line)
                            json_data.append(data)
                            line_count += 1
                        except json.JSONDecodeError as e:
                            print(f"Error parsing line in {file}: {e}")
                print(f"Loaded {line_count} results from JSON Lines file: {os.path.basename(file)}")
    
    return json_data

# Load data from json_data folder
data_path = '/scratch/ty296/json_data'
json_data = load_data_flexible(data_path)
print(f"Loaded {len(json_data)} total data points")


Found 41013 JSON files
Loaded 200 results from JSON Lines file: 44412695_a0_L14.json
Loaded 200 results from JSON Lines file: 44413219_a0_L16.json
Loaded 200 results from JSON Lines file: 44412989_a0_L14.json
Loaded 200 results from JSON Lines file: 44412766_a0_L14.json
Loaded 200 results from JSON Lines file: 44421552_a0_L18.json
Loaded 200 results from JSON Lines file: 44414029_a0_L16.json
Loaded 0 results from JSON Lines file: 44414298_a0_L18.json
Loaded 200 results from JSON Lines file: 44412370_a0_L12.json
Loaded 200 results from JSON Lines file: 44413133_a0_L16.json
Loaded 200 results from JSON Lines file: 44414229_a0_L18.json
Loaded 200 results from JSON Lines file: 44413189_a0_L16.json
Loaded 200 results from JSON Lines file: 44412497_a0_L12.json
Loaded 200 results from JSON Lines file: 44413973_a0_L16.json
Loaded 200 results from JSON Lines file: 44412996_a0_L14.json
Loaded 200 results from JSON Lines file: 44414027_a0_L16.json
Loaded 0 results from JSON Lines file: 44414180_a

In [2]:
def group_data_by_params(json_data):
    # Dictionary to store grouped data
    grouped_data = {}
    
    for data in json_data:
        # Create a key tuple with the parameters we want to group by
        key = (
            data['args']['L'],
            data['args']['ancilla'],
            data['p_ctrl'],
            data['p_proj']
        )
        
        # If this parameter combination hasn't been seen before, initialize lists
        if key not in grouped_data:
            grouped_data[key] = {
                'EE': [],
                'O': [],
                'max_bond': []
            }
        
        # Append the values to their respective lists
        grouped_data[key]['EE'].append(data['EE'])
        grouped_data[key]['O'].append(data['O'])
        grouped_data[key]['max_bond'].append(data['max_bond'])
    
    return grouped_data

# Group the data
grouped_results = group_data_by_params(json_data)

# # Print results for each parameter combination
# for params, values in grouped_results.items():
#     L, ancilla, p_ctrl, p_proj = params
#     print(f"\nParameters:")
#     print(f"L = {L}, ancilla = {ancilla}, p_ctrl = {p_ctrl}, p_proj = {p_proj}")
#     print(f"Number of samples: {len(values['EE'])}")
#     print(f"EE values: {values['EE']}")
#     print(f"O values: {values['O']}")
#     print(f"Max bond values: {values['max_bond']}")
#     print("-" * 50)

# Example: Plot histograms for a specific parameter set
# Find the first parameter set in our grouped data
for key, values in grouped_results.items():

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # Plot EE histogram
    axes[0].hist(values['EE'], bins=10)
    axes[0].set_title('EE Distribution')
    axes[0].set_xlabel('EE')
    axes[0].set_ylabel('Frequency')
    # plt.hist(values['EE'], bins=10)
    # plt.savefig(f'/scratch/ty296/plots/EE_histogram_{key}.png')
    # plt.close()

    # Plot O histogram
    axes[1].hist(values['O'], bins=10)
    axes[1].set_title('O Distribution')
    axes[1].set_xlabel('O')
    axes[1].set_ylabel('Frequency')
    # plt.hist(values['O'], bins=10)
    # plt.savefig(f'/scratch/ty296/plots/O_histogram_{key}.png')
    # plt.close()

    # Plot max_bond histogram
    axes[2].hist(values['max_bond'], bins=10)
    axes[2].set_title('Max Bond Distribution')
    axes[2].set_xlabel('Max Bond')
    axes[2].set_ylabel('Frequency')
    # plt.hist(values['max_bond'], bins=10)
    # plt.savefig(f'/scratch/ty296/plots/max_bond_histogram_{key[0]}_a{key[1]}_p_ctrl{key[2]}_p_proj{key[3]}.png')
    # plt.close()

    fig.tight_layout()
    fig.savefig(f'/scratch/ty296/plots/histogram_{key[0]:03d}_a{key[1]:03d}_p_ctrl{key[2]:.3f}_p_proj{key[3]:.3f}.png')
    plt.close()
    # plt.tight_layout()
    # plt.show()

    # Print the parameters for the plotted data
    L, ancilla, p_ctrl, p_proj = key
    print(f"\nHistograms shown for parameters:")
    print(f"L = {L}, ancilla = {ancilla}, p_ctrl = {p_ctrl}, p_proj = {p_proj}")





Histograms shown for parameters:
L = 10, ancilla = 0, p_ctrl = 0.7894736842105263, p_proj = 0.5

Histograms shown for parameters:
L = 10, ancilla = 0, p_ctrl = 0.3684210526315789, p_proj = 0.5

Histograms shown for parameters:
L = 10, ancilla = 0, p_ctrl = 0.15789473684210525, p_proj = 0.5

Histograms shown for parameters:
L = 10, ancilla = 0, p_ctrl = 0.2631578947368421, p_proj = 0.5

Histograms shown for parameters:
L = 10, ancilla = 0, p_ctrl = 0.47368421052631576, p_proj = 0.5

Histograms shown for parameters:
L = 10, ancilla = 0, p_ctrl = 0.3157894736842105, p_proj = 0.5

Histograms shown for parameters:
L = 10, ancilla = 0, p_ctrl = 0.10526315789473684, p_proj = 0.5

Histograms shown for parameters:
L = 14, ancilla = 0, p_ctrl = 0.0, p_proj = 0.5

Histograms shown for parameters:
L = 14, ancilla = 0, p_ctrl = 0.05263157894736842, p_proj = 0.5

Histograms shown for parameters:
L = 14, ancilla = 0, p_ctrl = 0.10526315789473684, p_proj = 0.5

Histograms shown for parameters:
L = 14

In [3]:
import numpy as np
import matplotlib.pyplot as plt
# from scipy import stats  # Skip scipy for now

def calculate_sem(values):
    """Calculate standard error of the mean without scipy"""
    values = np.array(values)
    return np.std(values) / np.sqrt(len(values))

def group_by_L_and_p_ctrl(json_data):
    """Group data by L and p_ctrl to calculate statistics"""
    grouped = {}
    
    for data in json_data:
        L = data['args']['L']
        p_ctrl = data['p_ctrl']
        key = (L, p_ctrl)
        
        if key not in grouped:
            grouped[key] = []
        
        grouped[key].append(data['EE'])
    
    return grouped

def calculate_stats(ee_values):
    """Calculate mean and standard error of the mean"""
    ee_array = np.array(ee_values)
    mean = np.mean(ee_array)
    std_err = calculate_sem(ee_array)  # Use our custom function
    return mean, std_err

# Group data by L and p_ctrl
ee_grouped = group_by_L_and_p_ctrl(json_data)

# Organize data for plotting
plot_data = {}
for (L, p_ctrl), ee_values in ee_grouped.items():
    if L not in plot_data:
        plot_data[L] = {'p_ctrl': [], 'mean_EE': [], 'sem_EE': []}
    
    mean_ee, sem_ee = calculate_stats(ee_values)
    plot_data[L]['p_ctrl'].append(p_ctrl)
    plot_data[L]['mean_EE'].append(mean_ee)
    plot_data[L]['sem_EE'].append(sem_ee)

# Sort data by p_ctrl for each L
for L in plot_data:
    # Sort by p_ctrl
    sorted_indices = np.argsort(plot_data[L]['p_ctrl'])
    plot_data[L]['p_ctrl'] = np.array(plot_data[L]['p_ctrl'])[sorted_indices]
    plot_data[L]['mean_EE'] = np.array(plot_data[L]['mean_EE'])[sorted_indices]
    plot_data[L]['sem_EE'] = np.array(plot_data[L]['sem_EE'])[sorted_indices]

# Create the plot
fig, ax = plt.subplots(figsize=(10, 6))

colors = ['blue', 'red', 'green', 'orange', 'purple']
markers = ['o', 's', '^', 'D', 'v']

for i, L in enumerate(sorted(plot_data.keys())):
    color = colors[i % len(colors)]
    marker = markers[i % len(markers)]
    
    ax.errorbar(plot_data[L]['p_ctrl'], 
                plot_data[L]['mean_EE'], 
                yerr=plot_data[L]['sem_EE'],
                label=f'L = {L}',
                color=color,
                marker=marker,
                markersize=6,
                linewidth=2,
                capsize=3,
                capthick=1)

ax.set_xlabel('p_ctrl', fontsize=12)
ax.set_ylabel('Average EE', fontsize=12)
ax.set_title('Average Entanglement Entropy vs Control Probability', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3)

# Save the plot without displaying
fig.tight_layout()
fig.savefig('/scratch/ty296/plots/average_EE_vs_p_ctrl.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# Print some statistics
print("Data summary:")
for L in sorted(plot_data.keys()):
    n_points = len(plot_data[L]['p_ctrl'])
    print(f"L = {L}: {n_points} data points")
    print(f"  p_ctrl range: {plot_data[L]['p_ctrl'].min():.3f} to {plot_data[L]['p_ctrl'].max():.3f}")
    print(f"  EE range: {plot_data[L]['mean_EE'].min():.3f} ± {plot_data[L]['sem_EE'][np.argmin(plot_data[L]['mean_EE'])]:.3f} to {plot_data[L]['mean_EE'].max():.3f} ± {plot_data[L]['sem_EE'][np.argmax(plot_data[L]['mean_EE'])]:.3f}")

print("Plot saved successfully!")


Data summary:
L = 10: 20 data points
  p_ctrl range: 0.000 to 1.000
  EE range: 0.000 ± 0.000 to 0.800 ± 0.011
L = 12: 20 data points
  p_ctrl range: 0.000 to 1.000
  EE range: 0.000 ± 0.000 to 0.933 ± 0.012
L = 14: 20 data points
  p_ctrl range: 0.000 to 1.000
  EE range: 0.000 ± 0.000 to 1.095 ± 0.009
L = 16: 20 data points
  p_ctrl range: 0.000 to 1.000
  EE range: 0.000 ± 0.000 to 1.262 ± 0.010
L = 18: 20 data points
  p_ctrl range: 0.000 to 1.000
  EE range: 0.000 ± 0.000 to 1.385 ± 0.014
Plot saved successfully!


In [4]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

def group_by_L_and_p_ctrl(json_data):
    """Group data by L and p_ctrl to calculate statistics"""
    grouped = {}
    
    for data in json_data:
        L = data['args']['L']
        p_ctrl = data['p_ctrl']
        key = (L, p_ctrl)
        
        if key not in grouped:
            grouped[key] = []
        
        grouped[key].append(data['EE'])
    
    return grouped

def calculate_stats(ee_values):
    """Calculate mean and standard error of the mean"""
    ee_array = np.array(ee_values)
    mean = np.mean(ee_array)
    std_err = stats.sem(ee_array)  # Standard error of the mean
    return mean, std_err

# Group data by L and p_ctrl
ee_grouped = group_by_L_and_p_ctrl(json_data)

# Organize data for plotting
plot_data = {}
for (L, p_ctrl), ee_values in ee_grouped.items():
    if L not in plot_data:
        plot_data[L] = {'p_ctrl': [], 'mean_EE': [], 'sem_EE': []}
    
    print('L = ', L, 'p_ctrl = ', p_ctrl, 'ensemble size: ', len(ee_values))
    mean_ee, sem_ee = calculate_stats(ee_values)
    plot_data[L]['p_ctrl'].append(p_ctrl)
    plot_data[L]['mean_EE'].append(mean_ee)
    plot_data[L]['sem_EE'].append(sem_ee)

# Sort data by p_ctrl for each L
for L in plot_data:
    # Sort by p_ctrl
    sorted_indices = np.argsort(plot_data[L]['p_ctrl'])
    plot_data[L]['p_ctrl'] = np.array(plot_data[L]['p_ctrl'])[sorted_indices]
    plot_data[L]['mean_EE'] = np.array(plot_data[L]['mean_EE'])[sorted_indices]
    plot_data[L]['sem_EE'] = np.array(plot_data[L]['sem_EE'])[sorted_indices]

# Create the plot
fig, ax = plt.subplots(figsize=(10, 6))

colors = ['blue', 'red', 'green', 'orange', 'purple']
markers = ['o', 's', '^', 'D', 'v']

for i, L in enumerate(sorted(plot_data.keys())):
    color = colors[i % len(colors)]
    marker = markers[i % len(markers)]
    
    ax.errorbar(plot_data[L]['p_ctrl'], 
                plot_data[L]['mean_EE'], 
                yerr=plot_data[L]['sem_EE'],
                label=f'L = {L}',
                color=color,
                marker=marker,
                markersize=6,
                linewidth=2,
                capsize=3,
                capthick=1)

ax.set_xlabel('p_ctrl', fontsize=12)
ax.set_ylabel('Average EE', fontsize=12)
ax.set_title('Average Entanglement Entropy vs Control Probability', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3)

# Save the plot
fig.tight_layout()
fig.savefig('/scratch/ty296/plots/average_EE_vs_p_ctrl.png', dpi=300, bbox_inches='tight')
plt.close()

# Print some statistics
print("Data summary:")
for L in sorted(plot_data.keys()):
    n_points = len(plot_data[L]['p_ctrl'])
    print(f"L = {L}: {n_points} data points")
    print(f"  p_ctrl range: {plot_data[L]['p_ctrl'].min():.3f} to {plot_data[L]['p_ctrl'].max():.3f}")
    print(f"  EE range: {plot_data[L]['mean_EE'].min():.3f} ± {plot_data[L]['sem_EE'][np.argmin(plot_data[L]['mean_EE'])]:.3f} to {plot_data[L]['mean_EE'].max():.3f} ± {plot_data[L]['sem_EE'][np.argmax(plot_data[L]['mean_EE'])]:.3f}")

L =  10 p_ctrl =  0.7894736842105263 ensemble size:  1993
L =  10 p_ctrl =  0.3684210526315789 ensemble size:  1975
L =  10 p_ctrl =  0.15789473684210525 ensemble size:  1966
L =  10 p_ctrl =  0.2631578947368421 ensemble size:  1976
L =  10 p_ctrl =  0.47368421052631576 ensemble size:  1982
L =  10 p_ctrl =  0.3157894736842105 ensemble size:  1970
L =  10 p_ctrl =  0.10526315789473684 ensemble size:  1962
L =  14 p_ctrl =  0.0 ensemble size:  4000
L =  14 p_ctrl =  0.05263157894736842 ensemble size:  4000
L =  14 p_ctrl =  0.10526315789473684 ensemble size:  4000
L =  14 p_ctrl =  0.15789473684210525 ensemble size:  4000
L =  14 p_ctrl =  0.21052631578947367 ensemble size:  4000
L =  14 p_ctrl =  0.2631578947368421 ensemble size:  4000
L =  14 p_ctrl =  0.3157894736842105 ensemble size:  4000
L =  14 p_ctrl =  0.3684210526315789 ensemble size:  4000
L =  14 p_ctrl =  0.42105263157894735 ensemble size:  4000
L =  14 p_ctrl =  0.47368421052631576 ensemble size:  4000
L =  14 p_ctrl =  0.