# Exploratory Data Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import seaborn as sns
import math
from pathlib import Path
import plotly.graph_objects as go
import numpy as np

# Make the project root importable. This assumes the notebook is executed from
# the 'notebooks/' directory, so the parent of the working directory is the root.
project_root = Path.cwd().parent
# Re-running this cell must not pile up duplicate entries on sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from configs.path_config import EXTRACTED_DATA_DIR, OUTPUT_DIR
print(EXTRACTED_DATA_DIR)

In [None]:
# Base directory of the extracted data (EXTRACTED_DATA_DIR is imported from configs.path_config)
base_dir = Path(EXTRACTED_DATA_DIR)

# Path to the folder containing CSV files
folder = base_dir / 'group_alvbrodel_shifted'

# Collect the CSV files in sorted order. iterdir() yields entries in arbitrary order, while the
# insertion order of `dfs` determines subplot positions and which dataset counts as the "first" one
# in later cells, so the order has to be deterministic.
files = sorted(file for file in folder.iterdir() if file.is_file() and file.suffix == '.csv')

# Maps file name without extension -> DataFrame with a parsed 'Time' column
dfs = {}

for file in files:
    name = file.stem  # File name without the extension
    try:
        # Only the read itself can hit a file that vanished after the directory listing
        dfs[name] = pd.read_csv(file, parse_dates=['Time'])
    except FileNotFoundError:
        print(f"The file was not found: {file}")
    else:
        print(f"Loaded: {file.name} as dfs[{name}]")

print(f'\n\nFound {len(files)} files in the folder {folder}.')


In [None]:
from src.processing import preprocessing

# Threshold forwarded to the preprocessing pipeline as `interpolate_threshold`;
# it is also reused in plot titles of later cells.
interpolate_threshold = 0

# Replace every loaded DataFrame by its preprocessed version (same keys, new values).
for name, raw_df in dfs.items():
    print(f'----{name}')
    dfs[name] = preprocessing.preprocessing_pipeline(
        raw_df,
        interpolate_threshold=interpolate_threshold,
    )


In [None]:
import random
from datetime import datetime, timedelta

def generate_random_timestamps(n, start="20090605000000", end="20210611160000", step_hours=4):
    """Draw ``n`` distinct random timestamps from a regular time grid.

    The grid starts at ``start`` and advances in steps of ``step_hours`` hours
    up to and including ``end`` (when ``end`` falls on the grid).

    Args:
        n: Number of timestamps to draw (sampling is without replacement).
        start: First grid point, formatted as ``%Y%m%d%H%M%S``.
        end: Upper bound of the grid (inclusive), formatted as ``%Y%m%d%H%M%S``.
        step_hours: Grid spacing in hours; must be positive.

    Returns:
        A list of ``n`` timestamp strings formatted as ``%Y%m%d%H%M%S``,
        in random order.

    Raises:
        ValueError: If ``step_hours`` is not positive, or (raised by
            ``random.sample``) if ``n`` exceeds the number of grid points.
    """
    if step_hours <= 0:
        raise ValueError("step_hours must be positive")

    timestamp_format = "%Y%m%d%H%M%S"
    first = datetime.strptime(start, timestamp_format)
    last = datetime.strptime(end, timestamp_format)
    step = timedelta(hours=step_hours)

    # Number of grid points in [first, last]; zero or negative when last < first,
    # in which case the grid is empty.
    grid_size = (last - first) // step + 1
    timestamps = [
        (first + k * step).strftime(timestamp_format)
        for k in range(grid_size)
    ]

    # Sample n random timestamps without replacement
    return random.sample(timestamps, n)

# Example usage: draw n random timestamps and print them.
n = 10  # Number of timestamps to draw; change as needed
random_timestamps = generate_random_timestamps(n)
print(random_timestamps)


In [None]:
# Print summary statistics per dataset, preceded by the dataset name so that
# each table can be attributed to its source.
for name, df in dfs.items():
    print(f'----{name}')
    print(df.describe())

In [None]:
# Grid of histograms: one subplot per dataset, three per row.
num_dfs = len(dfs)
cols = 3
rows = max(1, math.ceil(num_dfs / cols))  # Keep at least one row so the grid is always valid

# squeeze=False always yields a 2-D array of axes, so flatten() works for any grid shape
fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
axes = axes.flatten()

for ax, (name, df) in zip(axes, dfs.items()):
    sns.histplot(df['Strain'], bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of Strain\n{name}')
    ax.set_xlabel('Strain')
    ax.set_ylabel('Frequency')

# Remove the unused subplots. Slicing by the number of datasets avoids depending on a
# loop variable that leaks out of the loop above (and may be stale or undefined).
for unused_ax in axes[num_dfs:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Grid of boxplots: one subplot per dataset, three per row.
num_dfs = len(dfs)
cols = 3
rows = max(1, math.ceil(num_dfs / cols))  # Keep at least one row so the grid is always valid

# squeeze=False always yields a 2-D array of axes, so flatten() works for any grid shape
fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
axes = axes.flatten()

for ax, (name, df) in zip(axes, dfs.items()):
    sns.boxplot(y=df['Strain'], ax=ax)
    ax.set_title(f'Boxplot of Strain\n{name}')
    ax.set_ylabel('Strain')

# Remove the unused subplots. Slicing by the number of datasets avoids depending on a
# loop variable that leaks out of the loop above (and may be stale or undefined).
for unused_ax in axes[num_dfs:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
def valid_regions(df, threshold_valid):
    """Find runs of consecutive non-NaN 'Strain' values that are long enough.

    A run is a maximal sequence of index labels where each label equals the
    previous one plus 1, so the DataFrame index must consist of integers.

    Args:
        df: DataFrame with a 'Strain' column.
        threshold_valid: Minimum number of values (end - start + 1) a run
            must contain to be reported.

    Returns:
        List of (start, end) index-label tuples, both ends inclusive, in
        index order.
    """
    valid_indices = df.index[df['Strain'].notna()].tolist()
    consecutive_valid_regions = []
    last_position = len(valid_indices) - 1
    run_start = None

    for position, idx in enumerate(valid_indices):
        if run_start is None:
            run_start = idx

        # The run ends at the last valid label or when the next valid label is not adjacent
        run_ends_here = position == last_position or valid_indices[position + 1] != idx + 1
        if run_ends_here:
            if idx - run_start + 1 >= threshold_valid:
                consecutive_valid_regions.append((run_start, idx))
            run_start = None

    return consecutive_valid_regions

In [None]:
import matplotlib.colors as mcolors

# matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap is
# available in old and new releases alike.
tab10_color = mcolors.to_hex(plt.get_cmap('tab10')(0))  # Pick the first tab10 color

# Plot the raw strain series of every dataset inside a fixed time window and export it as PDF
for name, df in dfs.items():
    # Keep only rows from 2009-10-16 16:00 up to 2015-06-11 16:00 (both bounds inclusive)
    in_window = (df['Time'] >= '2009-10-16 16:00:00') & (df['Time'] <= '2015-06-11 16:00:00')
    df = df[in_window]

    fig = go.Figure()

    # Add the raw strain trace
    fig.add_trace(go.Scatter(x=df['Time'], y=df['Strain'], mode='lines', name='Raw Strain Data', line=dict(color=tab10_color)))

    fig.update_layout(
        width=1600,  # Figure size
        height=400,
        title=f'{name}',
        xaxis_title='Time',
        yaxis_title='Strain',
        xaxis_title_font=dict(size=20),
        yaxis_title_font=dict(size=20),
        xaxis_rangeslider_visible=False,
        xaxis=dict(
            tickformat='%b %Y',          # Abbreviated month name and year, e.g. "Jan 2012"
            tickfont=dict(size=14),
            dtick="M1",                  # Tick every month
            tickangle=45
        ),
        yaxis=dict(
            tickfont=dict(size=14)
        )
    )

    # Show figure
    fig.show()

    # Save figure as PDF; create the target directory first so write_image cannot fail on a missing folder
    save_path = OUTPUT_DIR / f'{name}_time_dependencies.pdf'
    save_path.parent.mkdir(parents=True, exist_ok=True)
    fig.write_image(save_path, format='pdf')
    print(f"Figure saved as {save_path}")

In [None]:
threshold_valid = 100  # Minimum number of consecutive valid values for a region to be reported
consecutive_regions_dict = {}  # Store consecutive valid regions for each dataset

# Window sizes (in samples) for the rolling means.
# NOTE(review): 6 and 42 correspond to one day / one week only if the series is sampled every 4 hours — confirm.
rolling_window_daily = 6
rolling_window_weekly = 42

# Compute valid regions for each dataset and plot them together with the smoothed series
for name, df in dfs.items():
    consecutive_valid_regions = valid_regions(df, threshold_valid)
    consecutive_regions_dict[name] = consecutive_valid_regions

    # Compute rolling means of the 'Strain' data; these columns are added to the DataFrame stored in dfs
    df['strain_rolling_mean_daily'] = df['Strain'].rolling(window=rolling_window_daily, min_periods=1).mean()
    df['strain_rolling_mean_weekly'] = df['Strain'].rolling(window=rolling_window_weekly, min_periods=1).mean()

    fig = go.Figure()
    fig.update_layout(width=1600, height=400)  # Set figure size

    # Add traces for the raw strain data and both rolling means
    fig.add_trace(go.Scatter(x=df['Time'], y=df['Strain'], mode='lines', name='Raw Strain Data'))
    fig.add_trace(go.Scatter(x=df['Time'], y=df['strain_rolling_mean_daily'], mode='lines', name='Rolling Mean (daily)', line=dict(dash='dashdot')))
    # The legend label matches the weekly window (it previously read "monthly")
    fig.add_trace(go.Scatter(x=df['Time'], y=df['strain_rolling_mean_weekly'], mode='lines', name='Rolling Mean (weekly)', line=dict(dash='dashdot')))

    # Shade the consecutive valid (non-NaN) regions
    for start, end in consecutive_valid_regions:
        fig.add_vrect(x0=df['Time'].loc[start], x1=df['Time'].loc[end], fillcolor='green', opacity=0.3, line_width=0)

    # Labels and title
    fig.update_layout(
        title=f'Consecutive regions of more than {threshold_valid} valid values (imputed up to {interpolate_threshold} steps) <br> {name}',
        xaxis_title='Time',
        yaxis_title='Strain',
        xaxis_rangeslider_visible=True  # Optional: adds a range slider at the bottom
    )

    # Show figure
    fig.show()


def find_overlapping_regions(*series_regions):
    """
    Intersect the region lists of several time series.

    Args:
        *series_regions: One list of (start, end) tuples per time series.

    Returns:
        List of (start, end) tuples covered by every series. With a single
        series its region list is returned as is; with no series the result
        is an empty list.
    """
    if len(series_regions) == 0:
        return []

    # Running intersection, seeded with the regions of the first series
    common = series_regions[0]

    for candidate in series_regions[1:]:
        # Pairwise intersection: keep each non-empty overlap between a region
        # found so far and a region of the next series.
        common = [
            (max(left_start, right_start), min(left_end, right_end))
            for left_start, left_end in common
            for right_start, right_end in candidate
            if max(left_start, right_start) <= min(left_end, right_end)
        ]

        # Nothing left to intersect with the remaining series
        if not common:
            break

    return common

# Compute overlapping regions across all datasets
overlapping_regions = find_overlapping_regions(*consecutive_regions_dict.values())
print(overlapping_regions)

# Apply the threshold to the overlapping regions. Regions are inclusive (start, end) index pairs,
# so a region holds end - start + 1 values — the same length definition valid_regions uses.
filtered_overlapping_regions = [
    (start, end) for start, end in overlapping_regions if (end - start + 1) >= threshold_valid
]

# reshape(-1, 2) keeps the array two-dimensional even when no region survives the threshold
filtered_overlapping_regions = np.array(filtered_overlapping_regions).reshape(-1, 2)

# Guard on the filtered result: regions may overlap and still all be shorter than the threshold
if len(filtered_overlapping_regions) > 0:
    filtered_overlapping_regions_length = filtered_overlapping_regions[:, 1] - filtered_overlapping_regions[:, 0] + 1
    print(f'Found {len(filtered_overlapping_regions_length)} overlapping regions with the lengths >={threshold_valid}: {filtered_overlapping_regions_length}')
else:
    print('No overlapping regions found')

# Create figure
fig = go.Figure()
fig.update_layout(
    width=1600,
    height=600,
    legend=dict(
        orientation="h",  # Horizontal orientation
        yanchor="top",  # Anchor the legend to the top of its container
        y=-0.2,  # Move the legend below the plot
        xanchor="center",
        x=0.5  # Center the legend horizontally
    )
)

# Add each time series to the figure
for name, df in dfs.items():
    fig.add_trace(go.Scatter(
        x=df['Time'],
        y=df['Strain'],
        mode='lines',
        name=name  # Dynamic name
    ))


first_key = next(iter(dfs))  # Get the first key from the dictionary
first_df = dfs[first_key]  # Retrieve the corresponding DataFrame

# Highlight the filtered overlapping regions using the first dataset's time reference.
# NOTE(review): this assumes every dataset maps the same index labels to the same times — confirm.
for start, end in filtered_overlapping_regions:
    fig.add_vrect(
        x0=first_df['Time'].loc[start],  # Use first df for time axis reference
        x1=first_df['Time'].loc[end],
        fillcolor='green',
        opacity=0.3,
        line_width=0
    )
# Show figure
fig.show()

In [None]:
import pandas as pd
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from pathlib import Path

# Merge DataFrames: one 'Strain' column per dataset, aligned on 'Time'
merged_df = pd.concat([df.set_index("Time")["Strain"].rename(name) for name, df in dfs.items()], axis=1)
corr_matrix = merged_df.corr()
# Column-wise Series.map instead of DataFrame.applymap, which is deprecated in recent pandas releases
corr_matrix_formatted = corr_matrix.apply(lambda column: column.map(lambda x: f'{x:.2f}'))

# matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap is available across versions
tab10 = plt.get_cmap('tab10')

# Choose two specific tab10 colors for the ends of the scale
color_start = mcolors.to_hex(tab10(3))  # Red, used for correlation -1
color_end = mcolors.to_hex(tab10(0))    # Blue, used for correlation +1

# Create a custom diverging scale: red -> white -> blue
custom_scale = [
    [0.0, color_start],
    [0.5, 'white'],
    [1.0, color_end]
]

# Create annotations for each cell
annotations = [
    dict(
        x=corr_matrix.columns[j],
        y=corr_matrix.index[i],
        text=corr_matrix_formatted.iat[i, j],
        showarrow=False,
        font=dict(color="black")
    )
    for i in range(corr_matrix.shape[0])
    for j in range(corr_matrix.shape[1])
]

# Create heatmap
fig = go.Figure(data=go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.index,
    colorscale=custom_scale,
    zmin=-1,
    zmax=1,
    colorbar=dict(
        # Nested title dict instead of the legacy `titleside` attribute
        title=dict(text="Correlation Index", side="right", font=dict(size=18)),
        tickvals=[-1, -0.5, 0, 0.5, 1],
        ticktext=["-1", "-0.5", "0", "0.5", "1"],
        tickfont=dict(size=16),
        xanchor="left"
    ),
))

fig.update_layout(
    title="Strain Correlation for All Points",
    title_font=dict(size=20),
    annotations=annotations,
    width=1000,
    height=1000,
    title_x=0.5,
    xaxis=dict(
        title="",
        tickfont=dict(size=12),
        tickangle=45,
        ticks="outside",
        showgrid=False
    ),
    yaxis=dict(
        title="",
        tickfont=dict(size=12),
        ticks="outside",
        showgrid=False
    ),
)

fig.show()

# Save; create the target directory first so write_image cannot fail on a missing folder
save_path = Path(OUTPUT_DIR / 'figures' / 'correlation_heatmaps' / 'correlation_heatmap_all.pdf')
save_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_image(save_path, width=1000, height=1000)


In [None]:
import pandas as pd
import plotly.express as px

# 'dfs' is a dictionary with dataset names as keys and DataFrames as values;
# each DataFrame provides the 'Time' and 'Strain' columns used below.

# Align the series on 'Time': one 'Strain' column per dataset
merged_df = pd.concat([df.set_index("Time")["Strain"].rename(name) for name, df in dfs.items()], axis=1)

# Compute the correlation matrix; missing values are excluded pairwise
corr_matrix = merged_df.corr()

# Round to two decimals for display. This replaces the float -> string -> float round trip
# through DataFrame.applymap, which is deprecated in recent pandas releases.
corr_matrix_rounded = corr_matrix.round(2)

# Create a Plotly heatmap and include the rounded numbers in each cell
fig = px.imshow(corr_matrix_rounded,
                color_continuous_scale='RdBu',
                labels={'x': 'Loop', 'y': 'Loop', 'color': 'Correlation'},
                title="Strain Correlation Between Strain Time Series (support V)",
                text_auto=True,  # Add correlation values in each cell
                zmin=-1,  # Fix the minimum value of the color scale to -1
                zmax=1   # Fix the maximum value of the color scale to 1
)
# Adjust the layout of the figure
fig.update_layout(width=1000, height=1000, title_x=0.5)  # Adjust the width and height of the plot

# Show the figure
fig.show()

# Save; create the target directory first so write_image cannot fail on a missing folder
save_path = Path(OUTPUT_DIR / 'figures' / 'correlation_heatmaps' / 'correlation_heatmap_support_V.pdf')
save_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_image(save_path, width=1000, height=1000)

### Availability of data

In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# import matplotlib.dates as mdates

# def heatmap(loops_to_keep):
#     # Read and process data
#     df = pd.read_csv('txt/filelist.txt', delimiter=',', header=None)
#     df.rename(columns={df.columns[0]: 'Date_path'}, inplace=True)
#     df['Datetime'] = pd.to_datetime(df['Date_path'], format='%Y%m%d%H%M%S')
#     df.rename(columns={df.columns[1]: 'Loop'}, inplace=True)
#     dates = df['Datetime'].unique()

#     loop_counts = df['Loop'].value_counts()
#     print(f'Number of channels: {len(loop_counts)}')

#     if loops_to_keep == 'all':
#         # Keep all rows if 'all' is selected
#         pass
#     elif isinstance(loops_to_keep, str) and loops_to_keep.endswith('.txt'):
#         # If it's a file path, read the loop names from the file
#         with open(loops_to_keep, 'r') as f:
#             loops_to_keep = [line.strip() for line in f.readlines()]  # Read and clean loop names
#         # Filter the DataFrame to only keep the rows with the loops in the 'loops_to_keep' list
#         df = df[df['Loop'].isin(loops_to_keep)]
#     else:
#         # If it's a list of loop names, use that list
#         df = df[df['Loop'].isin(loops_to_keep)]

#     # Create a boolean matrix using pivot_table()
#     boolean_matrix = df.pivot_table(index='Loop', columns='Datetime', aggfunc=lambda x: 1, fill_value=0)
#     boolean_matrix.columns = boolean_matrix.columns.droplevel(0)  # Drop the 'Date_path' level

#     # Plot heatmap with horizontal lines only
#     plt.figure(figsize=(160, 0.3*len(loops_to_keep)))
#     ax = sns.heatmap(boolean_matrix, cmap="Blues", cbar=False)

#     plt.title("Availability of Data Over Time")

#     # Ensure labels are not rotated
#     step = int(len(dates) / (len(dates)/30.5/6))
#     ax.set_xticks(range(235, len(dates), step))  # Set ticks at intervals (235 is when September begins so that the ticks are at the beginning of each month)
#     ax.set_xticklabels([dates[i].strftime('%Y-%m') for i in range(235, len(dates), step)], rotation=90, fontsize=30)

#     ax.hlines(y=[i + 1 for i in range(len(boolean_matrix))], xmin=0, xmax=len(boolean_matrix.columns), color='black', linewidth=0.5)

#     ax.set_yticklabels(boolean_matrix.index, rotation=0)

#     # Show plot
#     plt.show()


# # Define the option to read loops from a text file or specify 'all'
# # loops_to_keep = 'N-B_Far_Comp.txt'  # Option: 'all' or path to a text file (e.g., 'loops.txt')
# loops_to_keep = 'txt/comp_loops_notEI_filtered.txt'  # Option: 'all' or path to a text file (e.g., 'loops.txt')
# title = 'Heatmap'
# heatmap(loops_to_keep)

In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# import matplotlib.dates as mdates

# def heatmap(loops_to_keep):

#     # Ensure loops_to_keep is a list, even if it's a single string
#     if isinstance(loops_to_keep, str):
#         loops_to_keep = [loops_to_keep]

#     # Read and process data
#     df = pd.read_csv('txt/filelist.txt', delimiter=',', header=None)
#     df.rename(columns={df.columns[0]: 'Date_path'}, inplace=True)
#     df['Datetime'] = pd.to_datetime(df['Date_path'], format='%Y%m%d%H%M%S')
#     df.rename(columns={df.columns[1]: 'Loop'}, inplace=True)
#     dates = df['Datetime'].unique()

#     loop_counts = df['Loop'].value_counts()
#     print(f'Number of channels: {len(loop_counts)}')

#     print(type(loops_to_keep))

#     df = df[df['Loop'].isin(loops_to_keep)]

#     # Create a boolean matrix using pivot_table()
#     boolean_matrix = df.pivot_table(index='Loop', columns='Datetime', aggfunc=lambda x: 1, fill_value=0)
#     boolean_matrix.columns = boolean_matrix.columns.droplevel(0)  # Drop the 'Date_path' level

#     # Plot heatmap with horizontal lines only
#     plt.figure(figsize=(160, 0.3*len(loops_to_keep)))
#     ax = sns.heatmap(boolean_matrix, cmap="Blues", cbar=False)

#     plt.title("Availability of Data Over Time")

#     # Ensure labels are not rotated
#     step = int(len(dates) / (len(dates)/30.5/6))
#     ax.set_xticks(range(235, len(dates), step))  # Set ticks at intervals (235 is when September begins so that the ticks are at the beginning of each month)
#     ax.set_xticklabels([dates[i].strftime('%Y-%m') for i in range(235, len(dates), step)], rotation=90, fontsize=30)

#     ax.hlines(y=[i + 1 for i in range(len(boolean_matrix))], xmin=0, xmax=len(boolean_matrix.columns), color='black', linewidth=0.5)

#     ax.set_yticklabels(boolean_matrix.index, rotation=0)

#     # Show plot
#     plt.show()


# # Define the option to read loops from a text file or specify 'all'
# # Read the loop names from the .txt file before calling the function
# # with open('txt/comp_loops_notEI_filtered1.txt', 'r') as f:
# #     loops_to_keep = [line.strip() for line in f.readlines()]
# loops_to_keep = ['N-B_Far_Comp.txt', 'N-Klaff_Comp.txt']  # Option: 'all' or path to a text file (e.g., 'loops.txt')
# print(f'Loops to keep: {loops_to_keep}')
# heatmap(loops_to_keep)