In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from tqdm import tqdm


### Data Preparation

In [17]:
# Resolve the project root as the parent of the current working directory
# (so the data lives in ../data relative to the cwd), then derive the data
# and plot directories from it.
root_path = os.path.abspath(os.path.dirname(os.getcwd()))
train_path, test_path, processed_path = (
    os.path.join(root_path, 'data', subdir)
    for subdir in ('train', 'test', 'processed')
)
plots_path = os.path.join(root_path, 'plots')

In [None]:
# Train file ids: 1-4, 6-10 and 51-60 (there is no id 5 in this list).
file_ids = [str(n) for n in (*range(1, 5), *range(6, 11), *range(51, 61))]

for file_id in file_ids:
    # Notebook-level variable name -> parquet file name for this file id.
    # NOTE(review): the metrology file name has no underscore before the id,
    # unlike the other two -- confirm this matches the files on disk.
    parquet_by_variable = {
        f'run_data_{file_id}': f"run_data_{file_id}.parquet",
        f'incoming_run_data_{file_id}': f"incoming_run_data_{file_id}.parquet",
        f'metrology_data_{file_id}': f"metrology_data{file_id}.parquet",
    }
    # Each frame is published as a global so later cells can refer to it by
    # name (e.g. run_data_1) or look it up through globals().
    for variable, parquet_name in parquet_by_variable.items():
        globals()[variable] = pd.read_parquet(os.path.join(train_path, parquet_name))

In [None]:
# Read the two test tables from the test directory.
test_run_data, test_incoming_run_data = (
    pd.read_parquet(os.path.join(test_path, parquet_name))
    for parquet_name in ("run_data.parquet", "incoming_run_data.parquet")
)

In [None]:
# Total number of rows over all train run_data frames.
s = sum(len(globals()[f'run_data_{file_id}']) for file_id in file_ids)
print(f"Total number of entries in run_data: {s}")

## Basic Properties

### Tool ID
* A single and unique Tool ID for each run/incoming run file
* Corresponding run/incoming run files share the same Tool ID

In [None]:
# Collect the Tool ID of every train file and show, per file, whether the
# run and incoming-run tables list the same Tool IDs.
unique_tools = set()
for file_id in file_ids:
    tool_ids_run, tool_ids_incoming = (
        globals()[f'{table}_{file_id}']['Tool ID'].unique()
        for table in ('run_data', 'incoming_run_data')
    )
    print(tool_ids_run == tool_ids_incoming)
    # Only the first Tool ID of the run table is recorded.
    unique_tools.add(tool_ids_run[0])

### Run IDs


In [None]:
# Per file: number of distinct run ids, row counts of the three tables, and
# whether all three tables contain exactly the same set of Run IDs.
for file_id in file_ids:
    run_data = globals()[f'run_data_{file_id}']
    incoming_run_data = globals()[f'incoming_run_data_{file_id}']
    metrology_data = globals()[f'metrology_data_{file_id}']

    run_ids_run, run_ids_incoming_run, run_ids_metrology = (
        frame['Run ID'] for frame in (run_data, incoming_run_data, metrology_data)
    )
    unique_run_ids_run = set(run_ids_run.unique())
    unique_run_ids_incoming = set(run_ids_incoming_run.unique())
    unique_run_ids_metrology = set(run_ids_metrology.unique())

    same_run_ids = (
        unique_run_ids_run == unique_run_ids_incoming
        and unique_run_ids_incoming == unique_run_ids_metrology
    )
    for value in (
        file_id,
        len(unique_run_ids_run),
        len(run_ids_run),
        len(run_ids_incoming_run),
        len(run_ids_metrology),
        same_run_ids,
    ):
        print(value)
    print()

### Sensor Names

In [None]:
# The sensor names of file 1 (in order of first appearance) are the reference.
reference_sensors = run_data_1['Sensor Name'].unique()
print(reference_sensors)
for file_id in file_ids:
    run_data = globals()[f'run_data_{file_id}']
    # np.array_equal yields a single True/False per file and is well-defined
    # even when the two lists differ in length; element-wise `==` is not.
    print(np.array_equal(run_data['Sensor Name'].unique(), reference_sensors))

In [None]:
# The incoming-run sensor names of file 1 (in order of first appearance) are
# the reference.
reference_incoming_sensors = incoming_run_data_1['Sensor Name'].unique()
print(reference_incoming_sensors)
for file_id in file_ids:
    incoming_run_data = globals()[f'incoming_run_data_{file_id}']
    # np.array_equal yields a single True/False per file and is well-defined
    # even when the two lists differ in length; element-wise `==` is not.
    print(np.array_equal(incoming_run_data['Sensor Name'].unique(), reference_incoming_sensors))

In [None]:
# Preview the first five rows of the file-1 run data.
print(run_data_1.head(5))

In [None]:
# Number of distinct 'Consumable Life' values in the file-1 run data.
print(run_data_1['Consumable Life'].nunique())

In [None]:
# Preview the first five rows of the file-1 incoming run data.
print(incoming_run_data_1.head(5))

### Process Step

* All run_data file entries share the same `Process Step` (intuitively since run_data focuses on the specific current process).
* All incoming_run_data file entries also share the same `Process Step`.

In [None]:
# Gather, over all run_data files, both the number of distinct process steps
# per file and the first step value. Both kinds of value go into the same set,
# so a small printed set means every file has the same single step.
unique_processes = set()
for file_id in file_ids:
    process_steps = globals()[f'run_data_{file_id}']['Process Step']
    unique_processes.update((process_steps.nunique(), process_steps.unique()[0]))
print(unique_processes)

In [None]:
# Same check for the incoming-run files: the per-file count of distinct
# process steps and the first step value are collected into one set.
unique_processes = set()
for file_id in file_ids:
    process_steps = globals()[f'incoming_run_data_{file_id}']['Process Step']
    unique_processes.update((process_steps.nunique(), process_steps.unique()[0]))
print(unique_processes)

### Time Stamp

In [None]:
# Number of distinct run start times in each train run_data file.
# (The incoming-run frame was looked up here but never used, so the lookup
# has been dropped.)
for file_id in file_ids:
    run_data = globals()[f'run_data_{file_id}']
    print(run_data['Run Start Time'].nunique())

* Every run has `no_time_stamps` * 15 entries in `run_data` and `no_time_stamps` * 41 entries in `incoming_run_data` (`no_time_stamps` is not necessarily the same in the two cases)

In [None]:
# Select the rows of the second distinct Run ID (index 1 of unique()) in
# file 1, then show its row count and how many rows share each time stamp.
run_1 = run_data_1[run_data_1['Run ID'] == run_data_1['Run ID'].unique()[1]]
print(len(run_1))
print(run_1['Time Stamp'].value_counts())

In [None]:
# Number of rows per Step ID in the run currently bound to `run_1`.
print(run_1['Step ID'].value_counts())

In [None]:
# Incoming-run rows for the same Run ID as the sample run; the id is taken
# from run_data_1 (its second distinct Run ID), not from the incoming table.
incoming_run_1 = incoming_run_data_1[incoming_run_data_1['Run ID'] == run_data_1['Run ID'].unique()[1]]
print(len(incoming_run_1))

In [None]:
from tqdm import tqdm

# Row-count invariant: every run should contain (distinct time stamps) x 15
# rows in run_data and (distinct time stamps) x 41 rows in incoming_run_data.
for file_id in file_ids:
    run_data = globals()[f'run_data_{file_id}']
    incoming_run_data = globals()[f'incoming_run_data_{file_id}']

    # One grouped pass per table instead of re-filtering the whole frame once
    # per run: per Run ID, the number of rows and of distinct time stamps.
    run_counts = run_data.groupby('Run ID')['Time Stamp'].agg(['size', 'nunique'])
    incoming_counts = (
        incoming_run_data.groupby('Run ID')['Time Stamp']
        .agg(['size', 'nunique'])
        # Only runs present in run_data are checked. A run that is absent from
        # the incoming table counts as 0 rows / 0 time stamps, which passes.
        .reindex(run_counts.index, fill_value=0)
    )

    for run_id in run_counts.index[run_counts['size'] != run_counts['nunique'] * 15]:
        print("Mismatch in run data for Run ID:", run_id)
    for run_id in incoming_counts.index[incoming_counts['size'] != incoming_counts['nunique'] * 41]:
        print("Mismatch in incoming run data for Run ID:", run_id)

# No mismatches found, all runs have the expected number of time stamps.

* Within each run in `run_data` or `incoming_run_data`, sensors start measuring at the beginning of the run and record data every second. However, they stop recording before the end of the run.

In [None]:
# Taking run_data_1 and incoming_run_data_1 as an example
from tqdm import tqdm
import matplotlib.pyplot as plt

# Per run of file 1, for both the run table and the incoming-run table:
#   *_durations    -> Run End Time - Run Start Time, in seconds
#   *_ending_early -> Run End Time - last Time Stamp, in seconds
run_durations, run_ending_early = [], []
incoming_run_durations, incoming_run_ending_early = [], []

for run_id in tqdm(run_data_1['Run ID'].unique()):
    # The run ids always come from run_data_1; the incoming table is looked
    # up with the same id.
    for table, durations, early_stops in (
        (run_data_1, run_durations, run_ending_early),
        (incoming_run_data_1, incoming_run_durations, incoming_run_ending_early),
    ):
        rows = table[table['Run ID'] == run_id]
        run_end = rows['Run End Time'].iloc[0]
        durations.append((run_end - rows['Run Start Time'].iloc[0]).total_seconds())
        early_stops.append((run_end - rows['Time Stamp'].max()).total_seconds())

In [None]:
# Check whether every file-1 run lasts exactly 755 s and every incoming run
# exactly 742 s (the two values are hard-coded here).
print((np.array(run_durations) == 755.0).all())
print((np.array(incoming_run_durations) == 742.0).all())

In [None]:
# Seconds between the last recorded time stamp and the run end, one value per run.
print(run_ending_early)

In [None]:
# Distribution of the gap between the last recorded time stamp and the run
# end, for the run table (left) and the incoming-run table (right) of file 1.
fig, (ax_run, ax_incoming) = plt.subplots(1, 2, figsize=(12, 6))

ax_run.hist(run_ending_early, bins=100)
ax_run.set(
    title='run_data_1',
    xlabel='Run End Time - last Time Stamp (s)',
    ylabel='Number of runs',
)

ax_incoming.hist(incoming_run_ending_early, bins=100)
ax_incoming.set(
    title='incoming_run_data_1',
    xlabel='Run End Time - last Time Stamp (s)',
    ylabel='Number of runs',
)

plt.show()

In [None]:
# Re-select the sample run (second distinct Run ID of run_data_1) from both
# the run table and the incoming-run table of file 1.
run_1 = run_data_1[run_data_1['Run ID'] == run_data_1['Run ID'].unique()[1]]
# (disabled) convert Time Stamp to whole seconds since the run's first sample:
# run_1['Time Stamp'] = (run_1['Time Stamp'] - run_1['Time Stamp'].min()).dt.total_seconds().round().astype(int)
incoming_run_1 = incoming_run_data_1[incoming_run_data_1['Run ID'] == run_data_1['Run ID'].unique()[1]]
# (disabled) same conversion for the incoming run:
# incoming_run_1['Time Stamp'] = (incoming_run_1['Time Stamp'] - incoming_run_1['Time Stamp'].min()).dt.total_seconds().round().astype(int)

In [None]:
# Number of distinct runs in the file-1 run data.
print(run_data_1['Run ID'].nunique())

### Consumable Life

In [None]:
# For every run of file 1, record the start time and consumable life found
# on the run's first row.
start_times, consumable_lives = [], []
run_id_column = run_data_1['Run ID']
for run_id in run_id_column.unique():
    run = run_data_1.loc[run_id_column == run_id]
    start_times.append(run['Run Start Time'].iloc[0])
    consumable_lives.append(run['Consumable Life'].iloc[0])

In [None]:
# Consumable life against run start time, one point per run of file 1.
plt.scatter(start_times, consumable_lives)

In [None]:
# For each file, show the distinct 'Consumable Life' values within its second
# distinct run.
for file_id in file_ids:
    run_data = globals()[f'run_data_{file_id}']
    # Deliberately not named `run_1`: that notebook-level variable holds the
    # file-1 sample run that a later cell prints next to `incoming_run_1`, and
    # it must not end up pointing at the last file's run.
    second_run = run_data[run_data['Run ID'] == run_data['Run ID'].unique()[1]]
    print(second_run['Consumable Life'].unique())

In [None]:
# Start time, duration in seconds, and end time of whatever `run_1` is
# currently bound to ...
print(run_1['Run Start Time'].iloc[0])
print((run_1['Run End Time'].iloc[0] - run_1['Run Start Time'].iloc[0]).total_seconds())
print(run_1['Run End Time'].iloc[0])
# ... followed by the same three values for `incoming_run_1`.
print(incoming_run_1['Run Start Time'].iloc[0])
print((incoming_run_1['Run End Time'].iloc[0] - incoming_run_1['Run Start Time'].iloc[0]).total_seconds())
print(incoming_run_1['Run End Time'].iloc[0])

In [None]:
# Ratio of run_data rows to incoming_run_data rows, per file.
for file_id in file_ids:
    n_run_rows, n_incoming_rows = (
        len(globals()[f'{table}_{file_id}'])
        for table in ('run_data', 'incoming_run_data')
    )
    print(n_run_rows / n_incoming_rows)

## Feature Engineering

### Initialization

In [None]:
# Use the file-1 tables as the sample for the feature-engineering steps.
# These are plain name bindings (no copy is made here).
sample_run_data = run_data_1
sample_incoming_run_data = incoming_run_data_1

### Column Modifications

#### Drop `Process Step`

In [None]:
# Remove the 'Process Step' column from both sample frames; drop() returns
# new frames, so the originally loaded tables keep the column.
sample_run_data, sample_incoming_run_data = (
    frame.drop(columns=['Process Step'])
    for frame in (sample_run_data, sample_incoming_run_data)
)

#### Replace `Run End Time` with duration of the run and rename it to `Run Duration`

In [None]:
# Express the run length in whole seconds, store it in place of the end time
# (keeping the column position), then give the column a matching name.
run_length_seconds = (
    sample_run_data['Run End Time'] - sample_run_data['Run Start Time']
).dt.total_seconds().astype(int)
sample_run_data['Run End Time'] = run_length_seconds
sample_run_data = sample_run_data.rename(columns={'Run End Time': 'Run Duration'})
print(sample_run_data.head(5))

#### Replace `Run Start Time` with number representing seconds elapsed since the start of the first run

In [None]:
# Earliest run start in the sample; used as time zero.
start_time = sample_run_data['Run Start Time'].min()
# Overwrite the datetime column with integer seconds since that first start
# (astype(int) truncates any fractional part).
sample_run_data['Run Start Time'] = (sample_run_data['Run Start Time'] - start_time).dt.total_seconds().astype(int)

print(sample_run_data.head(5))

#### Test if `Time Stamp` is in seconds

In [None]:
# Tolerance, in seconds, for how far a time stamp may sit from a whole second.
second_threshold = 0.0001
# NOTE: this rebinds `start_time` to the earliest Time Stamp (it previously
# held the earliest Run Start Time).
start_time = sample_run_data['Time Stamp'].min()
# Snap every time stamp to the nearest whole second after `start_time` ...
nearest_second = (sample_run_data['Time Stamp'] - start_time).dt.total_seconds().round() * pd.Timedelta(seconds=1) + start_time
# ... and measure how far each original time stamp is from its snapped value.
deviations = (nearest_second - sample_run_data['Time Stamp']).abs()
# True when the largest deviation is below the tolerance.
print(deviations.max() < pd.Timedelta(seconds=second_threshold))

#### Replace `Time Stamp` with seconds elapsed since the start of the first run

In [None]:
# Convert Time Stamp to whole seconds elapsed since `start_time`.
# NOTE(review): at this point `start_time` is the earliest Time Stamp (set in
# the previous cell), not the first run's start time that the heading
# mentions -- confirm which reference is intended.
sample_run_data['Time Stamp'] = (sample_run_data['Time Stamp'] - start_time).dt.total_seconds().round().astype(int)

#### Replace `Tool ID` with a categorical number

In [None]:
# Replace Tool ID by its integer category code.
sample_run_data['Tool ID'] = sample_run_data['Tool ID'].astype('category').cat.codes

print(sample_run_data.head(5))

#### Step ID uniqueness

In [None]:
# Every run_data file is expected to contain 13 distinct Step IDs and every
# incoming_run_data file 8; the scan stops at the first file that does not.
if all(globals()[f'run_data_{file_id}']['Step ID'].nunique() == 13 for file_id in file_ids):
    print('All step counts are the same')
else:
    print('Not all step counts are the same')

if all(globals()[f'incoming_run_data_{file_id}']['Step ID'].nunique() == 8 for file_id in file_ids):
    print('All incoming step counts are the same')
else:
    print('Not all incoming step counts are the same')

#### Replace `Run ID`, `Step ID` and `Sensor Name` with a categorical number

In [None]:
# Replace each identifier column by its integer category code.
for column in ('Run ID', 'Step ID', 'Sensor Name'):
    sample_run_data[column] = sample_run_data[column].astype('category').cat.codes

print(sample_run_data.head(5))

### Column Collinearity (VIF)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calculate_vif(dataframe):
    """Return the variance inflation factor of every column of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Numeric feature columns; its values are passed unchanged to
        ``variance_inflation_factor``.

    Returns
    -------
    pandas.DataFrame
        One row per input column, with columns ``feature`` (column name) and
        ``VIF``.

    NOTE(review): no constant column is added here before computing the VIFs;
    confirm against the statsmodels documentation whether one is expected.
    """
    design_matrix = dataframe.values
    return pd.DataFrame({
        'feature': dataframe.columns,
        'VIF': [
            variance_inflation_factor(design_matrix, column_index)
            for column_index in range(design_matrix.shape[1])
        ],
    })

# VIF for the full set of numeric sample columns.
print(calculate_vif(sample_run_data[['Run Start Time', 'Run Duration', 'Run ID', 'Consumable Life', 'Step ID', 'Time Stamp', 'Sensor Name', 'Sensor Value']]))

In [None]:
import matplotlib.pyplot as plt
# Scatter of Time Stamp against Run Start Time (both already converted to
# seconds by earlier cells); low alpha because many points overlap.
plt.scatter(sample_run_data['Run Start Time'], sample_run_data['Time Stamp'], alpha=0.1)

# Calculate the correlation coefficient
correlation = sample_run_data['Run Start Time'].corr(sample_run_data['Time Stamp'])
print(f"Correlation between 'Run Start Time' and 'Time Stamp': {correlation:.10f}")

#### Possible to remove `Time Stamp` and `Run Duration`

In [None]:
# VIF again with 'Time Stamp' and 'Run Duration' left out.
print(calculate_vif(sample_run_data[['Run Start Time', 'Run ID', 'Consumable Life', 'Step ID', 'Sensor Name', 'Sensor Value']]))

In [None]:
# `Time Stamp` is almost y=x
# Series.corr aligns its two operands on index labels, so the positional ramp
# 0..n-1 is given the frame's own index; this keeps the comparison row-by-row
# even if the frame's index is not the default 0..n-1.
row_position = pd.Series(np.arange(len(sample_run_data)), index=sample_run_data.index)
print(sample_run_data['Time Stamp'].corr(row_position))

## Test Dataset

* Contains data for 20 different tools (1 more than train datasets)
* Contains the missing runs from the train datasets, so when combined every tool has 250 different runs

In [None]:
# Do the test run table and the test incoming-run table list the same Tool IDs?
set(test_run_data['Tool ID'].unique()) == set(test_incoming_run_data['Tool ID'].unique())

In [None]:
# Is every Tool ID collected in `unique_tools` also present in the test run data?
# (Relies on `unique_tools` still being a set at this point.)
unique_tools.issubset(set(test_run_data['Tool ID'].unique()))

In [None]:
# Tool IDs that occur in the test run data but not in `unique_tools`.
set(test_run_data['Tool ID'].unique()).difference(unique_tools)

In [None]:
# Tool ID found on the first row of the file-1 run data. `.iloc[0]` selects by
# position, whereas `[0]` would look up the index label 0.
print(run_data_1['Tool ID'].iloc[0])

In [None]:
# First test rows for one specific Tool ID.
# NOTE(review): the id is hard-coded; confirm which tool it is meant to be.
test_run_data[test_run_data['Tool ID'] == '8060e8e1-504a-5138-a9f0-e2770bd61ba1'].head()

In [None]:
# First rows of the file-1 run data, for comparison.
run_data_1.head()

In [None]:
# Per Tool ID in the test run data: the id, its number of distinct runs, and
# its number of rows (one value per output line).
for tool_id, tool_rows in test_run_data.groupby('Tool ID'):
    print(tool_id, tool_rows['Run ID'].nunique(), len(tool_rows), sep='\n')

## Plotting

In [3]:
# Load the pre-processed parquet tables: three "global" (train) tables
# followed by the two test tables.
(
    processed_global_run_data,
    processed_global_incoming_run_data,
    processed_global_metrology_data,
    processed_test_run_data,
    processed_test_incoming_run_data,
) = (
    pd.read_parquet(os.path.join(processed_path, parquet_name))
    for parquet_name in (
        'processed_global_run_data.parquet',
        'processed_global_incoming_run_data.parquet',
        'processed_global_metrology_data.parquet',
        'processed_test_run_data.parquet',
        'processed_test_incoming_run_data.parquet',
    )
)

In [None]:
# --- Plotting Task 1 ---
# For Tool ID 1, draw one figure per sensor: Sensor Value vs. Time Stamp for
# up to `num_runs_to_plot` randomly chosen runs. Each figure is saved to
# `plots_path` and then shown.

os.makedirs(plots_path, exist_ok=True)  # savefig does not create missing directories
rng = np.random.default_rng(42)         # fixed seed so the sampled runs are reproducible

# 1. Filter for Tool ID == 1
tool_1_data = processed_global_run_data[processed_global_run_data['Tool ID'] == 1]

# 2. Get unique sensor names
sensors = tool_1_data['Sensor Name'].unique()
num_runs_to_plot = 50

# 3. Loop through each sensor to create a plot
for sensor in sensors:
    print(f"Processing sensor: {sensor}")
    fig, ax = plt.subplots(figsize=(12, 8))

    sensor_data = tool_1_data[tool_1_data['Sensor Name'] == sensor]
    unique_runs = sensor_data['Run ID'].unique()

    # Sample without replacement only when there are more runs than needed.
    if len(unique_runs) > num_runs_to_plot:
        runs_to_plot = rng.choice(unique_runs, num_runs_to_plot, replace=False)
    else:
        runs_to_plot = unique_runs

    run_data_to_plot = sensor_data[sensor_data['Run ID'].isin(runs_to_plot)]

    # One scatter call per run, so each run is drawn as its own series.
    for run_id in runs_to_plot:
        run_data = run_data_to_plot[run_data_to_plot['Run ID'] == run_id]
        ax.scatter(run_data['Time Stamp'], run_data['Sensor Value'], label=f'Run {run_id}')

    # --- AXIS CONTROLS ---
    ax.set_xlabel('Time Stamp')
    ax.set_ylabel('Sensor Value')
    ax.set_title(f'Sensor Value vs. Time Stamp for Sensor {sensor} (Tool ID 1)')
    ax.grid(True)

    # (Optional) Set custom limits for the axes
    # ax.set_xlim(0, 100)
    # ax.set_ylim(-4, 4)

    # --- DISPLAYING THE PLOT ---
    # 1. Save the figure first, so the file contains all titles and labels.
    output_filename = f'tool_1_sensor_{sensor}_{num_runs_to_plot}_runs.png'
    fig.savefig(os.path.join(plots_path, output_filename))
    print(f"Plot saved as {output_filename}")

    # 2. Then display the plot.
    plt.show()

    # 3. Finally, close the figure to free up memory.
    plt.close(fig)

In [19]:
# --- Plotting Task 2 ---
# One figure per sensor: Sensor Value vs. time relative to the run start, with
# up to 2 randomly chosen runs per tool, coloured by tool. Figures are saved to
# `plots_path` (they are not shown inline).

os.makedirs(plots_path, exist_ok=True)  # savefig does not create missing directories
rng = np.random.default_rng(42)         # fixed seed so the sampled runs are reproducible

# 1. Calculate 'relative_time' (adds a column to the shared frame)
processed_global_run_data['relative_time'] = processed_global_run_data['Time Stamp'] - processed_global_run_data['Run Start Time']

# 2. Get unique sensors and tools from the dataset.
# The tool list gets its own name so the notebook-level `unique_tools` set
# (used by the Test Dataset cells) is not replaced by a list.
unique_sensors = processed_global_run_data['Sensor Name'].unique()
plot_tool_ids = sorted(processed_global_run_data['Tool ID'].unique())

# 3. Create a color map for the tools.
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in later releases.
colors = plt.get_cmap('viridis', len(plot_tool_ids))
tool_color_map = {tool_id: colors(i) for i, tool_id in enumerate(plot_tool_ids)}

# 4. Loop through each sensor and create a plot
for sensor in unique_sensors:
    print(f"--- Generating plot for Sensor: {sensor} ---")

    fig, ax = plt.subplots(figsize=(14, 9))

    # Filter by sensor once instead of once per tool.
    sensor_data = processed_global_run_data[processed_global_run_data['Sensor Name'] == sensor]

    # 5. For the current sensor, loop through each tool
    for tool_id in plot_tool_ids:
        tool_sensor_data = sensor_data[sensor_data['Tool ID'] == tool_id]

        if tool_sensor_data.empty:
            print(f"  No data for Tool {tool_id} and Sensor {sensor}. Skipping.")
            continue

        # Non-empty data always yields at least one run id, so no further
        # emptiness check is needed after sampling.
        available_runs = tool_sensor_data['Run ID'].unique()

        # 6. Randomly select 2 runs, or fewer if not enough are available
        if len(available_runs) >= 2:
            runs_to_plot = rng.choice(available_runs, 2, replace=False)
        else:
            runs_to_plot = available_runs

        print(f"  Plotting {len(runs_to_plot)} run(s) from Tool {tool_id}. Run IDs: {runs_to_plot}")

        plot_data = tool_sensor_data[tool_sensor_data['Run ID'].isin(runs_to_plot)]

        # 7. Plot the data for this tool
        ax.scatter(
            plot_data['relative_time'],
            plot_data['Sensor Value'],
            color=tool_color_map[tool_id],
            label=f'Tool ID {tool_id}',
            s=10
        )

    # 8. Finalize and decorate the plot.
    # The label -> handle dict keeps one legend entry per distinct label.
    handles, labels = ax.get_legend_handles_labels()
    unique_labels = dict(zip(labels, handles))
    ax.legend(unique_labels.values(), unique_labels.keys(), title="Tool ID")

    ax.set_title(f'Sensor {sensor}: Value vs. Relative Time (2 Runs per Tool)', fontsize=16)
    ax.set_xlabel('Relative Time (Time Stamp - Run Start Time)', fontsize=12)
    ax.set_ylabel('Sensor Value', fontsize=12)
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)

    # 9. Save the plot and release the figure
    output_filename = f'task2_sensor_{sensor}_by_tool.png'
    fig.savefig(os.path.join(plots_path, output_filename))
    print(f"Plot saved as {output_filename}\n")

    plt.close(fig)

--- Generating plot for Sensor: 0 ---
  Plotting 2 run(s) from Tool 1. Run IDs: [2606 4014]
  Plotting 2 run(s) from Tool 2. Run IDs: [ 334 2931]
  Plotting 2 run(s) from Tool 3. Run IDs: [2314 2545]
  Plotting 2 run(s) from Tool 4. Run IDs: [4007 3260]
  Plotting 2 run(s) from Tool 6. Run IDs: [  57 2594]
  Plotting 2 run(s) from Tool 7. Run IDs: [ 386 3196]
  Plotting 2 run(s) from Tool 8. Run IDs: [  42 1998]
  Plotting 2 run(s) from Tool 9. Run IDs: [4177 4627]
  Plotting 2 run(s) from Tool 10. Run IDs: [2300 1574]
  Plotting 2 run(s) from Tool 51. Run IDs: [3537 2336]
  Plotting 2 run(s) from Tool 52. Run IDs: [3358 3029]
  Plotting 2 run(s) from Tool 53. Run IDs: [3212 1180]
  Plotting 2 run(s) from Tool 54. Run IDs: [  51 3063]
  Plotting 2 run(s) from Tool 55. Run IDs: [  14 3243]
  Plotting 2 run(s) from Tool 56. Run IDs: [1588 2680]
  Plotting 2 run(s) from Tool 57. Run IDs: [3387 4448]
  Plotting 2 run(s) from Tool 58. Run IDs: [4211 1825]
  Plotting 2 run(s) from Tool 59. R

In [None]:
# --- Plotting Task 3 ---
# One two-panel figure per sensor, Sensor Value vs. time relative to run start:
#   left  - global run data, up to 2 random runs per tool, coloured by tool
#   right - test run data, up to 5 random runs per tool in one colour, with up
#           to 15 runs of `highlight_tool` in a second colour

os.makedirs(plots_path, exist_ok=True)  # savefig does not create missing directories
rng = np.random.default_rng(42)         # fixed seed so the sampled runs are reproducible

# Right-panel settings (the same for every sensor)
highlight_tool = 5
other_tools_color = 'blue'
highlight_color = 'red'

# 1. Calculate 'relative_time' for both dataframes (adds a column to each)
processed_global_run_data['relative_time'] = (processed_global_run_data['Time Stamp'] - processed_global_run_data['Run Start Time'])
processed_test_run_data['relative_time'] = (processed_test_run_data['Time Stamp'] - processed_test_run_data['Run Start Time'])

# 2. Get unique sensors and tools from the GLOBAL dataset to establish a base
unique_sensors = processed_global_run_data['Sensor Name'].unique()
unique_tools_global = sorted(processed_global_run_data['Tool ID'].unique())

# 3. Create a base color map for the globally known tools (for the left plot).
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in later releases.
colors = plt.get_cmap('viridis', len(unique_tools_global))
tool_color_map = {tool_id: colors(i) for i, tool_id in enumerate(unique_tools_global)}

# 4. Loop through each sensor and create a plot
for sensor in unique_sensors:
    print(f"--- Generating plot for Sensor: {sensor} ---")

    fig, axes = plt.subplots(1, 2, figsize=(28, 9), sharey=True)

    # --- Left Subplot: processed_global_run_data (Multi-color) ---
    ax_left = axes[0]
    # Filter by sensor once instead of once per tool.
    global_sensor_data = processed_global_run_data[processed_global_run_data['Sensor Name'] == sensor]
    for tool_id in unique_tools_global:
        tool_sensor_data = global_sensor_data[global_sensor_data['Tool ID'] == tool_id]
        if tool_sensor_data.empty:
            continue
        available_runs = tool_sensor_data['Run ID'].unique()
        runs_to_plot = rng.choice(available_runs, min(len(available_runs), 2), replace=False)
        plot_data = tool_sensor_data[tool_sensor_data['Run ID'].isin(runs_to_plot)]
        ax_left.scatter(plot_data['relative_time'], plot_data['Sensor Value'], color=tool_color_map[tool_id], label=f'Tool ID {tool_id}', s=10)

    handles, labels = ax_left.get_legend_handles_labels()
    unique_labels = dict(zip(labels, handles))
    ax_left.legend(unique_labels.values(), unique_labels.keys(), title="Tool ID")
    ax_left.set_title('Global Run Data', fontsize=16)
    ax_left.set_xlabel('Relative Time (seconds)', fontsize=12)
    ax_left.set_ylabel('Sensor Value', fontsize=12)
    ax_left.grid(True, which='both', linestyle='--', linewidth=0.5)

    # --- Right Subplot: processed_test_run_data (Two-color highlight) ---
    ax_right = axes[1]
    test_sensor_data = processed_test_run_data[processed_test_run_data['Sensor Name'] == sensor]
    unique_tools_test = sorted(test_sensor_data['Tool ID'].unique())

    for tool_id in unique_tools_test:
        # Tool ids come from this sensor's rows, so the selection is never empty.
        tool_sensor_data = test_sensor_data[test_sensor_data['Tool ID'] == tool_id]
        is_highlight = tool_id == highlight_tool

        # Draw the sample once: more runs for the highlighted tool.
        available_runs = tool_sensor_data['Run ID'].unique()
        max_runs = 15 if is_highlight else 5
        runs_to_plot = rng.choice(available_runs, min(len(available_runs), max_runs), replace=False)
        plot_data = tool_sensor_data[tool_sensor_data['Run ID'].isin(runs_to_plot)]

        ax_right.scatter(
            plot_data['relative_time'],
            plot_data['Sensor Value'],
            color=highlight_color if is_highlight else other_tools_color,
            # All non-highlighted tools share one label, hence one legend entry.
            label=f'Tool ID {tool_id}' if is_highlight else 'Other Tools',
            s=10,
        )

    handles, labels = ax_right.get_legend_handles_labels()
    if handles:
        unique_labels = dict(zip(labels, handles))  # one entry per distinct label
        ax_right.legend(unique_labels.values(), unique_labels.keys(), title="Tool ID")

    ax_right.set_title(f'Test Run Data (Highlighting {highlight_tool})', fontsize=16)
    ax_right.set_xlabel('Relative Time (seconds)', fontsize=12)
    ax_right.grid(True, which='both', linestyle='--', linewidth=0.5)

    # 5. Finalize and decorate the overall plot
    fig.suptitle(f'Sensor {sensor}: Value vs. Relative Time Comparison', fontsize=20)
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])

    # 6. Save the combined plot
    output_filename = f'task3_sensor_{sensor}_comparison.png'
    fig.savefig(os.path.join(plots_path, output_filename))
    print(f"Plot saved as {output_filename}\n")

    plt.show()
    plt.close(fig)

In [None]:
# --- Plotting Task 4 ---
# Same comparison as Task 3, but for the incoming-run tables and only for the
# first 20 sensor names:
#   left  - global incoming run data, up to 2 random runs per tool, by tool
#   right - test incoming run data, up to 5 random runs per tool in one colour,
#           with up to 15 runs of `highlight_tool` in a second colour

os.makedirs(plots_path, exist_ok=True)  # savefig does not create missing directories
rng = np.random.default_rng(42)         # fixed seed so the sampled runs are reproducible

# Right-panel settings (the same for every sensor)
highlight_tool = 5
other_tools_color = 'blue'
highlight_color = 'red'

# 1. Calculate 'relative_time' for both dataframes (adds a column to each)
processed_global_incoming_run_data['relative_time'] = (processed_global_incoming_run_data['Time Stamp'] - processed_global_incoming_run_data['Run Start Time'])
processed_test_incoming_run_data['relative_time'] = (processed_test_incoming_run_data['Time Stamp'] - processed_test_incoming_run_data['Run Start Time'])

# 2. Get unique sensors (first 20 only) and tools from the GLOBAL dataset
unique_sensors = processed_global_incoming_run_data['Sensor Name'].unique()[:20]
unique_tools_global = sorted(processed_global_incoming_run_data['Tool ID'].unique())

# 3. Create a base color map for the globally known tools (for the left plot).
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in later releases.
colors = plt.get_cmap('viridis', len(unique_tools_global))
tool_color_map = {tool_id: colors(i) for i, tool_id in enumerate(unique_tools_global)}

# 4. Loop through each sensor and create a plot
for sensor in unique_sensors:
    print(f"--- Generating plot for Sensor: {sensor} ---")

    fig, axes = plt.subplots(1, 2, figsize=(28, 9), sharey=True)

    # --- Left Subplot: processed_global_incoming_run_data (Multi-color) ---
    ax_left = axes[0]
    # Filter by sensor once instead of once per tool.
    global_sensor_data = processed_global_incoming_run_data[processed_global_incoming_run_data['Sensor Name'] == sensor]
    for tool_id in unique_tools_global:
        tool_sensor_data = global_sensor_data[global_sensor_data['Tool ID'] == tool_id]
        if tool_sensor_data.empty:
            continue
        available_runs = tool_sensor_data['Run ID'].unique()
        runs_to_plot = rng.choice(available_runs, min(len(available_runs), 2), replace=False)
        plot_data = tool_sensor_data[tool_sensor_data['Run ID'].isin(runs_to_plot)]
        ax_left.scatter(plot_data['relative_time'], plot_data['Sensor Value'], color=tool_color_map[tool_id], label=f'Tool ID {tool_id}', s=10)

    handles, labels = ax_left.get_legend_handles_labels()
    unique_labels = dict(zip(labels, handles))
    ax_left.legend(unique_labels.values(), unique_labels.keys(), title="Tool ID")
    ax_left.set_title('Global Incoming Run Data', fontsize=16)
    ax_left.set_xlabel('Relative Time (seconds)', fontsize=12)
    ax_left.set_ylabel('Sensor Value', fontsize=12)
    ax_left.grid(True, which='both', linestyle='--', linewidth=0.5)

    # --- Right Subplot: processed_test_incoming_run_data (Two-color highlight) ---
    ax_right = axes[1]
    test_sensor_data = processed_test_incoming_run_data[processed_test_incoming_run_data['Sensor Name'] == sensor]
    unique_tools_test = sorted(test_sensor_data['Tool ID'].unique())

    for tool_id in unique_tools_test:
        # Tool ids come from this sensor's rows, so the selection is never empty.
        tool_sensor_data = test_sensor_data[test_sensor_data['Tool ID'] == tool_id]
        is_highlight = tool_id == highlight_tool

        # Draw the sample once: more runs for the highlighted tool.
        available_runs = tool_sensor_data['Run ID'].unique()
        max_runs = 15 if is_highlight else 5
        runs_to_plot = rng.choice(available_runs, min(len(available_runs), max_runs), replace=False)
        plot_data = tool_sensor_data[tool_sensor_data['Run ID'].isin(runs_to_plot)]

        ax_right.scatter(
            plot_data['relative_time'],
            plot_data['Sensor Value'],
            color=highlight_color if is_highlight else other_tools_color,
            # All non-highlighted tools share one label, hence one legend entry.
            label=f'Tool ID {tool_id}' if is_highlight else 'Other Tools',
            s=10,
        )

    handles, labels = ax_right.get_legend_handles_labels()
    if handles:
        unique_labels = dict(zip(labels, handles))  # one entry per distinct label
        ax_right.legend(unique_labels.values(), unique_labels.keys(), title="Tool ID")

    # The panel shows the incoming-run test table, so the title says so.
    ax_right.set_title(f'Test Incoming Run Data (Highlighting {highlight_tool})', fontsize=16)
    ax_right.set_xlabel('Relative Time (seconds)', fontsize=12)
    ax_right.grid(True, which='both', linestyle='--', linewidth=0.5)

    # 5. Finalize and decorate the overall plot
    fig.suptitle(f'Sensor {sensor}: Value vs. Relative Time Comparison', fontsize=20)
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])

    # 6. Save the combined plot
    output_filename = f'task4_sensor_{sensor}_comparison_incoming_run.png'
    fig.savefig(os.path.join(plots_path, output_filename))
    print(f"Plot saved as {output_filename}\n")

    plt.show()
    plt.close(fig)

In [30]:
# --- Corrected Plotting Task 5 ---
# One figure: Measurement vs. Point Index as a line per run, for up to
# `runs_per_tool` randomly chosen runs of every tool, coloured by tool.
# The figure is saved to `plots_path` (it is not shown inline).

os.makedirs(plots_path, exist_ok=True)  # savefig does not create missing directories
rng = np.random.default_rng(42)         # fixed seed so the sampled runs are reproducible
runs_per_tool = 10

# 1. Prepare the data by merging to get Tool IDs
run_to_tool_map = processed_global_run_data[['Run ID', 'Tool ID']].drop_duplicates().reset_index(drop=True)
merged_metrology_data = pd.merge(
    processed_global_metrology_data,
    run_to_tool_map,
    on='Run ID',
    how='left'
).dropna(subset=['Tool ID'])  # keep only measurements whose run has a known tool

# Get unique tools and create a color map.
# The tool list gets its own name so the notebook-level `unique_tools` set
# (used by the Test Dataset cells) is not replaced by a list.
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in later releases.
plot_tool_ids = sorted(merged_metrology_data['Tool ID'].unique())
colors = plt.get_cmap('viridis', len(plot_tool_ids))
tool_color_map = {tool_id: colors(i) for i, tool_id in enumerate(plot_tool_ids)}

# Create the plot
fig, ax = plt.subplots(figsize=(14, 9))

print("--- Plotting Runs per Tool as Line Plots ---")
# 2. For each tool, plot lines for the randomly selected runs
for tool_id in plot_tool_ids:
    tool_data = merged_metrology_data[merged_metrology_data['Tool ID'] == tool_id]
    available_runs = tool_data['Run ID'].unique()

    if len(available_runs) > runs_per_tool:
        runs_to_plot = rng.choice(available_runs, runs_per_tool, replace=False)
    else:
        runs_to_plot = available_runs

    print(f"  Tool {tool_id}: Plotting {len(runs_to_plot)} runs.")

    # One line per run, so points of different runs are never joined.
    for run_position, run_id in enumerate(runs_to_plot):
        # Sort by Point Index so the line is drawn in point order.
        run_data_sorted = tool_data[tool_data['Run ID'] == run_id].sort_values(by='Point Index')

        ax.plot(
            run_data_sorted['Point Index'],
            run_data_sorted['Measurement'],
            color=tool_color_map[tool_id],
            # Label only the first run of a tool to avoid duplicate legend entries.
            label=f'{tool_id}' if run_position == 0 else None,
            alpha=0.6,
            marker='o',       # Add markers to show the actual points
            markersize=1,     # Make markers small
            linestyle='-'     # Explicitly specify a solid line
        )

# Finalize and decorate the plot
ax.set_title(f'Measurement Profile vs. Point Index ({runs_per_tool} Random Runs per Tool)', fontsize=16)
ax.set_xlabel('Point Index', fontsize=12)
ax.set_ylabel('Measurement', fontsize=12)
ax.legend(title="Tool ID")
ax.grid(True, which='both', linestyle='--', linewidth=0.5)

# Save the plot and release the figure
output_filename = 'task5_metrology_by_tool.png'
fig.savefig(os.path.join(plots_path, output_filename))
print(f"\nPlot saved as {output_filename}")

plt.close(fig)

--- Plotting Runs per Tool as Line Plots ---
  Tool 1: Plotting 10 runs.
  Tool 2: Plotting 10 runs.
  Tool 3: Plotting 10 runs.
  Tool 4: Plotting 10 runs.
  Tool 6: Plotting 10 runs.
  Tool 7: Plotting 10 runs.
  Tool 8: Plotting 10 runs.
  Tool 9: Plotting 10 runs.
  Tool 10: Plotting 10 runs.
  Tool 51: Plotting 10 runs.
  Tool 52: Plotting 10 runs.
  Tool 53: Plotting 10 runs.
  Tool 54: Plotting 10 runs.
  Tool 55: Plotting 10 runs.
  Tool 56: Plotting 10 runs.
  Tool 57: Plotting 10 runs.
  Tool 58: Plotting 10 runs.
  Tool 59: Plotting 10 runs.
  Tool 60: Plotting 10 runs.

Plot saved as task5_metrology_by_tool.png
