In [1]:
import os
from joblib import dump, load
import torch
import pandas as pd
import torch.nn as nn
from math import sqrt
from tqdm import tqdm

In [2]:
root_path = os.path.abspath(os.path.dirname(os.getcwd()))
train_path = os.path.join(root_path, 'data', 'train')
test_path = os.path.join(root_path, 'data', 'test')
processed_path = os.path.join(root_path, 'data', 'processed')

### Load `pred_y`

In [29]:
pred_y = load('predicted_y.joblib')

In [30]:
pred_y_df = pd.DataFrame(pred_y).astype('float')

In [31]:
print(pred_y)

tensor([[10.4952, 10.3826, 10.6015,  ..., 10.6266, 10.5513, 10.3331],
        [10.0296, 10.1089, 10.3224,  ..., 10.2380, 10.1275, 10.0406],
        [10.0531, 10.1337, 10.3471,  ..., 10.2625, 10.1519, 10.0650],
        ...,
        [10.0562, 10.1334, 10.3471,  ..., 10.2644, 10.1542, 10.0652],
        [10.0891, 10.1563, 10.3699,  ..., 10.2939, 10.1849, 10.0901],
        [10.1774, 10.2140, 10.4296,  ..., 10.3714, 10.2667, 10.1496]])


In [32]:
pred_y_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39,40,41,42,43,44,45,46,47,48
0,10.495191,10.382628,10.601487,10.352983,10.543009,10.423900,10.483061,10.377560,10.341709,10.749933,...,10.581545,10.465165,10.601503,10.742821,10.315522,10.563719,10.673756,10.626579,10.551268,10.333144
1,10.029622,10.108922,10.322433,10.043105,10.052326,10.122750,10.175968,10.089038,10.065432,10.091758,...,10.132690,10.043344,10.099069,10.090364,10.043598,10.104724,10.043008,10.237971,10.127537,10.040628
2,10.053119,10.133695,10.347086,10.066875,10.075651,10.147285,10.200396,10.114105,10.090060,10.115000,...,10.157490,10.068276,10.121858,10.113609,10.067628,10.128441,10.066393,10.262506,10.151894,10.064986
3,10.142610,10.194576,10.409370,10.136428,10.165567,10.213579,10.265693,10.177104,10.150143,10.234119,...,10.246224,10.149991,10.215325,10.225896,10.126668,10.214641,10.176174,10.342428,10.235356,10.129261
4,10.110282,10.159244,10.373860,10.102459,10.133588,10.178567,10.231182,10.141286,10.114824,10.203655,...,10.211823,10.115654,10.183740,10.195577,10.092172,10.181979,10.145129,10.308019,10.201427,10.094573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
855,10.119021,10.177829,10.391777,10.118673,10.141361,10.195525,10.248160,10.159936,10.133983,10.202458,...,10.222991,10.128450,10.189649,10.195686,10.110589,10.190974,10.146664,10.321049,10.213050,10.112513
856,10.065771,10.135524,10.349226,10.073285,10.088234,10.151229,10.204144,10.116503,10.092018,10.137246,...,10.168978,10.077149,10.135781,10.133353,10.069645,10.139150,10.085307,10.271059,10.161737,10.068846
857,10.056247,10.133447,10.347143,10.067738,10.078939,10.147893,10.200625,10.114117,10.089796,10.121414,...,10.160314,10.070246,10.125678,10.119163,10.067410,10.131215,10.071895,10.264416,10.154208,10.065197
858,10.089136,10.156340,10.369944,10.094872,10.111371,10.172492,10.225388,10.137780,10.112810,10.163585,...,10.193137,10.100577,10.158762,10.158821,10.089854,10.161986,10.110595,10.293901,10.184855,10.090113


In [33]:
processed_global_run_data = pd.read_parquet(os.path.join(processed_path, 'processed_global_run_data.parquet'))
processed_global_incoming_run_data = pd.read_parquet(os.path.join(processed_path, 'processed_global_incoming_run_data.parquet'))
processed_global_metrology_data = pd.read_parquet(os.path.join(processed_path, 'processed_global_metrology_data.parquet'))

processed_test_run_data = pd.read_parquet(os.path.join(processed_path, 'processed_test_run_data.parquet'))
processed_test_incoming_run_data = pd.read_parquet(os.path.join(processed_path, 'processed_test_incoming_run_data.parquet'))

In [34]:
sorted_processed_test_run_data = processed_test_run_data.sort_values(
    by=['Tool ID', 'Run ID', 'Time Stamp'],
    ascending=[True, True, True]
)

sorted_processed_test_incoming_run_data = processed_test_incoming_run_data.sort_values(
    by=['Tool ID', 'Run ID', 'Time Stamp'],
    ascending=[True, True, True]
)

In [35]:
run_ids = []
for key, run in sorted_processed_test_run_data.groupby('Run ID'):
    run_ids.append(key)

In [36]:
pred_y_df['Run ID'] = run_ids
pred_y_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,Run ID
0,10.495191,10.382628,10.601487,10.352983,10.543009,10.423900,10.483061,10.377560,10.341709,10.749933,...,10.465165,10.601503,10.742821,10.315522,10.563719,10.673756,10.626579,10.551268,10.333144,6
1,10.029622,10.108922,10.322433,10.043105,10.052326,10.122750,10.175968,10.089038,10.065432,10.091758,...,10.043344,10.099069,10.090364,10.043598,10.104724,10.043008,10.237971,10.127537,10.040628,9
2,10.053119,10.133695,10.347086,10.066875,10.075651,10.147285,10.200396,10.114105,10.090060,10.115000,...,10.068276,10.121858,10.113609,10.067628,10.128441,10.066393,10.262506,10.151894,10.064986,22
3,10.142610,10.194576,10.409370,10.136428,10.165567,10.213579,10.265693,10.177104,10.150143,10.234119,...,10.149991,10.215325,10.225896,10.126668,10.214641,10.176174,10.342428,10.235356,10.129261,32
4,10.110282,10.159244,10.373860,10.102459,10.133588,10.178567,10.231182,10.141286,10.114824,10.203655,...,10.115654,10.183740,10.195577,10.092172,10.181979,10.145129,10.308019,10.201427,10.094573,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
855,10.119021,10.177829,10.391777,10.118673,10.141361,10.195525,10.248160,10.159936,10.133983,10.202458,...,10.128450,10.189649,10.195686,10.110589,10.190974,10.146664,10.321049,10.213050,10.112513,4898
856,10.065771,10.135524,10.349226,10.073285,10.088234,10.151229,10.204144,10.116503,10.092018,10.137246,...,10.077149,10.135781,10.133353,10.069645,10.139150,10.085307,10.271059,10.161737,10.068846,4899
857,10.056247,10.133447,10.347143,10.067738,10.078939,10.147893,10.200625,10.114117,10.089796,10.121414,...,10.070246,10.125678,10.119163,10.067410,10.131215,10.071895,10.264416,10.154208,10.065197,4903
858,10.089136,10.156340,10.369944,10.094872,10.111371,10.172492,10.225388,10.137780,10.112810,10.163585,...,10.100577,10.158762,10.158821,10.089854,10.161986,10.110595,10.293901,10.184855,10.090113,4904


In [37]:
pred_y_df = pred_y_df.melt(
    id_vars='Run ID',
    var_name='Point Index',
    value_name="Measurement"
)

In [38]:
pred_y_df = pred_y_df.sort_values('Run ID')
pred_y_df

Unnamed: 0,Run ID,Point Index,Measurement
0,6,0,10.495191
34400,6,40,10.465165
2580,6,3,10.352983
39560,6,46,10.626579
7740,6,9,10.749933
...,...,...,...
12039,4910,13,10.230327
29239,4910,33,10.280479
12899,4910,14,10.127986
13759,4910,15,10.310932


In [39]:
global_run_data = pd.read_parquet(os.path.join(processed_path, 'global_run_data.parquet'))
global_incoming_run_data = pd.read_parquet(os.path.join(processed_path, 'global_incoming_run_data.parquet'))
global_metrology_data = pd.read_parquet(os.path.join(processed_path, 'global_metrology_data.parquet'))

test_run_data = pd.read_parquet(os.path.join(processed_path, 'rounded_test_run_data.parquet'))
test_incoming_run_data = pd.read_parquet(os.path.join(processed_path, 'rounded_test_incoming_run_data.parquet'))

In [40]:
tool_id_dict = {'8060e8e1-504a-5138-a9f0-e2770bd61ba1': 1,
                'ad28cd71-3811-543c-80a1-a440468a49d7': 2,
                'ac40636d-b491-5620-8d16-9aad56e5c4d3': 3,
                '08e35986-3a0a-5274-8990-4ad303be9d5f': 4,
                '2f2816e1-d294-5c4d-a5fa-3d804c46726a': 5,
                '8a1010f8-7e9e-52ba-8586-2eac7e68eaee': 6,
                '683e5405-4b93-5160-be90-d3d5b0d9287a': 7,
                'db66ded5-0f4f-5179-b8e7-4673304ddff1': 8,
                '5116a75f-0543-502f-8278-2b2496d337d2': 9,
                'c7f536aa-a2fc-54b4-9b98-9f01793c9b5d': 10,
                'a060c807-dbdb-5de3-9cd7-0c596234a3d7': 51,
                '1914935a-df63-54c2-9c47-da905244e631': 52,
                'b3439d8e-290b-59a4-96a5-53348288f6a9': 53,
                'd7291410-f34f-5cee-b3a1-0c60bb854423': 54,
                '1b314ddd-198a-5cd5-90ae-933b947d013d': 55,
                '30476bd4-f093-56c9-8cd1-23cf7f39ce5f': 56,
                'e4838ac1-3788-50bf-bbca-04cf339ec369': 57,
                '4738fd3d-8fc7-59fd-85a7-eec6efb4bfae': 58,
                'bb7e0385-0227-575a-9894-dacc8bf07f2a': 59,
                '3cb39167-0519-578a-846d-7132592965d3': 60}

run_id_set = set(test_run_data['Run ID'].unique()).union(set(global_run_data['Run ID'].unique()))
run_id_dict = {run_id: i + 1 for i, run_id in enumerate(sorted(list(run_id_set)))}

### Two checks to confirm one-to-one mapping

In [41]:
violation_check_1 = global_metrology_data.groupby('Point Index')[['X', 'Y', 'X_index', 'Y_index']].nunique()
(violation_check_1 > 1).any().any()

False

In [42]:
violation_check_2 = global_metrology_data.groupby(['X', 'Y', 'X_index', 'Y_index'])['Point Index'].nunique()
(violation_check_2 > 1).any()

False

#### Join back the columns

In [43]:
# Define the columns that describe each unique point
point_columns = ['Point Index', 'X', 'Y', 'X_index', 'Y_index']

# Create a clean DataFrame where each Point Index appears only once
point_mapping_df = global_metrology_data[point_columns].drop_duplicates()

print("Created a unique mapping table with shape:", point_mapping_df.shape)
print(point_mapping_df.head())

Created a unique mapping table with shape: (49, 5)
   Point Index           X          Y  X_index  Y_index
0            3 -143.877551  -9.183673        1       23
1           48 -143.877551  27.551020        1       29
2           43 -137.755102  58.163265        2       34
3           20 -131.632653 -64.285714        3       14
4            8 -119.387755  88.775510        5       39


In [44]:
pred_y_df = pd.merge(
    left=pred_y_df,
    right=point_mapping_df,
    on='Point Index',
    how='left'  # Use a 'left' merge to ensure all rows from sorted_df are kept
)

# Display the final, combined DataFrame
print("\nSuccessfully merged the datasets.")
print("Final DataFrame shape:", pred_y_df.shape)
print(pred_y_df.head())


Successfully merged the datasets.
Final DataFrame shape: (42140, 7)
   Run ID Point Index  Measurement           X           Y  X_index  Y_index
0       6           0    10.495191    3.061224 -119.387755       25        5
1       6          40    10.465165   82.653061  -94.897959       38        9
2       6           3    10.352983 -143.877551   -9.183673        1       23
3       6          46    10.626579  131.632653    9.183673       46       26
4       6           9    10.749933  -58.163265  -33.673469       15       19


## Add back `Run Start Time` and `Run End Time`

* Metrology data share the same `Run Start Time` and `Run End Time` with the corresponding run data

In [None]:
tool_1_run_data = global_run_data[global_run_data['Tool ID'] == '8060e8e1-504a-5138-a9f0-e2770bd61ba1']
tool_1_incoming_run_data = global_incoming_run_data[global_incoming_run_data['Tool ID'] == '8060e8e1-504a-5138-a9f0-e2770bd61ba1']
for key, run in tool_1_run_data.groupby('Run ID'):
    print(f"{key}: ")
    print(run['Run Start Time'].nunique())
    print(run['Run End Time'].nunique())
    incoming_run = tool_1_incoming_run_data[tool_1_incoming_run_data['Run ID'] == key]
    print(incoming_run['Run Start Time'].nunique())
    print(incoming_run['Run End Time'].nunique())
    print(run['Run Start Time'].iloc[0] == incoming_run['Run Start Time'].iloc[0])
    print(run['Run End Time'].iloc[0] == incoming_run['Run End Time'].iloc[0])
    metrology_data = global_metrology_data[global_metrology_data['Run ID'] == key]
    print(run['Run Start Time'].iloc[0] == metrology_data['Run Start Time'].iloc[0])
    print(run['Run End Time'].iloc[0] == metrology_data['Run End Time'].iloc[0])
    print("============")

In [45]:
# Define the columns that describe each unique point
time_columns = ['Run ID', 'Run Start Time', 'Run End Time']

# Create a clean DataFrame where each Point Index appears only once
time_mapping_df = test_run_data[time_columns].drop_duplicates()

In [46]:
time_mapping_df['Run ID'] = time_mapping_df['Run ID'].map(run_id_dict)
time_mapping_df

Unnamed: 0,Run ID,Run Start Time,Run End Time
0,4227,2024-01-01 01:27:55,2024-01-01 01:40:30
10170,2833,2024-01-01 04:06:10,2024-01-01 04:18:45
20160,3727,2024-01-01 04:23:45,2024-01-01 04:36:20
29160,2521,2024-01-01 07:19:35,2024-01-01 07:32:10
38370,1363,2024-01-01 13:46:25,2024-01-01 13:59:00
...,...,...,...
8506110,4242,2024-01-02 20:19:35,2024-01-02 20:32:10
8517045,1169,2024-01-02 20:37:10,2024-01-02 20:49:45
8527320,4753,2024-01-02 20:54:45,2024-01-02 21:07:20
8537670,1259,2024-01-02 21:12:20,2024-01-02 21:24:55


In [47]:
set(time_mapping_df['Run ID'].unique()) == set(sorted_processed_test_run_data['Run ID'].unique())

True

In [48]:
time_mapping_df['Run ID'].nunique()

860

In [49]:
pred_y_df = pd.merge(
    left=pred_y_df,
    right=time_mapping_df,
    on='Run ID',
    how='left'  # Use a 'left' merge to ensure all rows from sorted_df are kept
)
pred_y_df

Unnamed: 0,Run ID,Point Index,Measurement,X,Y,X_index,Y_index,Run Start Time,Run End Time
0,6,0,10.495191,3.061224,-119.387755,25,5,2024-01-03 19:15:35,2024-01-03 19:28:10
1,6,40,10.465165,82.653061,-94.897959,38,9,2024-01-03 19:15:35,2024-01-03 19:28:10
2,6,3,10.352983,-143.877551,-9.183673,1,23,2024-01-03 19:15:35,2024-01-03 19:28:10
3,6,46,10.626579,131.632653,9.183673,46,26,2024-01-03 19:15:35,2024-01-03 19:28:10
4,6,9,10.749933,-58.163265,-33.673469,15,19,2024-01-03 19:15:35,2024-01-03 19:28:10
...,...,...,...,...,...,...,...,...,...
42135,4910,13,10.230327,27.551020,-52.040816,29,16,2024-01-01 10:50:35,2024-01-01 11:03:10
42136,4910,33,10.280479,52.040816,88.775510,33,39,2024-01-01 10:50:35,2024-01-01 11:03:10
42137,4910,14,10.127986,70.408163,-131.632653,36,3,2024-01-01 10:50:35,2024-01-01 11:03:10
42138,4910,15,10.310932,76.530612,58.163265,37,34,2024-01-01 10:50:35,2024-01-01 11:03:10


In [None]:
pred_y_df.isnull().sum()

In [52]:
def reverse_map(input_dict):
    reversed_dict = {v: k for k, v in input_dict.items()}
    return reversed_dict

In [53]:
reversed_run_id_dict = reverse_map(run_id_dict)

In [55]:
pred_y_df['Run ID'] = pred_y_df['Run ID'].map(reversed_run_id_dict)
pred_y_df

Unnamed: 0,Run ID,Point Index,Measurement,X,Y,X_index,Y_index,Run Start Time,Run End Time
0,005b8b1c-b638-5e28-9d20-e8cd2d99dc3d,0,10.495191,3.061224,-119.387755,25,5,2024-01-03 19:15:35,2024-01-03 19:28:10
1,005b8b1c-b638-5e28-9d20-e8cd2d99dc3d,40,10.465165,82.653061,-94.897959,38,9,2024-01-03 19:15:35,2024-01-03 19:28:10
2,005b8b1c-b638-5e28-9d20-e8cd2d99dc3d,3,10.352983,-143.877551,-9.183673,1,23,2024-01-03 19:15:35,2024-01-03 19:28:10
3,005b8b1c-b638-5e28-9d20-e8cd2d99dc3d,46,10.626579,131.632653,9.183673,46,26,2024-01-03 19:15:35,2024-01-03 19:28:10
4,005b8b1c-b638-5e28-9d20-e8cd2d99dc3d,9,10.749933,-58.163265,-33.673469,15,19,2024-01-03 19:15:35,2024-01-03 19:28:10
...,...,...,...,...,...,...,...,...,...
42135,fffe558e-d3a4-5e08-b36c-c9f70a3861a5,13,10.230327,27.551020,-52.040816,29,16,2024-01-01 10:50:35,2024-01-01 11:03:10
42136,fffe558e-d3a4-5e08-b36c-c9f70a3861a5,33,10.280479,52.040816,88.775510,33,39,2024-01-01 10:50:35,2024-01-01 11:03:10
42137,fffe558e-d3a4-5e08-b36c-c9f70a3861a5,14,10.127986,70.408163,-131.632653,36,3,2024-01-01 10:50:35,2024-01-01 11:03:10
42138,fffe558e-d3a4-5e08-b36c-c9f70a3861a5,15,10.310932,76.530612,58.163265,37,34,2024-01-01 10:50:35,2024-01-01 11:03:10


In [57]:
column_order = global_metrology_data.columns
pred_y_df = pred_y_df[column_order]

In [58]:
pred_y_df

Unnamed: 0,Run ID,Run Start Time,Run End Time,X_index,Y_index,X,Y,Point Index,Measurement
0,005b8b1c-b638-5e28-9d20-e8cd2d99dc3d,2024-01-03 19:15:35,2024-01-03 19:28:10,25,5,3.061224,-119.387755,0,10.495191
1,005b8b1c-b638-5e28-9d20-e8cd2d99dc3d,2024-01-03 19:15:35,2024-01-03 19:28:10,38,9,82.653061,-94.897959,40,10.465165
2,005b8b1c-b638-5e28-9d20-e8cd2d99dc3d,2024-01-03 19:15:35,2024-01-03 19:28:10,1,23,-143.877551,-9.183673,3,10.352983
3,005b8b1c-b638-5e28-9d20-e8cd2d99dc3d,2024-01-03 19:15:35,2024-01-03 19:28:10,46,26,131.632653,9.183673,46,10.626579
4,005b8b1c-b638-5e28-9d20-e8cd2d99dc3d,2024-01-03 19:15:35,2024-01-03 19:28:10,15,19,-58.163265,-33.673469,9,10.749933
...,...,...,...,...,...,...,...,...,...
42135,fffe558e-d3a4-5e08-b36c-c9f70a3861a5,2024-01-01 10:50:35,2024-01-01 11:03:10,29,16,27.551020,-52.040816,13,10.230327
42136,fffe558e-d3a4-5e08-b36c-c9f70a3861a5,2024-01-01 10:50:35,2024-01-01 11:03:10,33,39,52.040816,88.775510,33,10.280479
42137,fffe558e-d3a4-5e08-b36c-c9f70a3861a5,2024-01-01 10:50:35,2024-01-01 11:03:10,36,3,70.408163,-131.632653,14,10.127986
42138,fffe558e-d3a4-5e08-b36c-c9f70a3861a5,2024-01-01 10:50:35,2024-01-01 11:03:10,37,34,76.530612,58.163265,15,10.310932


In [61]:
pred_y_df.to_csv('predicted_metrology.csv', index=False)