In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from tqdm import tqdm


### Data Preparation

In [None]:
# Resolve the data folders: they live under the PARENT of the current working
# directory, i.e. <parent of cwd>/data/train and <parent of cwd>/data/test.
root_path = os.path.abspath(os.path.dirname(os.getcwd()))
train_path, test_path = (
    os.path.join(root_path, 'data', split_name) for split_name in ('train', 'test')
)

In [None]:
# Ids of the training files to load (the list skips 5 and 11-50).
file_ids = ['1', '2', '3', '4', '6', '7', '8', '9', '10', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60']
for file_id in file_ids:
    # Notebook-level variable name -> parquet file name inside the train folder.
    # NOTE(review): the metrology file name has no underscore before the id,
    # unlike the other two files — confirm this matches the files on disk.
    parquet_by_variable = {
        f'run_data_{file_id}': f"run_data_{file_id}.parquet",
        f'incoming_run_data_{file_id}': f"incoming_run_data_{file_id}.parquet",
        f'metrology_data_{file_id}': f"metrology_data{file_id}.parquet",
    }
    for variable_name, parquet_name in parquet_by_variable.items():
        # Each frame is published as its own notebook-level variable.
        globals()[variable_name] = pd.read_parquet(os.path.join(train_path, parquet_name))

In [None]:
# The test folder holds a single run file and a single incoming-run file.
test_run_data, test_incoming_run_data = (
    pd.read_parquet(os.path.join(test_path, parquet_name))
    for parquet_name in ("run_data.parquet", "incoming_run_data.parquet")
)

In [None]:
# Add up the row counts of all training run_data frames.
total_run_entries = sum(len(globals()[f'run_data_{file_id}']) for file_id in file_ids)
print(f"Total number of entries in run_data: {total_run_entries}")

## Basic Properties

### Tool ID
* A single and unique Tool ID for each run/incoming run file
* Corresponding run/incoming run files share the same Tool ID

In [None]:
# Collect the Tool ID of every training file and print, file by file, whether
# run data and incoming run data report the same Tool ID values.
unique_tools = set()
for file_id in file_ids:
    run_frame = globals()[f'run_data_{file_id}']
    incoming_frame = globals()[f'incoming_run_data_{file_id}']
    tools_in_run = run_frame['Tool ID'].unique()
    tools_in_incoming = incoming_frame['Tool ID'].unique()
    print(tools_in_run == tools_in_incoming)
    # Only the first Tool ID found in the run file is recorded.
    unique_tools.add(tools_in_run[0])

### Run IDs


In [None]:
# Per file: print the file id, the number of distinct Run IDs in run data, the row
# counts of the three frames, and whether all three frames hold the same Run ID set.
for file_id in file_ids:
    run_data = globals()[f'run_data_{file_id}']
    incoming_run_data = globals()[f'incoming_run_data_{file_id}']
    metrology_data = globals()[f'metrology_data_{file_id}']

    run_ids_run, run_ids_incoming_run, run_ids_metrology = (
        frame['Run ID'] for frame in (run_data, incoming_run_data, metrology_data)
    )
    distinct_run, distinct_incoming, distinct_metrology = (
        set(ids.unique()) for ids in (run_ids_run, run_ids_incoming_run, run_ids_metrology)
    )

    # One value per line, followed by a blank line between files.
    print(
        file_id,
        len(distinct_run),
        len(run_ids_run),
        len(run_ids_incoming_run),
        len(run_ids_metrology),
        distinct_run == distinct_incoming == distinct_metrology,
        sep='\n',
        end='\n\n',
    )

### Sensor Names

In [None]:
# Sensor names of file 1 serve as the reference for every training run file.
reference_sensors = run_data_1['Sensor Name'].unique()
print(reference_sensors)
for file_id in file_ids:
    run_data = globals()[f'run_data_{file_id}']
    # np.array_equal prints one boolean per file (same names in the same order) and
    # returns False when the number of sensors differs, a case in which an
    # element-wise `==` cannot compare the two arrays.
    print(np.array_equal(run_data['Sensor Name'].unique(), reference_sensors))

In [None]:
# Sensor names of incoming file 1 serve as the reference for every incoming file.
reference_incoming_sensors = incoming_run_data_1['Sensor Name'].unique()
print(reference_incoming_sensors)
for file_id in file_ids:
    incoming_run_data = globals()[f'incoming_run_data_{file_id}']
    # np.array_equal prints one boolean per file (same names in the same order) and
    # returns False when the number of sensors differs, a case in which an
    # element-wise `==` cannot compare the two arrays.
    print(np.array_equal(incoming_run_data['Sensor Name'].unique(), reference_incoming_sensors))

In [None]:
# Show the first five rows of run_data_1.
print(run_data_1.head(5))

In [None]:
# Count the distinct `Consumable Life` values in run_data_1.
print(run_data_1['Consumable Life'].nunique())

In [None]:
# Show the first five rows of incoming_run_data_1.
print(incoming_run_data_1.head(5))

### Process Step

* All run_data file entries share the same `Process Step` (intuitively since run_data focuses on the specific current process).
* All incoming_run_data file entries also share the same `Process Step`.

In [None]:
# Gather, over all run files, both the number of distinct `Process Step` values
# and the first such value; a single count and a single name are expected.
unique_processes = set()
for file_id in file_ids:
    process_steps = globals()[f'run_data_{file_id}']['Process Step']
    unique_processes.update((process_steps.nunique(), process_steps.unique()[0]))
print(unique_processes)

In [None]:
# Same check for the incoming run files: collect the number of distinct
# `Process Step` values and the first such value of every file.
unique_processes = set()
for file_id in file_ids:
    incoming_process_steps = globals()[f'incoming_run_data_{file_id}']['Process Step']
    unique_processes.update((incoming_process_steps.nunique(), incoming_process_steps.unique()[0]))
print(unique_processes)

### Time Stamp

In [None]:
# Print the number of distinct `Run Start Time` values of every training run file.
for file_id in file_ids:
    run_data = globals()[f'run_data_{file_id}']
    print(run_data['Run Start Time'].nunique())

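* Every run has `no_time_stamps` * 15 entries in `run_data` and `no_time_stamps` * 41 entries in `incoming_run_data`, where `no_time_stamps` is the number of distinct time stamps of that run (counted separately in each of the two files, so the two values can differ)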

In [None]:
# `run_1` holds the rows of the second distinct Run ID (position 1) of run_data_1.
second_run_id = run_data_1['Run ID'].unique()[1]
run_1 = run_data_1.loc[run_data_1['Run ID'] == second_run_id]
print(len(run_1))
print(run_1['Time Stamp'].value_counts())

In [None]:
# Count the rows of the selected run per `Step ID`.
print(run_1['Step ID'].value_counts())

In [None]:
# Pick the incoming rows of the same run: the second distinct Run ID of run_data_1.
second_run_id = run_data_1['Run ID'].unique()[1]
incoming_run_1 = incoming_run_data_1.loc[incoming_run_data_1['Run ID'] == second_run_id]
print(len(incoming_run_1))

In [None]:
# Number of rows expected for every distinct time stamp within a single run.
RUN_ROWS_PER_TIME_STAMP = 15
INCOMING_ROWS_PER_TIME_STAMP = 41


def _rows_and_time_stamps_per_run(frame):
    """Return (rows per Run ID, distinct `Time Stamp` values per Run ID) for `frame`.

    Both results are Series indexed by Run ID in order of first appearance.
    """
    time_stamps_by_run = frame.groupby('Run ID', sort=False)['Time Stamp']
    return time_stamps_by_run.size(), time_stamps_by_run.nunique()


for file_id in file_ids:
    run_data = globals()[f'run_data_{file_id}']
    incoming_run_data = globals()[f'incoming_run_data_{file_id}']
    # One grouped pass per frame replaces re-filtering the whole frame for every run.
    run_rows, run_stamps = _rows_and_time_stamps_per_run(run_data)
    incoming_rows, incoming_stamps = _rows_and_time_stamps_per_run(incoming_run_data)
    for run_id in tqdm(run_rows.index, desc=f'Checking run data for file {file_id}'):
        if run_rows.loc[run_id] != run_stamps.loc[run_id] * RUN_ROWS_PER_TIME_STAMP:
            print("Mismatch in run data for Run ID:", run_id)
        # A run that is absent from the incoming file counts as zero rows and zero
        # time stamps, which matches filtering the frame down to an empty selection.
        if incoming_rows.get(run_id, 0) != incoming_stamps.get(run_id, 0) * INCOMING_ROWS_PER_TIME_STAMP:
            print("Mismatch in incoming run data for Run ID:", run_id)

# No mismatches found, all runs have the expected number of time stamps.

* Within each run in `run_data` or `incoming_run_data`, the sensors start measuring at the beginning of the run and record data every second. However, they stop recording before the end of the run.

In [None]:
# Taking run_data_1 and incoming_run_data_1 as an example
def _duration_and_early_stop(run_frame):
    """Return (run length, gap between the last time stamp and the run end) in seconds.

    `run_frame` holds the rows of one run; start and end are read from its first row.
    """
    run_end = run_frame['Run End Time'].iloc[0]
    run_length = (run_end - run_frame['Run Start Time'].iloc[0]).total_seconds()
    early_stop = (run_end - run_frame['Time Stamp'].max()).total_seconds()
    return run_length, early_stop


run_durations = []
run_ending_early = []
incoming_run_durations = []
incoming_run_ending_early = []

# Split both frames by Run ID once instead of re-filtering them for every run.
# sort=False keeps the runs in order of first appearance.
incoming_by_run = {
    run_id: incoming_rows
    for run_id, incoming_rows in incoming_run_data_1.groupby('Run ID', sort=False)
}
for run_id, run in tqdm(run_data_1.groupby('Run ID', sort=False)):
    duration, early_stop = _duration_and_early_stop(run)
    run_durations.append(duration)
    run_ending_early.append(early_stop)

    # Every run of run_data_1 is expected in the incoming frame; a missing run raises.
    incoming_run = incoming_by_run[run_id]
    incoming_duration, incoming_early_stop = _duration_and_early_stop(incoming_run)
    incoming_run_durations.append(incoming_duration)
    incoming_run_ending_early.append(incoming_early_stop)

In [None]:
# Every run is expected to last 755 s and every incoming run 742 s.
for durations, expected_seconds in ((run_durations, 755.0), (incoming_run_durations, 742.0)):
    print(np.all(np.asarray(durations) == expected_seconds))

In [None]:
# Seconds between the last recorded time stamp and the run end, one value per run of run_data_1.
print(run_ending_early)

In [None]:
# Distribution of the gap between the last recorded time stamp and the run end,
# side by side for run_data_1 (left) and incoming_run_data_1 (right).
fig, (run_ax, incoming_ax) = plt.subplots(1, 2, figsize=(12, 6))
run_ax.hist(run_ending_early, bins=100)
run_ax.set(
    title='run_data_1',
    xlabel='Seconds between last time stamp and run end',
    ylabel='Number of runs',
)
incoming_ax.hist(incoming_run_ending_early, bins=100)
incoming_ax.set(
    title='incoming_run_data_1',
    xlabel='Seconds between last time stamp and run end',
    ylabel='Number of runs',
)
plt.show()

In [None]:
# Re-select the same run (second distinct Run ID of run_data_1) from both frames.
selected_run_id = run_data_1['Run ID'].unique()[1]
run_1 = run_data_1.loc[run_data_1['Run ID'] == selected_run_id]
incoming_run_1 = incoming_run_data_1.loc[incoming_run_data_1['Run ID'] == selected_run_id]
# The `Time Stamp` columns stay as loaded; converting them to whole seconds
# elapsed since the first time stamp of the run is not done here.

In [None]:
# Number of distinct runs in run_data_1.
print(run_data_1['Run ID'].nunique())

### Consumable Life

In [None]:
# Take the first row of every run in one pass: drop_duplicates keeps the first
# occurrence of each Run ID in order of appearance, which avoids filtering the
# whole frame once per run.
first_row_per_run = run_data_1.drop_duplicates(subset='Run ID')
start_times = first_row_per_run['Run Start Time'].tolist()
consumable_lives = first_row_per_run['Consumable Life'].tolist()

In [None]:
# One point per run of run_data_1: consumable life against the run start time.
fig, ax = plt.subplots()
ax.scatter(start_times, consumable_lives)
ax.set(
    title='run_data_1: Consumable Life per run',
    xlabel='Run Start Time',
    ylabel='Consumable Life',
)
plt.show()

In [None]:
# Print the distinct `Consumable Life` values of the second run of every file.
for file_id in file_ids:
    run_data = globals()[f'run_data_{file_id}']
    # A loop-local name is used on purpose: rebinding the notebook-level `run_1`
    # here would leave it pointing at a run of the last file, while
    # `incoming_run_1` still refers to file 1.
    second_run = run_data[run_data['Run ID'] == run_data['Run ID'].unique()[1]]
    print(second_run['Consumable Life'].unique())

In [None]:
# For the selected run and its incoming counterpart print, in this order:
# start time, duration in seconds, end time (all read from the first row).
for selected_rows in (run_1, incoming_run_1):
    window_start = selected_rows['Run Start Time'].iloc[0]
    window_end = selected_rows['Run End Time'].iloc[0]
    print(window_start)
    print((window_end - window_start).total_seconds())
    print(window_end)

In [None]:
# Ratio of run rows to incoming-run rows, printed per file.
for file_id in file_ids:
    rows_in_run = len(globals()[f'run_data_{file_id}'])
    rows_in_incoming = len(globals()[f'incoming_run_data_{file_id}'])
    print(rows_in_run / rows_in_incoming)

## Feature Engineering

### Initialization

In [None]:
# File 1 is the working sample for the feature-engineering steps.
sample_run_data, sample_incoming_run_data = run_data_1, incoming_run_data_1

### Column Modifications

#### Drop `Process Step`

In [None]:
# `drop` returns new frames, so run_data_1 / incoming_run_data_1 keep the column.
sample_run_data = sample_run_data.drop('Process Step', axis='columns')
sample_incoming_run_data = sample_incoming_run_data.drop('Process Step', axis='columns')

#### Replace `Run End Time` with duration of the run and rename it to `Run Duration`

In [None]:
# Whole seconds between run start and run end, stored in place of `Run End Time`
# (same column position) under the new name `Run Duration`.
run_duration_seconds = (
    sample_run_data['Run End Time'] - sample_run_data['Run Start Time']
).dt.total_seconds().astype(int)
sample_run_data = (
    sample_run_data
    .assign(**{'Run End Time': run_duration_seconds})
    .rename(columns={'Run End Time': 'Run Duration'})
)
print(sample_run_data.head(5))

#### Replace `Run Start Time` with the number of seconds elapsed since the start of the first run

In [None]:
# Origin of the time axis: the earliest run start in the sample.
start_time = sample_run_data['Run Start Time'].min()
elapsed_since_first_run = (sample_run_data['Run Start Time'] - start_time).dt.total_seconds()
sample_run_data['Run Start Time'] = elapsed_since_first_run.astype(int)

print(sample_run_data.head(5))

#### Test if `Time Stamp` is in seconds

In [None]:
# Prints True when every `Time Stamp` lies within `second_threshold` seconds of a
# whole number of seconds after the earliest time stamp.
second_threshold = 0.0001
start_time = sample_run_data['Time Stamp'].min()
elapsed_seconds = (sample_run_data['Time Stamp'] - start_time).dt.total_seconds()
nearest_second = elapsed_seconds.round() * pd.Timedelta(seconds=1) + start_time
deviations = (nearest_second - sample_run_data['Time Stamp']).abs()
largest_deviation = deviations.max()
print(largest_deviation < pd.Timedelta(seconds=second_threshold))

#### Replace `Time Stamp` with seconds elapsed since the start of the first run

In [None]:
# Measure `Time Stamp` from the start of the first run, the same origin used for
# the converted `Run Start Time` column. The origin is read from the untouched
# source frame run_data_1: `start_time` was rebound to the earliest time stamp by
# the whole-second check, and `sample_run_data['Run Start Time']` already holds
# integers at this point.
first_run_start = run_data_1['Run Start Time'].min()
sample_run_data['Time Stamp'] = (sample_run_data['Time Stamp'] - first_run_start).dt.total_seconds().round().astype(int)

#### Replace `Tool ID` with a categorical number

In [None]:
# Integer category codes stand in for the Tool ID values.
tool_id_codes = sample_run_data['Tool ID'].astype('category').cat.codes
sample_run_data['Tool ID'] = tool_id_codes

print(sample_run_data.head(5))

#### Step ID uniqueness

In [None]:
# Every run file is expected to contain 13 distinct Step IDs; `all` stops at the
# first file that deviates.
if all(globals()[f'run_data_{file_id}']['Step ID'].nunique() == 13 for file_id in file_ids):
    print('All step counts are the same')
else:
    print('Not all step counts are the same')

# Every incoming run file is expected to contain 8 distinct Step IDs.
if all(globals()[f'incoming_run_data_{file_id}']['Step ID'].nunique() == 8 for file_id in file_ids):
    print('All incoming step counts are the same')
else:
    print('Not all incoming step counts are the same')

#### Replace `Run ID`, `Step ID` and `Sensor Name` with a categorical number

In [None]:
# Swap each identifier column for its integer category codes.
for column_name in ('Run ID', 'Step ID', 'Sensor Name'):
    sample_run_data[column_name] = sample_run_data[column_name].astype('category').cat.codes

print(sample_run_data.head(5))

### Column Collinearity (VIF)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calculate_vif(dataframe):
    """Compute the variance inflation factor (VIF) of every column of `dataframe`.

    Args:
        dataframe: DataFrame whose columns are all numeric; every column is passed
            to `variance_inflation_factor` as one explanatory variable.

    Returns:
        DataFrame with one row per input column: `feature` holds the column name
        and `VIF` the value `variance_inflation_factor` returns for that column.

    Note:
        The columns are converted to float first, so a column that cannot be
        converted makes the call fail.
    """
    # Build the design matrix once; `dataframe.values` inside the comprehension
    # would rebuild the same array for every column.
    design_matrix = dataframe.to_numpy(dtype=float)
    return pd.DataFrame({
        'feature': dataframe.columns,
        'VIF': [
            variance_inflation_factor(design_matrix, column_position)
            for column_position in range(design_matrix.shape[1])
        ],
    })

# VIF over all numeric columns of the prepared sample.
vif_columns = [
    'Run Start Time',
    'Run Duration',
    'Run ID',
    'Consumable Life',
    'Step ID',
    'Time Stamp',
    'Sensor Name',
    'Sensor Value',
]
print(calculate_vif(sample_run_data[vif_columns]))

In [None]:
# Scatter of the two time columns of the prepared sample.
fig, ax = plt.subplots()
ax.scatter(sample_run_data['Run Start Time'], sample_run_data['Time Stamp'], alpha=0.1)
ax.set(xlabel='Run Start Time', ylabel='Time Stamp')

# Calculate the correlation coefficient
correlation = sample_run_data['Run Start Time'].corr(sample_run_data['Time Stamp'])
print(f"Correlation between 'Run Start Time' and 'Time Stamp': {correlation:.10f}")

#### Possible to remove `Time Stamp` and `Run Duration`

In [None]:
# VIF again, this time without `Time Stamp` and `Run Duration`.
reduced_vif_columns = [
    'Run Start Time',
    'Run ID',
    'Consumable Life',
    'Step ID',
    'Sensor Name',
    'Sensor Value',
]
print(calculate_vif(sample_run_data[reduced_vif_columns]))

In [None]:
# `Time Stamp` is almost y=x
# Series.corr pairs values by index label, so the row positions are given the
# frame's own index; the result then does not depend on how the frame is indexed.
time_stamps = sample_run_data['Time Stamp']
row_positions = pd.Series(np.arange(len(time_stamps)), index=time_stamps.index)
print(time_stamps.corr(row_positions))

## Test Dataset

* Contains data for 20 different tools (1 more than train datasets)
* Contains the missing runs from the train datasets, so when combined every tool has 250 different runs

In [None]:
# Do the two test files cover exactly the same set of Tool IDs?
set(test_run_data['Tool ID'].unique()) == set(test_incoming_run_data['Tool ID'].unique())

In [None]:
# Is every Tool ID collected from the training files also present in the test run data?
unique_tools.issubset(set(test_run_data['Tool ID'].unique()))

In [None]:
# Tool IDs that occur in the test run data but were not collected from the training files.
set(test_run_data['Tool ID'].unique()).difference(unique_tools)

In [None]:
# Tool ID of the first row of run_data_1. `.iloc[0]` selects by position, whereas
# `[0]` looks up the index label 0 and fails when the frame is not labelled from 0.
print(run_data_1['Tool ID'].iloc[0])

In [None]:
# Preview the test rows of one hard-coded Tool ID.
# NOTE(review): presumably a tool of interest from the comparisons above — confirm which one.
test_run_data[test_run_data['Tool ID'] == '8060e8e1-504a-5138-a9f0-e2770bd61ba1'].head()

In [None]:
# Preview the first rows of run_data_1 (rich display).
run_data_1.head()

In [None]:
# Per tool in the test run data: Tool ID, number of distinct runs, number of rows.
for tool_id, tool_rows in test_run_data.groupby('Tool ID'):
    print(tool_id, tool_rows['Run ID'].nunique(), len(tool_rows), sep='\n')