### Import Dataset

In [1]:
import pandas as pd

# Monitoring window for every recording: key -> (start, end), both written as
# 12-hour clock strings. The keys double as the CSV base names listed below.
monitoring_time = dict(
    New_Nozzle_set1=("4:49:00 pm", "4:53:59 pm"),
    New_Nozzle_set2=("4:59:00 pm", "5:03:59 pm"),
    New_Nozzle_set3=("5:10:00 pm", "5:14:59 pm"),
    Worst_Nozzle_set1=("6:10:00 pm", "6:14:59 pm"),
    Worst_Nozzle_set2=("10:59:00 am", "11:03:59 am"),
    Worst_Nozzle_set3=("11:13:00 am", "11:17:59 am"),
)

# CSV base names (without the ".csv" extension) for the two nozzle conditions
g_csv_name = [f"New_Nozzle_set{set_no}" for set_no in (1, 2, 3)]
b_csv_name = [f"Worst_Nozzle_set{set_no}" for set_no in (1, 2, 3)]

# Read CSVs into DataFrames
def load_and_filter_csv(file_name, start_time, end_time, data_dir="../data"):
    """Load one recording and keep only the rows inside a time window.

    Parameters
    ----------
    file_name : str
        CSV base name; the file read is ``<data_dir>/<file_name>.csv``.
    start_time, end_time : str
        Window bounds as 12-hour clock strings such as ``"4:49:00 pm"``.
        Both bounds are inclusive.
    data_dir : str or path-like, default "../data"
        Directory that contains the CSV file.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows whose ``Time`` lies in
        ``[start_time, end_time]``, with ``Time`` converted to datetime.
        The original row index is preserved.
    """
    # The column and both bounds are parsed with the same date-less format, so
    # all values share the parser's default date and compare by time of day.
    time_format = '%I:%M:%S %p'

    df = pd.read_csv(f"{data_dir}/{file_name}.csv")
    df['Time'] = pd.to_datetime(df['Time'], format=time_format)

    window_start = pd.to_datetime(start_time, format=time_format)
    window_end = pd.to_datetime(end_time, format=time_format)

    # Series.between is inclusive on both ends by default (same as >= and <=).
    in_window = df['Time'].between(window_start, window_end)

    # Return a copy so callers can add columns without SettingWithCopyWarning.
    return df.loc[in_window].copy()

# Good nozzle: load every recording trimmed to its monitoring window, stack the
# pieces into one frame, and tag all rows with Nozzle_Condition = 1.
filtered_g_df = [
    load_and_filter_csv(csv_name, *monitoring_time[csv_name])
    for csv_name in g_csv_name
]
g_df = pd.concat(filtered_g_df, ignore_index=True).assign(Nozzle_Condition=1)

# Bad (worst) nozzle: same procedure, tagged with Nozzle_Condition = 0.
filtered_b_df = [
    load_and_filter_csv(csv_name, *monitoring_time[csv_name])
    for csv_name in b_csv_name
]
b_df = pd.concat(filtered_b_df, ignore_index=True).assign(Nozzle_Condition=0)



In [2]:
# Sanity check on the combined good-nozzle frame: print its (rows, columns)
# shape, then show the first five rows as the cell output.
print(g_df.shape)
g_df.head()

(900, 17)


Unnamed: 0,Date,Time,ms,Data_Type,ABB_X,ABB_Y,ABB_Z,Chamber_Temp,Chamber_Pressure,Main_Gas_Flow,PF1_Gas_Flow,PF2_Gas_Flow,PF1_LibraWeight,PF2_LibraWeight,PF1_Pressure,PF2_Pressure,Nozzle_Condition
0,5/12/2024,1900-01-01 16:49:00,0,Double[12],0,0,0,1000,4.995,1797,301.9,0,62.26,62.61,5.187,0.002,1
1,5/12/2024,1900-01-01 16:49:01,0,Double[12],0,0,0,1000,4.995,1797,299.4,0,62.26,62.61,5.187,0.002,1
2,5/12/2024,1900-01-01 16:49:02,0,Double[12],0,0,0,1000,4.995,1797,300.4,0,62.26,62.61,5.187,0.002,1
3,5/12/2024,1900-01-01 16:49:03,0,Double[12],0,0,0,1000,4.995,1797,298.3,0,62.26,62.61,5.187,0.002,1
4,5/12/2024,1900-01-01 16:49:04,0,Double[12],0,0,0,1000,4.993,1797,298.6,0,62.26,62.61,5.187,0.002,1


In [3]:
# Sanity check on the combined bad-nozzle frame: print its (rows, columns)
# shape, then show the first five rows as the cell output.
print(b_df.shape)
b_df.head()

(900, 17)


Unnamed: 0,Date,Time,ms,Data_Type,ABB_X,ABB_Y,ABB_Z,Chamber_Temp,Chamber_Pressure,Main_Gas_Flow,PF1_Gas_Flow,PF2_Gas_Flow,PF1_LibraWeight,PF2_LibraWeight,PF1_Pressure,PF2_Pressure,Nozzle_Condition
0,4/12/2024,1900-01-01 18:10:00,0,Double[12],-509.83,-2213.82,1278.23,1000,4.998,1741,300.6,0,62.26,62.61,5.182,0.002,0
1,4/12/2024,1900-01-01 18:10:01,0,Double[12],-509.83,-2213.82,1278.23,1000,4.998,1737,299.4,0,62.26,62.61,5.182,0.002,0
2,4/12/2024,1900-01-01 18:10:02,0,Double[12],-509.83,-2213.82,1278.23,1000,4.998,1737,297.4,0,62.26,62.61,5.182,0.002,0
3,4/12/2024,1900-01-01 18:10:03,0,Double[12],-509.83,-2213.82,1278.23,1000,4.995,1741,297.4,0,62.26,62.61,5.182,0.002,0
4,4/12/2024,1900-01-01 18:10:04,0,Double[12],-509.83,-2213.82,1278.23,1000,4.995,1736,298.7,0,62.26,62.6,5.182,0.002,0


In [8]:
from itertools import combinations, product
from scipy.stats import ttest_ind

good_data = g_df['Main_Gas_Flow'].to_numpy()
bad_data = b_df['Main_Gas_Flow'].to_numpy()

# Step 1: Split each series into consecutive chunks of `sub_size` readings.
# With the 900-row frames loaded above this gives 15 sub-datasets per condition.
sub_size = 60
good_subs = [good_data[i:i+sub_size] for i in range(0, len(good_data), sub_size)]
bad_subs = [bad_data[i:i+sub_size] for i in range(0, len(bad_data), sub_size)]


def _ttest_rows(indexed_pairs, name1, name2, alpha=0.05):
    """Run a two-sample t-test on every pair and return one result row per pair.

    Parameters
    ----------
    indexed_pairs : iterable of ((i, sample_i), (j, sample_j))
        Pairs of enumerated sub-datasets to compare.
    name1, name2 : str
        Prefixes used to build the ``Dataset1`` / ``Dataset2`` identifiers.
    alpha : float, default 0.05
        Threshold below which a p-value is flagged as significant.

    Returns
    -------
    list of tuple
        ``(dataset1, dataset2, p_value, t_test)`` where ``t_test`` is 1 when
        ``p_value < alpha`` and 0 otherwise (a NaN p-value therefore gives 0).
    """
    rows = []
    for (i, first), (j, second) in indexed_pairs:
        # Only the p-value is needed; the t statistic is discarded.
        _, p_val = ttest_ind(first, second)
        rows.append((f'{name1}_{i}', f'{name2}_{j}', p_val, 1 if p_val < alpha else 0))
    return rows


# Steps 2-3: Compare sub-datasets pairwise without duplication.
# Each entry is (label, rows): 1 = Good vs Good, 0 = Good vs Bad, -1 = Bad vs Bad.
# NOTE(review): ttest_ind defaults to the equal-variance test and assumes
# independent observations, and no multiple-comparison correction is applied
# across the comparisons -- confirm this is appropriate for these readings.
comparison_groups = [
    # Good vs Good (unique pairs only)
    (1, _ttest_rows(combinations(enumerate(good_subs), 2), 'good', 'good')),
    # Good vs Bad (all pairs)
    (0, _ttest_rows(product(enumerate(good_subs), enumerate(bad_subs)), 'good', 'bad')),
    # Bad vs Bad (unique pairs only)
    (-1, _ttest_rows(combinations(enumerate(bad_subs), 2), 'bad', 'bad')),
]

# Flatten into the two parallel lists, keeping rows and labels in lock-step.
comparisons = []
labels = []
for group_label, group_rows in comparison_groups:
    comparisons.extend(group_rows)
    labels.extend([group_label] * len(group_rows))

# Step 4: Organize results into a DataFrame
comparison_df = pd.DataFrame(comparisons, columns=['Dataset1', 'Dataset2', 'P_Value', 'T_Test'])
comparison_df['Label'] = labels

# Display results
comparison_df


Unnamed: 0,Dataset1,Dataset2,P_Value,T_Test,Label
0,good_0,good_1,1.246243e-19,1,1
1,good_0,good_2,4.034869e-11,1,1
2,good_0,good_3,9.372630e-19,1,1
3,good_0,good_4,1.926053e-12,1,1
4,good_0,good_5,2.394817e-18,1,1
...,...,...,...,...,...
430,bad_11,bad_13,2.445614e-15,1,-1
431,bad_11,bad_14,5.135635e-08,1,-1
432,bad_12,bad_13,9.909127e-06,1,-1
433,bad_12,bad_14,1.329469e-13,1,-1


In [11]:
# Group by 'Label' and count the number of 0s and 1s in 'T_Test'.
group_counts = (
    comparison_df.groupby('Label')['T_Test']
    .value_counts()
    .unstack(fill_value=0)
    # Force both outcome columns to exist, in a fixed 0-then-1 order. Without
    # this, a run where every test has the same outcome yields one column and
    # the two-name rename below fails with a length mismatch.
    .reindex(columns=[0, 1], fill_value=0)
)

# Convert counts to row-wise percentages (each Label row sums to 100)
group_percentages = group_counts.div(group_counts.sum(axis=1), axis=0) * 100

# Rename columns for clarity; the order matches the [0, 1] reindex above
group_percentages.columns = ['t_test = 0 (No Significant %)', 't_test = 1 (Significant %)']

# Display the percentages
print(group_percentages)


       t_test = 0 (No Significant %)  t_test = 1 (Significant %)
Label                                                           
-1                          6.666667                   93.333333
 0                          9.777778                   90.222222
 1                         16.190476                   83.809524
