In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import shapiro
from statsmodels.formula.api import ols


In [2]:
# Resolve the notebook's working directory and the sibling folder that holds
# the preprocessed data (one level up, in "T04_preprocess").
CURRENT_DIR = Path.cwd()
PARENT_DIR = CURRENT_DIR.parent
DATA_DIR = PARENT_DIR.joinpath("T04_preprocess")

# Echo both locations, one per line, so the reader can verify them.
print(CURRENT_DIR, DATA_DIR, sep="\n")

c:\Users\admin\Coding\research\weld-ml\src\P02_data\T05_EA_center_loc_7
c:\Users\admin\Coding\research\weld-ml\src\P02_data\T04_preprocess


In [3]:
# Build the workbook path as a plain string and read it into a DataFrame.
filepath = os.fspath(DATA_DIR / "S10_center_location_7_individual.xlsx")
dfm = pd.read_excel(filepath)

# Display the loaded frame as the cell output.
dfm

Unnamed: 0,idx_excel_post,section,sample_no,location,R,W,D,sigma_x_post,FWHM_post,measurement,diff_sigma_x
0,21,Center,1,7,1400,60,10,6,2.51,first,6
1,42,Center,2,7,1400,60,15,2,2.48,first,2
2,63,Center,3,7,1400,60,20,8,2.54,first,8
3,84,Center,4,7,1400,70,10,3,2.58,first,3
4,105,Center,5,7,1400,70,15,6,2.43,first,6
...,...,...,...,...,...,...,...,...,...,...,...
157,73,Center,52,7,1600,80,10,3,2.44,second,3
158,5,Center,53,7,1600,80,15,5,2.43,second,5
159,59,Center,53,7,1600,80,15,5,2.45,second,5
160,3,Center,54,7,1600,80,20,2,2.42,second,2


In [4]:
# Standardize columns R, W and D to z-scores: subtract each column's mean and
# divide by its standard deviation (pandas' default `.std()`).
# The columns of `dfm` are overwritten in place.
for col in ("R", "W", "D"):
    dfm[col] = dfm[col].sub(dfm[col].mean()).div(dfm[col].std())

# Display the frame with the rescaled columns.
dfm

Unnamed: 0,idx_excel_post,section,sample_no,location,R,W,D,sigma_x_post,FWHM_post,measurement,diff_sigma_x
0,21,Center,1,7,-1.220959,-1.220959,-1.220959,6,2.51,first,6
1,42,Center,2,7,-1.220959,-1.220959,0.000000,2,2.48,first,2
2,63,Center,3,7,-1.220959,-1.220959,1.220959,8,2.54,first,8
3,84,Center,4,7,-1.220959,0.000000,-1.220959,3,2.58,first,3
4,105,Center,5,7,-1.220959,0.000000,0.000000,6,2.43,first,6
...,...,...,...,...,...,...,...,...,...,...,...
157,73,Center,52,7,1.220959,1.220959,-1.220959,3,2.44,second,3
158,5,Center,53,7,1.220959,1.220959,0.000000,5,2.43,second,5
159,59,Center,53,7,1.220959,1.220959,0.000000,5,2.45,second,5
160,3,Center,54,7,1.220959,1.220959,1.220959,2,2.42,second,2


## Understanding the ANOVA table columns:

- `df`: Degrees of freedom - the number of independent pieces of information for each source
- `sum_sq`: Sum of squares - the total variation attributed to each factor
- `mean_sq`: Mean square - sum of squares divided by degrees of freedom (variance estimate)
- `F`: F-statistic - the ratio of factor variance to residual variance
- `PR(>F)`: P-value - probability of observing an F-statistic at least this large if the factor truly had no effect (i.e. by chance alone)

## What is `typ` (the ANOVA sum-of-squares type)?

- typ=1 (Type I): sequential SS, each term tested in the order it appears in the formula.
- typ=2 (Type II): each term (factor or covariate) is tested after all other terms that do not contain it, i.e. adjusted for the other main effects but ignoring higher‑order interactions that include it.
  This is usually recommended when the model is reasonably balanced and you want tests that respect the marginality principle (main effects evaluated in the presence of other main effects, but not “penalized” by interactions).
- typ=3 (Type III): each term tested after all other terms including interactions; often used in software like SPSS, especially for unbalanced designs, but interpretation of main effects with strong interactions can be tricky.


In [5]:
# Keep only the rows whose `section` is "Center".
filt = dfm["section"].isin(["Center"])
dfa = dfm.loc[filt]

# Model: diff_sigma_x as a linear function of the R, W and D columns.
# Alternative kept for reference (factors wrapped in C(), plus section):
# formula = "diff_sigma_x ~ C(R) + C(W) + C(D) + C(section)"
formula = "diff_sigma_x ~ R + W + D"

# Ordinary least squares fit on the filtered rows.
model = ols(formula=formula, data=dfa).fit()

# ANOVA table for the fitted model. No `typ` is passed, so statsmodels'
# default applies (typ=1, sequential sums of squares).
anova_results = sm.stats.anova_lm(model)

# Show the table ordered by p-value, smallest first.
anova_results.sort_values("PR(>F)")

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
W,1.0,31.148148,31.148148,1.878061,0.172499
R,1.0,3.703704,3.703704,0.223313,0.637179
D,1.0,2.675926,2.675926,0.161344,0.688465
Residual,158.0,2620.472222,16.585267,,


In [6]:
# Show the full regression summary of the fitted OLS `model`
# (coefficients, standard errors, fit statistics and diagnostics).
model.summary()

0,1,2,3
Dep. Variable:,diff_sigma_x,R-squared:,0.014
Model:,OLS,Adj. R-squared:,-0.005
Method:,Least Squares,F-statistic:,0.7542
Date:,"Sun, 11 Jan 2026",Prob (F-statistic):,0.521
Time:,22:18:39,Log-Likelihood:,-455.33
No. Observations:,162,AIC:,918.7
Df Residuals:,158,BIC:,931.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.5556,0.320,14.238,0.000,3.924,5.188
R,-0.1517,0.321,-0.473,0.637,-0.786,0.482
W,0.4398,0.321,1.370,0.172,-0.194,1.074
D,0.1289,0.321,0.402,0.688,-0.505,0.763

0,1,2,3
Omnibus:,2.298,Durbin-Watson:,1.427
Prob(Omnibus):,0.317,Jarque-Bera (JB):,2.173
Skew:,0.006,Prob(JB):,0.337
Kurtosis:,3.567,Cond. No.,1.0
