In [13]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split

RAW = Path("../data_raw")
PROC = Path("../data_proc")
PROC.mkdir(exist_ok=True, parents=True)

RAW_FILE = RAW / "ai4i2020.csv"            # full dataset
SAMPLE_FILE = RAW / "ai4i2020_sample.csv"  # fast-iteration sample (optional)


In [14]:
df = pd.read_csv(RAW_FILE)
print(df.shape)
df.head(10)


(10000, 14)


Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
5,6,M14865,M,298.1,308.6,1425,41.9,11,0,0,0,0,0,0
6,7,L47186,L,298.1,308.6,1558,42.4,14,0,0,0,0,0,0
7,8,L47187,L,298.1,308.6,1527,40.2,16,0,0,0,0,0,0
8,9,M14868,M,298.3,308.7,1667,28.6,18,0,0,0,0,0,0
9,10,M14869,M,298.5,309.0,1741,28.0,21,0,0,0,0,0,0


In [15]:
df.info()
df.isna().sum()
df.duplicated().sum()
df['Machine failure'].value_counts(normalize=True)  # target balance


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Machine failure          10000 non-null  int64  
 9   TWF                      10000 non-null  int64  
 10  HDF                      10000 non-null  int64  
 11  PWF                      10000 non-null  int64  
 12  OSF                      10000 non-null  int64  
 13  RNF                      10000 non-null  int64  
dtypes: float64(3), int64(9)

Machine failure
0    0.9661
1    0.0339
Name: proportion, dtype: float64

In [16]:
df = df.rename(columns={
    "Air temperature [K]": "air_temp_k",
    "Process temperature [K]": "proc_temp_k",
    "Rotational speed [rpm]": "rpm",
    "Torque [Nm]": "torque_nm",
    "Tool wear [min]": "tool_wear_min",
    "Machine failure": "failure",
    "Product ID": "product_id",
    "Type": "type"
})
# Drop pure identifiers (no predictive power)
df = df.drop(columns=["UDI", "product_id"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   type           10000 non-null  object 
 1   air_temp_k     10000 non-null  float64
 2   proc_temp_k    10000 non-null  float64
 3   rpm            10000 non-null  int64  
 4   torque_nm      10000 non-null  float64
 5   tool_wear_min  10000 non-null  int64  
 6   failure        10000 non-null  int64  
 7   TWF            10000 non-null  int64  
 8   HDF            10000 non-null  int64  
 9   PWF            10000 non-null  int64  
 10  OSF            10000 non-null  int64  
 11  RNF            10000 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 937.6+ KB


In [17]:
# Temperature delta (often predictive)
df["temp_diff_k"] = df["proc_temp_k"] - df["air_temp_k"]

# Simple “power-like” interaction
df["power"] = df["torque_nm"] * df["rpm"]

# One-hot encode categorical 'type' (A/B/C)
df = pd.get_dummies(df, columns=["type"], drop_first=False)

df.head()


Unnamed: 0,air_temp_k,proc_temp_k,rpm,torque_nm,tool_wear_min,failure,TWF,HDF,PWF,OSF,RNF,temp_diff_k,power,type_H,type_L,type_M
0,298.1,308.6,1551,42.8,0,0,0,0,0,0,0,10.5,66382.8,False,False,True
1,298.2,308.7,1408,46.3,3,0,0,0,0,0,0,10.5,65190.4,False,True,False
2,298.1,308.5,1498,49.4,5,0,0,0,0,0,0,10.4,74001.2,False,True,False
3,298.2,308.6,1433,39.5,7,0,0,0,0,0,0,10.4,56603.5,False,True,False
4,298.2,308.7,1408,40.0,9,0,0,0,0,0,0,10.5,56320.0,False,True,False


In [18]:
TARGET = "failure"
y = df[TARGET].astype(int)

feature_cols = [c for c in df.columns if c != TARGET]
X = df[feature_cols]
len(feature_cols), feature_cols[:8]


(15,
 ['air_temp_k',
  'proc_temp_k',
  'rpm',
  'torque_nm',
  'tool_wear_min',
  'TWF',
  'HDF',
  'PWF'])

In [19]:
# 70 / 15 / 15 split with stratification
# stratify=y → maintains the same ratio of 0s and 1s (failures vs non-failures) in both sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

for name, part in [("train", y_train), ("val", y_val), ("test", y_test)]:
    print(name, "shape:", part.shape, "positive rate:", part.mean().round(4))


train shape: (7000,) positive rate: 0.0339
val shape: (1500,) positive rate: 0.034
test shape: (1500,) positive rate: 0.034


In [20]:
# Save as parquet (fast & compact)
X_train.to_parquet(PROC / "X_train.parquet", index=False)
y_train.to_frame("failure").to_parquet(PROC / "y_train.parquet", index=False)

X_val.to_parquet(PROC / "X_val.parquet", index=False)
y_val.to_frame("failure").to_parquet(PROC / "y_val.parquet", index=False)

X_test.to_parquet(PROC / "X_test.parquet", index=False)
y_test.to_frame("failure").to_parquet(PROC / "y_test.parquet", index=False)

# also save the full feature list for later use
pd.Series(feature_cols).to_json(PROC / "feature_list.json", orient="values")
print("✅ Saved cleaned splits and feature_list to:", PROC)


✅ Saved cleaned splits and feature_list to: ..\data_proc


In [21]:
import pandas as pd
from pathlib import Path

# Path to your test data
PROC = Path("../data_proc")  # adjust path if needed
X_test = pd.read_parquet(PROC / "X_test.parquet")
y_test = pd.read_parquet(PROC / "y_test.parquet")

print("Shape:", X_test.shape)
print("Shape:", y_test.shape)
display(X_test.head(2))
display(y_test.head(2))

# df_test = X_test.copy()
# df_test["failure"] = y_test.values
# failures = df_test[df_test["failure"] == 1]
# print("Total failures:", len(failures))
# display(failures.head())

Shape: (1500, 15)
Shape: (1500, 1)


Unnamed: 0,air_temp_k,proc_temp_k,rpm,torque_nm,tool_wear_min,TWF,HDF,PWF,OSF,RNF,temp_diff_k,power,type_H,type_L,type_M
0,298.1,309.0,1557,36.0,56,0,0,0,0,0,10.9,56052.0,False,True,False
1,297.7,308.5,1559,35.2,127,0,0,0,0,0,10.8,54876.8,False,False,True


Unnamed: 0,failure
0,0
1,0
