In [1]:
from utils import (
    preprocess_pipeline,
    get_models,
    evaluate_multiple,
    evaluate_parallel,
)
import pandas as pd

In [2]:
df = pd.read_csv("train_machine.csv")
df.head()

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


In [3]:
df.columns

Index(['id', 'Product ID', 'Type', 'Air temperature [K]',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF',
       'RNF'],
      dtype='object')

In [4]:
X = df.drop(columns=["id", "Product ID", "Machine failure"])
y = df[["Machine failure"]].values.flatten()

In [5]:
X.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
0,L,300.6,309.6,1596,36.1,140,0,0,0,0,0
1,M,302.6,312.1,1759,29.1,200,0,0,0,0,0
2,L,299.3,308.5,1805,26.5,25,0,0,0,0,0
3,L,301.0,310.9,1524,44.3,197,0,0,0,0,0
4,M,298.0,309.0,1641,35.4,34,0,0,0,0,0


In [6]:
y

array([0, 0, 0, ..., 0, 0, 0], shape=(136429,))

In [7]:
X_pre, pre = preprocess_pipeline(X)

In [8]:
X_pre

array([[ 0.3958803 , -0.24623038,  0.54541592, ..., -0.04756777,
         1.        ,  0.        ],
       [ 1.46985559,  1.55860483,  1.72030819, ..., -0.04756777,
         0.        ,  1.        ],
       [-0.30220363, -1.04035788,  2.05187289, ..., -0.04756777,
         1.        ,  0.        ],
       ...,
       [ 0.34218154,  1.34202461,  0.0264451 , ..., -0.04756777,
         1.        ,  0.        ],
       [ 0.98656671,  0.69228393, -0.52856537, ..., -0.04756777,
         1.        ,  0.        ],
       [-1.59097397, -1.32913151,  0.26430672, ..., -0.04756777,
         1.        ,  0.        ]], shape=(136429, 12))

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_pre, y, test_size=0.3, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((95500, 12), (40929, 12), (95500,), (40929,))

In [10]:
models = get_models()
models

[LogisticRegression(),
 DecisionTreeClassifier(),
 HistGradientBoostingClassifier(),
 XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               feature_weights=None, gamma=None, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=None, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
               max_leaves=None, min_child_weight=None, missing=nan,
               monotone_constraints=None, multi_strategy=None, n_estimators=None,
               n_jobs=None, num_parallel_tree=None, ...)]

In [11]:
best_model, res = evaluate_multiple(models, X_train, y_train, X_test, y_test)

{'name': 'LogisticRegression', 'model': LogisticRegression(), 'train_score': 0.9319, 'test_score': 0.9272, 'cv_score': np.float64(0.9319), 'model_time': 7.0658}
{'name': 'DecisionTreeClassifier', 'model': DecisionTreeClassifier(), 'train_score': 0.9975, 'test_score': 0.8659, 'cv_score': np.float64(0.8776), 'model_time': 7.78}
{'name': 'HistGradientBoostingClassifier', 'model': HistGradientBoostingClassifier(), 'train_score': 0.9357, 'test_score': 0.9282, 'cv_score': np.float64(0.9286), 'model_time': 12.2625}
{'name': 'XGBClassifier', 'model': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin

In [12]:
res

Unnamed: 0,name,model,train_score,test_score,cv_score,model_time
0,LogisticRegression,LogisticRegression(),0.9319,0.9272,0.9319,7.0658
1,HistGradientBoostingClassifier,HistGradientBoostingClassifier(),0.9357,0.9282,0.9286,12.2625
2,XGBClassifier,"XGBClassifier(base_score=None, booster=None, c...",0.9561,0.9272,0.9263,6.5484
3,DecisionTreeClassifier,DecisionTreeClassifier(),0.9975,0.8659,0.8776,7.78


In [13]:
best_model, res = evaluate_parallel(models, "threading", X_train, y_train, X_test, y_test)

{'name': 'LogisticRegression', 'model': LogisticRegression(), 'train_score': 0.9319, 'test_score': 0.9272, 'cv_score': np.float64(0.9319), 'model_time': 1.9171}
{'name': 'XGBClassifier', 'model': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, ...), 'train_score': 0.9561, 'test_score': 0.927

In [14]:
res

Unnamed: 0,name,model,train_score,test_score,cv_score,model_time
0,LogisticRegression,LogisticRegression(),0.9319,0.9272,0.9319,1.9171
1,HistGradientBoostingClassifier,HistGradientBoostingClassifier(),0.9331,0.9272,0.9307,9.2228
2,XGBClassifier,"XGBClassifier(base_score=None, booster=None, c...",0.9561,0.9272,0.9263,5.8889
3,DecisionTreeClassifier,DecisionTreeClassifier(),0.9975,0.8628,0.8725,10.0887


In [15]:
best_model, res = evaluate_parallel(models, "loky", X_train, y_train, X_test, y_test)

Total time: 20.5385 seconds


In [16]:
res

Unnamed: 0,name,model,train_score,test_score,cv_score,model_time
0,LogisticRegression,LogisticRegression(),0.9319,0.9272,0.9319,1.347
1,HistGradientBoostingClassifier,HistGradientBoostingClassifier(),0.937,0.9278,0.9305,18.2176
2,XGBClassifier,"XGBClassifier(base_score=None, booster=None, c...",0.9561,0.9272,0.9263,11.9813
3,DecisionTreeClassifier,DecisionTreeClassifier(),0.9975,0.8662,0.8734,9.6607
