### Preparation Pipeline

In [2]:
# Load dataset
from pathlib import Path
import pandas as pd

# The raw file ships without a header row, so the column names are supplied here.
columns = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"]
data_path = Path("data/car.data")

# Read first, then cast every column to str so all values are handled as
# plain string labels downstream.
df = pd.read_csv(data_path, names=columns, header=None)
df = df.astype(str)

# Quick sanity check: dimensions plus the first few rows.
print("Shape:", df.shape)
display(df.head())

Shape: (1728, 7)


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
# Train/Test split with One-Hot Encoding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

# Separate the label column from the predictor columns.
target_col = "class"
feature_cols = [name for name in df.columns if name != target_col]

y = df[target_col]
X = df[feature_cols]

# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# All feature columns are treated as categorical and one-hot encoded.
categorical_features = feature_cols

# handle_unknown="ignore": a category not seen during fit does not raise at
# transform time.
preprocess = ColumnTransformer(
    [("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)]
)

# Encoding and model live in one pipeline, so the encoder is fitted on the
# training split only.
clf = Pipeline(
    [
        ("preprocess", preprocess),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

# Fit on the training split, then report per-class metrics on the held-out split.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         acc       0.77      0.79      0.78        77
        good       0.64      0.50      0.56        14
       unacc       0.96      0.96      0.96       242
       vgood       0.85      0.85      0.85        13

    accuracy                           0.90       346
   macro avg       0.80      0.78      0.79       346
weighted avg       0.90      0.90      0.90       346



In [4]:
# Export processed splits (optional)
from pathlib import Path
# Destination folder for the exported splits; created on demand.
out_dir = Path("data/processed_car")
out_dir.mkdir(parents=True, exist_ok=True)

target_col = "class"
feature_cols = [name for name in df.columns if name != target_col]

# Re-attach the labels to each split. `.values` is used so the labels are
# placed positionally rather than aligned on the index.
df_train = X_train.assign(**{target_col: y_train.values})
df_test = X_test.assign(**{target_col: y_test.values})

# Write both files first, then report them.
train_csv = out_dir / "train.csv"
test_csv = out_dir / "test.csv"
df_train.to_csv(train_csv, index=False)
df_test.to_csv(test_csv, index=False)

print("Saved:", train_csv)
print("Saved:", test_csv)

Saved: data/processed_car/train.csv
Saved: data/processed_car/test.csv


In [5]:
# End-to-end run from the raw file: load, split, train, evaluate, and persist
# the fitted pipeline. The imports below cover every name this cell uses
# (including pandas and OneHotEncoder), so it does not depend on earlier cells.
from pathlib import Path

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# The raw file has no header row, so column names are supplied explicitly.
# Every column is cast to str so all values are handled as string labels.
data_path = Path("data/car.data")
columns = ["buying","maint","doors","persons","lug_boot","safety","class"]
df = pd.read_csv(data_path, header=None, names=columns).astype(str)

target_col = "class"
feature_cols = [c for c in df.columns if c != target_col]

X = df[feature_cols]
y = df[target_col]

# All feature columns are treated as categorical and one-hot encoded.
categorical_features = feature_cols

# sparse_output=False makes the encoder emit a dense array;
# handle_unknown="ignore" avoids an error on categories not seen during fit.
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_features)
    ]
)

pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LogisticRegression(max_iter=1000))
])

# Stratified 80/20 split with a fixed seed so the evaluation is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

# Persist the pipeline for reuse (preprocessing and model are saved together).
# NOTE: joblib files are pickle-based; only load this artifact from a trusted location.
artifacts_dir = Path("data/artifacts_car")
artifacts_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, artifacts_dir / "pipeline.joblib")
print("Saved pipeline to", artifacts_dir / "pipeline.joblib")

              precision    recall  f1-score   support

         acc       0.77      0.79      0.78        77
        good       0.64      0.50      0.56        14
       unacc       0.96      0.96      0.96       242
       vgood       0.85      0.85      0.85        13

    accuracy                           0.90       346
   macro avg       0.80      0.78      0.79       346
weighted avg       0.90      0.90      0.90       346

Saved pipeline to data/artifacts_car/pipeline.joblib
