## Code

In [1]:
import numpy as np
import pandas as pd
import polars as pl

# Fix the seed of NumPy's global random number generator for reproducibility.
np.random.seed(11)

# Load the red-wine quality data; "?" and "NaN" cells are read as nulls.
df = pl.read_csv(
    'winequality-red.csv',
    null_values=["?", "NaN"],
)

# Preview the first rows to check column names and dtypes.
df.head()

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64
7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [2]:
# Print the number of missing values in each column, with the column names
# right-aligned to the width of the longest name.
name_width = max(map(len, df.columns))
for series in df.get_columns():
    null_count = series.is_null().sum()
    print(f"{series.name:>{name_width}} - {null_count}")

       fixed acidity - 0
    volatile acidity - 0
         citric acid - 0
      residual sugar - 0
           chlorides - 0
 free sulfur dioxide - 0
total sulfur dioxide - 0
             density - 0
                  pH - 0
           sulphates - 0
             alcohol - 0
             quality - 0


### Split data

In [3]:
from sklearn.model_selection import train_test_split

target_column = "quality"

# Rows without a target value cannot be used for supervised learning.
df = df.drop_nulls(subset=target_column)

# Show (rows, columns) to get an idea of a sensible train/test ratio.
print(df.shape)

# scikit-learn works on pandas objects here, so convert from polars.
X = df.drop(target_column).to_pandas()
y = df[target_column].to_pandas()

# Pass an explicit random_state: without it the split is drawn from NumPy's
# global RNG, so re-running this cell (or adding any earlier random call)
# would silently change which rows end up in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

(1599, 12)


### Pipeline processor

In [4]:
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import (
    StandardScaler
)
from category_encoders.cat_boost import CatBoostEncoder

# def combine_dimensions(x):
#     x["dimensions"] = x["length"] * x["width"] * x["height"]
#     x = x.drop(["length", "width", "height"])
#
#
# feature_engineering = Pipeline(
#     steps=[
#         ("combine_dimensions", FunctionTransformer(combine_dimensions)),
#     ]
# )

# Show estimators as diagrams when they are displayed in the notebook.
set_config(display="diagram")

# Optional dimensionality-reduction stage (PCA and truncated SVD side by side).
# It is defined here but not wired into any pipeline; see the commented-out
# 'reduce_dimensionality' step in num_pipe below.
union = FeatureUnion(
    transformer_list=[
        ("pca", PCA(n_components=1)),
        ("svd", TruncatedSVD(n_components=2)),
    ]
)

# Split the feature columns into numeric and categorical (everything else).
# cat_cols is built with an order-preserving comprehension rather than a set
# difference: the iteration order of a set of strings can change between
# interpreter runs, which would make the column order of the transformed
# output non-reproducible.
num_cols = X.select_dtypes(include="number").columns
cat_cols = [col for col in X.columns if col not in num_cols]

# Numeric branch: fill missing values from nearest neighbours, then standardise.
num_pipe = Pipeline(
    [
        ("Impute", KNNImputer()),
        ("Scale", StandardScaler()),
        # ('reduce_dimensionality', union)
    ]
)

# Categorical branch: encode to numbers first so that KNNImputer, which only
# works on numeric data, can fill the remaining gaps.
cat_pipe = Pipeline(
    [
        ('encode', CatBoostEncoder(cols=cat_cols)),
        ("Impute", KNNImputer()),
    ]
)

# Route each group of columns through its own branch. The output columns are
# ordered categorical first, then numeric; any column in neither group is
# passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", cat_pipe, cat_cols),
        ("numeric", num_pipe, num_cols),
    ],
    remainder="passthrough",
    verbose_feature_names_out=True,
)

# Top-level pipeline; currently it only wraps the preprocessing step.
processor = Pipeline(
    [
        ("preprocessing", preprocessor),
        # ("feature_engineering", feature_engineering),
    ]
)

In [5]:
# Display the assembled pipeline; set_config(display="diagram") makes it render as a diagram.
processor

In [6]:
# Fit on the training split only, so no statistics from the test split leak
# into the imputer or scaler. y_train is passed along because the categorical
# branch uses a target-based encoder (CatBoostEncoder), which needs the target
# during fit; the numeric steps simply ignore it.
processor.fit(X_train, y_train)

# ColumnTransformer emits the categorical block first and the numeric block
# second, which is not necessarily the column order of the input frame.
transformed_columns = list(cat_cols) + list(num_cols)

# Keep the original row index: transform() returns a bare array, and without
# index= the frames would get a fresh 0..n-1 index that no longer lines up
# with y_train / y_test (and would contain duplicates once train and test are
# concatenated).
X_train_transformed = pd.DataFrame(
    processor.transform(X_train),
    columns=transformed_columns,
    index=X_train.index,
)
X_test_transformed = pd.DataFrame(
    processor.transform(X_test),
    columns=transformed_columns,
    index=X_test.index,
)

In [7]:
# Write the processed data to pickle files: the full feature/target sets
# (train followed by test) and each individual split.
exports = {
    'wine_X.pkl': pd.concat([X_train_transformed, X_test_transformed]),
    'wine_y.pkl': pd.concat([y_train, y_test]),
    'wine_X_train.pkl': X_train_transformed,
    'wine_X_test.pkl': X_test_transformed,
    'wine_y_train.pkl': y_train,
    'wine_y_test.pkl': y_test,
}
for filename, data in exports.items():
    data.to_pickle(filename)