In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# data cleaning using scikit-learn
from sklearn.impute import SimpleImputer # fixes missing values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder # fixes categorical data
from sklearn.preprocessing import StandardScaler, MinMaxScaler # fixes scaling
# Display-only setting: show every column when a DataFrame is rendered.
pd.options.display.max_columns = None

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [9]:
# Source CSV for the automobile regression dataset.
AUTOMOBILE_CSV_URL = 'https://raw.githubusercontent.com/digipodium/Datasets/main/regression/automobile.csv'
# index_col=0: use the first CSV column as the row index instead of a feature.
df = pd.read_csv(AUTOMOBILE_CSV_URL, index_col=0)

In [10]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [11]:
# '?' is this dataset's missing-value marker; turn it into NaN so the imputers
# in the preprocessing pipeline can detect and fill the gaps.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run.
df = df.replace('?', np.nan)

# If any price was '?', pandas read the whole column as strings. Cast it back
# to numbers so the regression target is numeric (no-op when already numeric).
df['price'] = pd.to_numeric(df['price'])

# Rows without a target value cannot be used for training.
df = df.dropna(subset=['price'])

# Features and target.
X = df.drop('price', axis=1)
y = df['price']


In [14]:
# Feature names grouped by the preprocessing each group receives.

# Numeric measurements.
numerical_x = [
    'normalized-losses', 'wheel-base', 'length', 'width', 'height',
    'curb-weight', 'engine-size', 'compression-ratio', 'city-mpg',
    'highway-mpg', 'bore', 'stroke', 'horsepower', 'peak-rpm',
]

# Categorical features handled as multi-level categories.
categorical_x = [
    'symboling', 'make', 'body-style',
    'engine-type', 'num-of-cylinders', 'fuel-system',
]

# Categorical features treated as binary.
binary_cat_x = [
    'fuel-type', 'aspiration', 'num-of-doors', 'engine-location',
]

In [20]:
# Numeric features: fill gaps with the column mean (SimpleImputer's default
# strategy, spelled out here), then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Multi-level categorical features: fill gaps with the most frequent value,
# then one-hot encode, dropping the first level of each feature.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(drop="first")),
])

# Binary categorical features: fill gaps with the most frequent value, then
# map each category to an integer code in a single column.
binary_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("label", OrdinalEncoder()),
])

# Route each column group to its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", numeric_transformer, numerical_x),
        ("categorical", categorical_transformer, categorical_x),
        ("binary", binary_transformer, binary_cat_x),
    ]
)


In [21]:
# Evaluate the bare name so the notebook renders the transformer's repr.
preprocessor

In [24]:
# Fit every sub-pipeline on X and transform X in the same call.
opt_X = preprocessor.fit_transform(X)
# (rows, output columns after scaling and encoding)
opt_X.shape

(201, 66)

#### Exercise: go to Kaggle, download the Titanic dataset, and create a pipeline to preprocess the data