In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [28]:
car_sales = pd.read_csv("https://raw.githubusercontent.com/CongLiu-CN/zero-to-mastery-ml/master/data/car-sales-extended-missing-data.csv")

## How many rows are there total?

In [29]:
car_sales.count(axis=0) # Zaehlt NaN nicht!!
car_sales.shape[0]

1000

## What datatypes are in each column?

In [30]:
car_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           951 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


In [31]:
car_sales.count(axis=1)

0      5
1      5
2      5
3      5
4      5
      ..
995    5
996    4
997    5
998    5
999    5
Length: 1000, dtype: int64

### length is 1000 (0 - 999)

In [32]:
car_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           951 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


## How many missing values are in each column?

In [33]:
car_sales.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

## Are there any categorial data?

In [34]:
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


## Remove NaNs in numerical Data ('Odometer (KM)', 'Doors', 'Price')

In [35]:
from sklearn import impute

imputer = impute.SimpleImputer()

X2 = car_sales[['Odometer (KM)', 'Doors', 'Price']].values
imputer.fit(X2)
X2_imp = imputer.transform(X2)

X2_imp;

In [36]:
df = pd.DataFrame(X2_imp, columns = ['Odometer (KM) imputed', 'Doors imputed', 'Price imputed'])

In [37]:
df.head()

Unnamed: 0,Odometer (KM) imputed,Doors imputed,Price imputed
0,35431.0,4.0,15323.0
1,192714.0,5.0,19943.0
2,84714.0,4.0,28343.0
3,154365.0,4.0,13434.0
4,181577.0,3.0,14043.0


In [38]:
car_sales.dropna(subset=["Price"], inplace=True)

In [39]:
df3 = pd.concat([car_sales, df], axis=1)

In [40]:
df3

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price,Odometer (KM) imputed,Doors imputed,Price imputed
0,Honda,White,35431.0,4.0,15323.0,35431.0,4.0,15323.000000
1,BMW,Blue,192714.0,5.0,19943.0,192714.0,5.0,19943.000000
2,Honda,White,84714.0,4.0,28343.0,84714.0,4.0,28343.000000
3,Toyota,White,154365.0,4.0,13434.0,154365.0,4.0,13434.000000
4,Nissan,Blue,181577.0,3.0,14043.0,181577.0,3.0,14043.000000
...,...,...,...,...,...,...,...,...
940,,,,,,53474.0,4.0,16042.814737
947,,,,,,17609.0,4.0,16042.814737
952,,,,,,136022.0,4.0,16042.814737
959,,,,,,186786.0,4.0,16042.814737


In [41]:
df3.isna().sum()

Make                     97
Colour                   96
Odometer (KM)            98
Doors                    97
Price                    50
Odometer (KM) imputed     0
Doors imputed             0
Price imputed             0
dtype: int64

In [42]:
df3.Colour.bfill()

0      White
1       Blue
2      White
3      White
4       Blue
       ...  
940      NaN
947      NaN
952      NaN
959      NaN
968      NaN
Name: Colour, Length: 1000, dtype: object

In [43]:
df3.isna().sum()

Make                     97
Colour                   96
Odometer (KM)            98
Doors                    97
Price                    50
Odometer (KM) imputed     0
Doors imputed             0
Price imputed             0
dtype: int64

In [44]:
categorical_features = ["Make", "Colour"]


categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))])

In [45]:
# Define Doors features
door_feature = ["Doors"]

# Create Doors transformer Pipeline
door_transformer = Pipeline(steps=[
    # Set SimpleImputer strategy to "constant" and fill value to 4
    ("imputer", SimpleImputer(strategy="constant", fill_value=4))])

In [46]:
# Define numeric features (only the Odometer (KM) column)
numeric_features = ["Odometer (KM)"]

# Crearte numeric transformer Pipeline
numeric_transformer = Pipeline(steps=[
    # Set SimpleImputer strategy to fill missing values with the "Median"
    ("imputer", SimpleImputer(strategy="median"))])

In [47]:
# Setup preprocessing steps (fill missing values, then convert to numbers)
preprocessor = ColumnTransformer(
    transformers=[
        # Use the categorical_transformer to transform the categorical_features
        ("cat", categorical_transformer, categorical_features),
        # Use the door_transformer to transform the door_feature
        ("door", door_transformer, door_feature),
        # Use the numeric_transformer to transform the numeric_features
        ("num", numeric_transformer, numeric_features)])

In [48]:
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [49]:
regression_models = {"Ridge": Ridge(),
                     "SVR_linear": SVR(kernel="linear"),
                     "SVR_rbf": SVR(kernel="rbf"),
                     "RandomForestRegressor": RandomForestRegressor()}

# Create an empty dictionary for the regression results
regression_results = {}

In [50]:
car_sales_X = car_sales.drop("Price", axis=1)

car_sales_y = car_sales["Price"]

In [None]:
car_X_train, car_X_test, car_y_train, car_y_test = train_test_split(car_sales_X,
                                                                    car_sales_y,
                                                                    test_size=0.2,
                                                                    random_state=42)


car_X_train.shape, car_X_test.shape, car_y_train.shape, car_y_test.shape

In [None]:
for model_name, model in regression_models.items():
    
    # Create a model pipeline with a preprocessor step and model step
    model_pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                                      ("model", model)])
    
    # Fit the model pipeline to the car sales training data
    print(f"Fitting {model_name}...")
    model_pipeline.fit(car_X_train, car_y_train)
    
    # Score the model pipeline on the test data appending the model_name to the 
    # results dictionary
    print(f"Scoring {model_name}...")
    regression_results[model_name] = model_pipeline.score(car_X_test, 
                                                          car_y_test)

In [53]:
regression_results

{'Ridge': 0.254026110579439,
 'SVR_linear': -0.489452821008145,
 'SVR_rbf': 0.0018546241516633755,
 'RandomForestRegressor': 0.20809940879840194}

In [96]:
df3.Make.unique()

array(['Honda', 'BMW', 'Toyota', 'Nissan', nan], dtype=object)