In [1]:
%load_ext cudf.pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Load the extended car-sales dataset (the variant that contains missing values).
car_sales_missing = pd.read_csv(
    filepath_or_buffer="/home/rapids/workspace/ML_Course/ztm-ml/data/car-sales-extended-missing-data.csv"
)

In [3]:
# Count the missing entries in each column before any cleaning.
car_sales_missing.isnull().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [4]:
# Rows without a "Price" have no target to learn from, so drop them.
# Rebinding the name (instead of `inplace=True`) avoids mutating the loaded
# frame in place.
car_sales_missing = car_sales_missing.dropna(subset=["Price"])

# Re-count the missing values: "Price" should now report 0.
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [5]:
# Separate the target column ("Price") from the feature columns.
y = car_sales_missing["Price"]
x = car_sales_missing.drop(columns="Price")

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)

In [7]:
# Impute missing feature values with scikit-learn
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Column groups, one per imputation strategy
cat_features = ["Make", "Colour"]
door_feature = ["Doors"]
num_features = ["Odometer (KM)"]

# Categorical gaps become the literal "missing", door counts default to 4,
# and the remaining numeric column is filled with its mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Route each column group to its own imputer. The transformed output follows
# the order of the entries below: categorical, doors, numeric.
imputer = ColumnTransformer(
    transformers=[
        ("cat_imputer", cat_imputer, cat_features),
        ("door_imputer", door_imputer, door_feature),
        ("num_imputer", num_imputer, num_features),
    ]
)

In [8]:
# Learn the fill values from the training split only, then apply them to
# both splits so no statistics from the test split are used for fitting.
filled_x_train = imputer.fit(x_train).transform(x_train)
filled_x_test = imputer.transform(x_test)

In [9]:
# The imputer emits its columns in the order of its transformer list
# (categorical, doors, numeric), so build the labels from the same feature
# lists instead of repeating string literals that could drift out of sync.
imputed_columns = cat_features + door_feature + num_features

# The imputer returns one combined array holding both strings and numbers,
# so pandas cannot infer numeric dtypes from it; restore them explicitly.
numeric_dtypes = {column: "float64" for column in door_feature + num_features}

filled_x_train_df = pd.DataFrame(filled_x_train, columns=imputed_columns).astype(numeric_dtypes)
filled_x_test_df = pd.DataFrame(filled_x_test, columns=imputed_columns).astype(numeric_dtypes)

In [16]:
# Inspect the imputed training features (long frames are shown truncated).
filled_x_train_df

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4.0,71934.0
1,Toyota,Red,4.0,162665.0
2,Honda,White,4.0,42844.0
3,Honda,White,4.0,195829.0
4,Honda,Blue,4.0,219217.0
...,...,...,...,...
755,Toyota,,4.0,218803.0
756,BMW,Blue,5.0,245427.0
757,Toyota,White,4.0,196225.0
758,Honda,Blue,4.0,133117.0


In [15]:
# Import OneHotEncoder class from sklearn
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns; "Doors" is encoded as a category
# here rather than used as a continuous number.
categorical_features = ["Make", "Colour", "Doors"]

# handle_unknown="ignore" encodes a category that was not seen during `fit`
# as all zeros instead of raising when the test split is transformed.
one_hot = OneHotEncoder(handle_unknown="ignore")
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough",  # keep the columns that are not encoded
)

# Fit the encoder on the training split only, then reuse it for the test split
transformed_X_train = transformer.fit_transform(filled_x_train_df)
transformed_X_test = transformer.transform(filled_x_test_df)

# Check the encoded training features
transformed_X_train.toarray()

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 7.19340e+04],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.62665e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 4.28440e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.96225e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.33117e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.50582e+05]], shape=(760, 15))

In [None]:
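# NOTE: alternative encoding with pandas, kept for reference only (not used below).
# Calling pd.get_dummies separately on the train and test frames can produce
# mismatched columns when a category appears in only one of the splits.
# categorical_features = ["Make", "Colour", "Doors"]
# x_train_dummies = pd.get_dummies(filled_x_train_df,columns=categorical_features, drop_first=True)
# x_test_dummies = pd.get_dummies(filled_x_test_df,columns=categorical_features, drop_first=True)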

In [28]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so its scores are reproducible across runs, in line with
# the seeded train/test split.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(transformed_X_train, y_train)

# Score on the training data (compare with the test score to spot overfitting)
model.score(transformed_X_train, y_train)

0.8835225846913191

In [29]:
# Score the fitted model on the held-out test split, for comparison with the
# training score.
# NOTE(review): `model` is whichever estimator was fitted most recently, so
# this result depends on cell execution order.
model.score(transformed_X_test, y_test)

0.2071953056046988

In [35]:
# Linear regression on the same encoded features
from sklearn.linear_model import LinearRegression

# `fit` returns the estimator itself, so construct and train in one expression.
# This rebinds `model`, replacing the previously fitted estimator.
model = LinearRegression().fit(transformed_X_train, y_train)

# Score on the training data
model.score(transformed_X_train, y_train)

0.34488358946844133

In [None]:
# Score the fitted model on the held-out test split, for comparison with the
# training score.
# NOTE(review): `model` is whichever estimator was fitted most recently, so
# this result depends on cell execution order.
model.score(transformed_X_test, y_test)

0.2567925995014665