In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# data cleaning using scikit-learn
from sklearn.impute import SimpleImputer # fixes missing values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder # fixes categorical data
from sklearn.preprocessing import StandardScaler, MinMaxScaler # fixes scaling

In [2]:
# Show every column when a DataFrame is displayed (affects display only, not the data).
pd.options.display.max_columns = None

In [3]:
# Read the automobile regression dataset; the first CSV column is used as the row index.
DATA_URL = 'https://raw.githubusercontent.com/digipodium/Datasets/main/regression/automobile.csv'
df = pd.read_csv(DATA_URL, index_col=0)

In [4]:
# Summarize the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

Imputation - drop the rows whose price is missing ('?'), then fill the remaining missing feature values

In [5]:
# Rows whose target is the '?' placeholder cannot be used, so remove them from df in place.
missing_price = df['price'] == '?'
df.drop(index=df.loc[missing_price].index, inplace=True)

In [6]:
# Split the frame into features (X) and target (y).
# X keeps every column except the target; this matches df.iloc[:, :-1] only when
# 'price' is the last column (positional slicing needs iloc, not loc).
X = df.drop(columns=['price'])
# 'price' holds text values (the raw data used '?' as a missing marker), so convert it
# to a numeric dtype; pd.to_numeric raises if any non-numeric value is still present.
y = pd.to_numeric(df['price'])

In [7]:
# Re-check the frame after the rows with a '?' price were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          201 non-null    int64  
 1   normalized-losses  201 non-null    object 
 2   make               201 non-null    object 
 3   fuel-type          201 non-null    object 
 4   aspiration         201 non-null    object 
 5   num-of-doors       201 non-null    object 
 6   body-style         201 non-null    object 
 7   drive-wheels       201 non-null    object 
 8   engine-location    201 non-null    object 
 9   wheel-base         201 non-null    float64
 10  length             201 non-null    float64
 11  width              201 non-null    float64
 12  height             201 non-null    float64
 13  curb-weight        201 non-null    int64  
 14  engine-type        201 non-null    object 
 15  num-of-cylinders   201 non-null    object 
 16  engine-size        201 non

In [8]:
# Names of the columns treated as numeric features.
numerical_x = [
    'normalized-losses',
    'wheel-base', 'length', 'width', 'height',
    'curb-weight', 'engine-size', 'compression-ratio',
    'city-mpg', 'highway-mpg',
    'bore', 'stroke', 'horsepower', 'peak-rpm',
]
# Names of the columns treated as categorical features; later cells index the imputed
# array by position, so the order of this list matters.
# NOTE(review): 'drive-wheels' is listed by df.info() but appears in neither list - confirm
# whether leaving it out is intentional.
categorical_x = [
    'symboling', 'make',
    'fuel-type', 'aspiration', 'num-of-doors',
    'body-style', 'engine-location',
    'engine-type', 'num-of-cylinders', 'fuel-system',
]

In [9]:
# Take independent copies of the two feature groups so later cleaning does not touch df.
# list(...) yields the column names whether the variable still holds the list of names
# or already holds the DataFrame from a previous run, so this cell can be re-run safely.
numerical_x = df[list(numerical_x)].copy()
categorical_x = df[list(categorical_x)].copy()

In [10]:
# Turn every occurrence of the '?' placeholder into NaN, in place, in both feature frames.
for features in (numerical_x, categorical_x):
    features.replace('?', np.nan, inplace=True)

In [11]:
# Fill missing numeric values with the column mean (SimpleImputer's default strategy).
si = SimpleImputer(strategy='mean')
si.fit(numerical_x)
numerical_x_2 = si.transform(numerical_x)
print(numerical_x_2.shape)

(201, 14)


In [12]:
# Fill missing categorical values with each column's most frequent value.
si2 = SimpleImputer(strategy='most_frequent')
si2.fit(categorical_x)
categorical_x_2 = si2.transform(categorical_x)
print(categorical_x_2.shape)

(201, 10)


In [13]:
# numerical data scaling/normalization
# Integer-cast preview of the imputed values (display only; numerical_x_2 is not modified).
np.asarray(numerical_x_2, dtype=int)

array([[ 122,   88,  168, ...,    2,  111, 5000],
       [ 122,   88,  168, ...,    2,  111, 5000],
       [ 122,   94,  171, ...,    3,  154, 5000],
       ...,
       [  95,  109,  188, ...,    2,  134, 5500],
       [  95,  109,  188, ...,    3,  106, 4800],
       [  95,  109,  188, ...,    3,  114, 5400]])

In [14]:
# Rescale every numeric feature with MinMaxScaler (default feature_range is 0 to 1).
scaler = MinMaxScaler()
# Store the result under a distinct, correctly spelled name so it cannot be confused
# with the unscaled numerical_x_2.
numerical_x_scaled = scaler.fit_transform(numerical_x_2)
# Backward-compatible alias: this cell originally stored the result under this misspelled name.
numrical_x_2 = numerical_x_scaled
print(numerical_x_scaled)

[[0.29842932 0.05830904 0.41343284 ... 0.29047619 0.29439252 0.34693878]
 [0.29842932 0.05830904 0.41343284 ... 0.29047619 0.29439252 0.34693878]
 [0.29842932 0.2303207  0.44925373 ... 0.66666667 0.4953271  0.34693878]
 ...
 [0.15706806 0.65597668 0.7119403  ... 0.38095238 0.40186916 0.55102041]
 [0.15706806 0.65597668 0.7119403  ... 0.63333333 0.27102804 0.26530612]
 [0.15706806 0.65597668 0.7119403  ... 0.51428571 0.30841121 0.51020408]]


In [15]:
# Inspect the categorical features after most-frequent imputation (a NumPy object array).
categorical_x_2

array([[3, 'alfa-romero', 'gas', ..., 'dohc', 'four', 'mpfi'],
       [3, 'alfa-romero', 'gas', ..., 'dohc', 'four', 'mpfi'],
       [1, 'alfa-romero', 'gas', ..., 'ohcv', 'six', 'mpfi'],
       ...,
       [-1, 'volvo', 'gas', ..., 'ohcv', 'six', 'mpfi'],
       [-1, 'volvo', 'diesel', ..., 'ohc', 'six', 'idi'],
       [-1, 'volvo', 'gas', ..., 'ohc', 'four', 'mpfi']], dtype=object)

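Encoding
- if a column has only 2 unique values
    - OrdinalEncoder - for one or more feature columns (expects 2-D input, e.g. several columns of a dataframe)
    - LabelEncoder - for a single 1-D column; scikit-learn intends it for the target `y` rather than for features
- if a column has more than 2 unique values (with no natural order)
    - OneHotEncoder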

In [16]:
# Print the number of distinct (non-missing) values in each categorical column.
for col, n_unique in categorical_x.nunique().items():
    print(col, n_unique)

symboling 6
make 22
fuel-type 2
aspiration 2
num-of-doors 2
body-style 5
engine-location 2
engine-type 6
num-of-cylinders 7
fuel-system 8


In [17]:
# Positions of the two-valued columns inside categorical_x_2:
# 2 = fuel-type, 3 = aspiration, 4 = num-of-doors, 6 = engine-location
binary_cols = [2, 3, 4, 6]
oenc = OrdinalEncoder()
cat_xo = oenc.fit_transform(categorical_x_2[:, binary_cols])
print(cat_xo)

[[1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 1. 0. 0.]
 [1. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [1. 1. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 1. 1. 0.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 1.

In [18]:
# Column 5 is body-style; slice it as a 2-D (n_rows, 1) array for inspection.
categorical_x_2[:, 5:6]

array([['convertible'],
       ['convertible'],
       ['hatchback'],
       ['sedan'],
       ['sedan'],
       ['sedan'],
       ['sedan'],
       ['wagon'],
       ['sedan'],
       ['sedan'],
       ['sedan'],
       ['sedan'],
       ['sedan'],
       ['sedan'],
       ['sedan'],
       ['sedan'],
       ['sedan'],
       ['hatchback'],
       ['hatchback'],
       ['sedan'],
       ['hatchback'],
       ['hatchback'],
       ['hatchback'],
       ['hatchback'],
       ['sedan'],
       ['sedan'],
       ['sedan'],
       ['wagon'],
       ['hatchback'],
       ['hatchback'],
       ['hatchback'],
       ['hatchback'],
       ['hatchback'],
       ['hatchback'],
       ['sedan'],
       ['wagon'],
       ['hatchback'],
       ['hatchback'],
       ['sedan'],
       ['sedan'],
       ['sedan'],
       ['sedan'],
       ['sedan'],
       ['hatchback'],
       ['sedan'],
       ['sedan'],
       ['sedan'],
       ['hatchback'],
       ['hatchback'],
       ['hatchback'],
       ['sed

In [19]:
# One-hot encode body-style (column 5); drop='first' omits the first category's column.
body_style = categorical_x_2[:, [5]]
bstyle_enc = OneHotEncoder(drop='first')
bstyle_enc.fit(body_style)
# transform returns a sparse matrix here; .toarray() converts it to a dense NumPy array.
dummy_body_style = bstyle_enc.transform(body_style).toarray()

In [20]:
# Inspect the one-hot encoded body-style columns (4 columns, since the first of 5 categories is dropped).
dummy_body_style

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],


In [21]:
# Categories learned by the encoder; with drop='first', the first one listed has no dummy column.
bstyle_enc.categories_

[array(['convertible', 'hardtop', 'hatchback', 'sedan', 'wagon'],
       dtype=object)]