In [236]:
import numpy as np
import pandas as pd

In [238]:
# Read the cars93 CSV from the working directory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="cars93.csv")
df.head(5)

Unnamed: 0,Manufacturer,Model,Type,Price,MPG.city,AirBags,Horsepower,Passengers,Rear.seat.room,Luggage.room
0,Acura,Integra,Small,3259500,25,Driver only,140,5,26.5,11.0
1,Acura,Legend,Midsize,6949500,18,Driver & Passenger,200,5,30.0,15.0
2,Audi,90,Compact,5965500,20,Driver only,172,5,28.0,14.0
3,Audi,100,Midsize,7728500,19,Driver & Passenger,172,6,31.0,17.0
4,BMW,535i,Midsize,6150000,22,Driver only,208,4,27.0,13.0


In [239]:
# Report the size of the frame: row count first, then column count.
print("Total number of records: ", len(df))
print("Total number of columns: ", df.shape[1])

Total number of records:  93
Total number of columns:  10


### Checking for missing values

In [240]:
df.isna().sum(axis=0)  # number of missing values in each column

Manufacturer       0
Model              0
Type               0
Price              0
MPG.city           0
AirBags           14
Horsepower         0
Passengers         0
Rear.seat.room     2
Luggage.room      11
dtype: int64

### Imputing the missing values

#### 1. Luggage.room

In [186]:
# Mean of Luggage.room, shown for comparison with the median before imputing.
df.loc[:, "Luggage.room"].mean()

13.890243902439025

In [187]:
# Median of Luggage.room; this is the value used to fill the gaps in the next cell.
df.loc[:, "Luggage.room"].median()

14.0

In [188]:
# Fill the missing Luggage.room entries with the column median.
df['Luggage.room'] = df['Luggage.room'].fillna(df['Luggage.room'].median())

In [189]:
# Re-count missing values per column after imputing Luggage.room.
df.isna().sum(axis=0)

Manufacturer       0
Model              0
Type               0
Price              0
MPG.city           0
AirBags           14
Horsepower         0
Passengers         0
Rear.seat.room     2
Luggage.room       0
dtype: int64

#### 2. Rear.seat.room

In [190]:
# Mean of Rear.seat.room, shown for comparison with the median before imputing.
df.loc[:, "Rear.seat.room"].mean()

27.82967032967033

In [191]:
# Median of Rear.seat.room; this is the value used to fill the gaps in the next cell.
df.loc[:, "Rear.seat.room"].median()

27.5

In [192]:
# Fill the missing Rear.seat.room entries with the column median.
df['Rear.seat.room'] = df['Rear.seat.room'].fillna(df['Rear.seat.room'].median())

In [193]:
# Re-count missing values per column after imputing Rear.seat.room.
df.isna().sum(axis=0)

Manufacturer       0
Model              0
Type               0
Price              0
MPG.city           0
AirBags           14
Horsepower         0
Passengers         0
Rear.seat.room     0
Luggage.room       0
dtype: int64

#### 3. AirBags

In [194]:
# Most frequent AirBags value(s); mode() returns a Series because ties are possible.
df.loc[:, "AirBags"].mode()

0    Driver only
dtype: object

In [195]:
# Pull the first modal value out of the Series as a plain scalar.
df["AirBags"].mode().iloc[0]

'Driver only'

In [196]:
# Fill the missing AirBags entries with the most frequent category.
df['AirBags'] = df['AirBags'].fillna(df['AirBags'].mode().iloc[0])

In [197]:
# Final check: every column should now report zero missing values.
df.isna().sum(axis=0)

Manufacturer      0
Model             0
Type              0
Price             0
MPG.city          0
AirBags           0
Horsepower        0
Passengers        0
Rear.seat.room    0
Luggage.room      0
dtype: int64

### Assessing the linear relationship

In [198]:
# Pearson correlation between engine power and city fuel economy.
df.loc[:, ["Horsepower", "MPG.city"]].corr(method="pearson")

Unnamed: 0,Horsepower,MPG.city
Horsepower,1.0,-0.672636
MPG.city,-0.672636,1.0


In [199]:
# Pearson correlation matrix for price, city fuel economy and passenger capacity.
df.loc[:, ["Price", "MPG.city", "Passengers"]].corr(method="pearson")

Unnamed: 0,Price,MPG.city,Passengers
Price,1.0,-0.594562,0.05786
MPG.city,-0.594562,1.0,-0.416856
Passengers,0.05786,-0.416856,1.0


In [200]:
# Correlation matrix over the numeric columns only. The frame also holds string
# columns (Manufacturer, Model, Type, AirBags); selecting numeric dtypes
# explicitly avoids the ValueError that pandas >= 2.0 raises when corr() meets
# them, and yields the same matrix on older versions, which dropped them silently.
df.select_dtypes(include="number").corr()

Unnamed: 0,Price,MPG.city,Horsepower,Passengers,Rear.seat.room,Luggage.room
Price,1.0,-0.594562,0.788218,0.05786,0.297898,0.355827
MPG.city,-0.594562,1.0,-0.672636,-0.416856,-0.378071,-0.466621
Horsepower,0.788218,-0.672636,1.0,0.009264,0.230654,0.331115
Passengers,0.05786,-0.416856,0.009264,1.0,0.629517,0.423096
Rear.seat.room,0.297898,-0.378071,0.230654,0.629517,1.0,0.591162
Luggage.room,0.355827,-0.466621,0.331115,0.423096,0.591162,1.0


### Data Encoding

In [201]:
# List the column labels; the cells below count the distinct values of the string-valued ones.
df.columns

Index(['Manufacturer', 'Model', 'Type', 'Price', 'MPG.city', 'AirBags',
       'Horsepower', 'Passengers', 'Rear.seat.room', 'Luggage.room'],
      dtype='object')

In [202]:
# Number of distinct manufacturers.
# NOTE(review): the get_dummies column list recorded later in this notebook
# contains both 'Manufacturer_Chrylser' and 'Manufacturer_Chrysler', so this
# count appears to include a misspelled duplicate — confirm and normalise upstream.
len(pd.unique(df["Manufacturer"]))

32

In [203]:
# Number of distinct model names.
len(df["Model"].drop_duplicates())

93

In [204]:
# Number of distinct vehicle types (NaN, if present, is counted as its own value).
df["Type"].nunique(dropna=False)

6

In [205]:
# Number of distinct AirBags categories.
len(pd.unique(df["AirBags"]))

2

In [209]:
# Total number of indicator columns a full one-hot encoding of the four
# categorical features would produce. Computed from the data rather than from
# hand-copied counts, so it stays correct if the dataset or the cleaning changes.
sum(len(df[col].unique()) for col in ["Manufacturer", "Model", "Type", "AirBags"])

133

In [206]:
# Preview the first five rows before encoding.
df.head(5)

Unnamed: 0,Manufacturer,Model,Type,Price,MPG.city,AirBags,Horsepower,Passengers,Rear.seat.room,Luggage.room
0,Acura,Integra,Small,3259500,25,Driver only,140,5,26.5,11.0
1,Acura,Legend,Midsize,6949500,18,Driver & Passenger,200,5,30.0,15.0
2,Audi,90,Compact,5965500,20,Driver only,172,5,28.0,14.0
3,Audi,100,Midsize,7728500,19,Driver & Passenger,172,6,31.0,17.0
4,BMW,535i,Midsize,6150000,22,Driver only,208,4,27.0,13.0


#### 1. Using pd.get_dummies()

In [207]:
# One-hot encode every string column of the frame; numeric columns pass through unchanged.
pd.get_dummies(data=df)

Unnamed: 0,Price,MPG.city,Horsepower,Passengers,Rear.seat.room,Luggage.room,Manufacturer_Acura,Manufacturer_Audi,Manufacturer_BMW,Manufacturer_Buick,...,Model_Town_Car,Model_Vision,Type_Compact,Type_Large,Type_Midsize,Type_Small,Type_Sporty,Type_Van,AirBags_Driver & Passenger,AirBags_Driver only
0,3259500,25,140,5,26.5,11.0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,6949500,18,200,5,30.0,15.0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,5965500,20,172,5,28.0,14.0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
3,7728500,19,172,6,31.0,17.0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
4,6150000,22,208,4,27.0,13.0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,4038500,17,109,7,34.0,14.0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
89,4100000,21,134,5,31.5,14.0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
90,4776500,18,178,4,26.0,15.0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
91,4653500,21,114,5,29.5,14.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [208]:
# Names of all columns produced by the encoding, as a plain Python list.
list(pd.get_dummies(df).columns)

['Price',
 'MPG.city',
 'Horsepower',
 'Passengers',
 'Rear.seat.room',
 'Luggage.room',
 'Manufacturer_Acura',
 'Manufacturer_Audi',
 'Manufacturer_BMW',
 'Manufacturer_Buick',
 'Manufacturer_Cadillac',
 'Manufacturer_Chevrolet',
 'Manufacturer_Chrylser',
 'Manufacturer_Chrysler',
 'Manufacturer_Dodge',
 'Manufacturer_Eagle',
 'Manufacturer_Ford',
 'Manufacturer_Geo',
 'Manufacturer_Honda',
 'Manufacturer_Hyundai',
 'Manufacturer_Infiniti',
 'Manufacturer_Lexus',
 'Manufacturer_Lincoln',
 'Manufacturer_Mazda',
 'Manufacturer_Mercedes-Benz',
 'Manufacturer_Mercury',
 'Manufacturer_Mitsubishi',
 'Manufacturer_Nissan',
 'Manufacturer_Oldsmobile',
 'Manufacturer_Plymouth',
 'Manufacturer_Pontiac',
 'Manufacturer_Saab',
 'Manufacturer_Saturn',
 'Manufacturer_Subaru',
 'Manufacturer_Suzuki',
 'Manufacturer_Toyota',
 'Manufacturer_Volkswagen',
 'Manufacturer_Volvo',
 'Model_100',
 'Model_190E',
 'Model_240',
 'Model_300E',
 'Model_323',
 'Model_535i',
 'Model_626',
 'Model_850',
 'Model_90',

In [210]:
# Same encoding, but drop the first level of each categorical column.
pd.get_dummies(data=df, drop_first=True)

Unnamed: 0,Price,MPG.city,Horsepower,Passengers,Rear.seat.room,Luggage.room,Manufacturer_Audi,Manufacturer_BMW,Manufacturer_Buick,Manufacturer_Cadillac,...,Model_Tempo,Model_Tercel,Model_Town_Car,Model_Vision,Type_Large,Type_Midsize,Type_Small,Type_Sporty,Type_Van,AirBags_Driver only
0,3259500,25,140,5,26.5,11.0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,6949500,18,200,5,30.0,15.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,5965500,20,172,5,28.0,14.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,7728500,19,172,6,31.0,17.0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,6150000,22,208,4,27.0,13.0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,4038500,17,109,7,34.0,14.0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
89,4100000,21,134,5,31.5,14.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
90,4776500,18,178,4,26.0,15.0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
91,4653500,21,114,5,29.5,14.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


#### 2. Using sklearn's OneHotEncoder

In [211]:
from sklearn.preprocessing import OneHotEncoder

In [212]:
# Encoder with default settings; the recorded output of the next cell shows
# fit_transform returning a sparse matrix.
onehot = OneHotEncoder()

In [213]:
# Fit on the Type column (kept 2-D via a one-element column list) and encode it.
onehot.fit_transform(df.loc[:, ["Type"]])

<93x6 sparse matrix of type '<class 'numpy.float64'>'
	with 93 stored elements in Compressed Sparse Row format>

In [214]:
# Same encoding, converted from the sparse result to a dense NumPy array for display.
onehot.fit_transform(df.loc[:, ["Type"]]).toarray()

array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0.

In [215]:
# Category labels learned during fitting; their order matches the order of the
# encoded columns shown above.
onehot.categories_

[array(['Compact', 'Large', 'Midsize', 'Small', 'Sporty', 'Van'],
       dtype=object)]

In [216]:
# The raw Type column, for comparison with the encoded rows.
df.loc[:, "Type"]

0       Small
1     Midsize
2     Compact
3     Midsize
4     Midsize
       ...   
88        Van
89    Compact
90     Sporty
91    Compact
92    Midsize
Name: Type, Length: 93, dtype: object

In [217]:
# Dense one-hot matrix for Type wrapped in a DataFrame. No category names are
# attached, so the columns carry the positional labels 0..5.
encoded = pd.DataFrame(data=onehot.fit_transform(df.loc[:, ["Type"]]).toarray())
encoded.head(5)

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0


In [235]:
# Place the encoded columns to the right of the original frame (aligned on the index).
pd.concat([df, encoded], axis="columns")

Unnamed: 0,Manufacturer,Model,Type,Price,MPG.city,AirBags,Horsepower,Passengers,Rear.seat.room,Luggage.room,0,1,2,3,4,5
0,Acura,Integra,Small,3259500,25,Driver only,140,5,26.5,11.0,0.0,0.0,0.0,1.0,0.0,0.0
1,Acura,Legend,Midsize,6949500,18,Driver & Passenger,200,5,30.0,15.0,0.0,0.0,1.0,0.0,0.0,0.0
2,Audi,90,Compact,5965500,20,Driver only,172,5,28.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Audi,100,Midsize,7728500,19,Driver & Passenger,172,6,31.0,17.0,0.0,0.0,1.0,0.0,0.0,0.0
4,BMW,535i,Midsize,6150000,22,Driver only,208,4,27.0,13.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,Volkswagen,Eurovan,Van,4038500,17,Driver & Passenger,109,7,34.0,14.0,0.0,0.0,0.0,0.0,0.0,1.0
89,Volkswagen,Passat,Compact,4100000,21,Driver & Passenger,134,5,31.5,14.0,1.0,0.0,0.0,0.0,0.0,0.0
90,Volkswagen,Corrado,Sporty,4776500,18,Driver & Passenger,178,4,26.0,15.0,0.0,0.0,0.0,0.0,1.0,0.0
91,Volvo,240,Compact,4653500,21,Driver only,114,5,29.5,14.0,1.0,0.0,0.0,0.0,0.0,0.0


### Data Normalization

#### 1. Min-Max Scaler

In [219]:
from sklearn.preprocessing import MinMaxScaler

In [220]:
# Default-configured scaler; the recorded output below shows the transformed
# Price values spanning 0.0 to 1.0.
min_max = MinMaxScaler()

In [226]:
# Fit the scaler on Price and rescale it; the result is a one-column NumPy array.
price_normalized = min_max.fit_transform(df.loc[:, ["Price"]])
price_normalized

array([[0.1559633 ],
       [0.48623853],
       [0.39816514],
       [0.5559633 ],
       [0.4146789 ],
       [0.15229358],
       [0.24587156],
       [0.29908257],
       [0.34678899],
       [0.50091743],
       [0.6       ],
       [0.11009174],
       [0.0733945 ],
       [0.1412844 ],
       [0.1559633 ],
       [0.16330275],
       [0.16880734],
       [0.20917431],
       [0.56146789],
       [0.20183486],
       [0.15412844],
       [0.40550459],
       [0.03302752],
       [0.07155963],
       [0.10825688],
       [0.21284404],
       [0.15045872],
       [0.33761468],
       [0.08807339],
       [0.21834862],
       [0.        ],
       [0.04954128],
       [0.07155963],
       [0.1559633 ],
       [0.12110092],
       [0.2293578 ],
       [0.23486239],
       [0.24770642],
       [0.01834862],
       [0.09357798],
       [0.22752294],
       [0.08623853],
       [0.1853211 ],
       [0.01100917],
       [0.04770642],
       [0.04770642],
       [0.11926606],
       [0.743

In [227]:
# Confirm the bounds of the rescaled Price values.
print("Min value: ", np.min(price_normalized))
print("Max value: ", np.max(price_normalized))

Min value:  0.0
Max value:  1.0


In [225]:
# Refit the same scaler on two columns at once; each column is rescaled independently.
min_max.fit_transform(df.loc[:, ["Price", "MPG.city"]])

array([[0.1559633 , 0.32258065],
       [0.48623853, 0.09677419],
       [0.39816514, 0.16129032],
       [0.5559633 , 0.12903226],
       [0.4146789 , 0.22580645],
       [0.15229358, 0.22580645],
       [0.24587156, 0.12903226],
       [0.29908257, 0.03225806],
       [0.34678899, 0.12903226],
       [0.50091743, 0.03225806],
       [0.6       , 0.03225806],
       [0.11009174, 0.32258065],
       [0.0733945 , 0.32258065],
       [0.1412844 , 0.12903226],
       [0.1559633 , 0.19354839],
       [0.16330275, 0.09677419],
       [0.16880734, 0.        ],
       [0.20917431, 0.06451613],
       [0.56146789, 0.06451613],
       [0.20183486, 0.16129032],
       [0.15412844, 0.25806452],
       [0.40550459, 0.16129032],
       [0.03302752, 0.4516129 ],
       [0.07155963, 0.25806452],
       [0.10825688, 0.22580645],
       [0.21284404, 0.06451613],
       [0.15045872, 0.19354839],
       [0.33761468, 0.09677419],
       [0.08807339, 0.4516129 ],
       [0.21834862, 0.16129032],
       [0.

#### 2. Standard Scaler 

In [228]:
from sklearn.preprocessing import StandardScaler

In [229]:
# Default-configured StandardScaler instance, reused by the cells below.
std_scaler = StandardScaler()

In [230]:
# Fit the scaler on Price and standardize it; the result is a one-column NumPy array.
price_standardized = std_scaler.fit_transform(df.loc[:, ["Price"]])
price_standardized

array([[-3.75720139e-01],
       [ 1.49784409e+00],
       [ 9.98226964e-01],
       [ 1.89337432e+00],
       [ 1.09190518e+00],
       [-3.96537519e-01],
       [ 1.34305680e-01],
       [ 4.36157694e-01],
       [ 7.06783639e-01],
       [ 1.58111361e+00],
       [ 2.14318288e+00],
       [-6.35937393e-01],
       [-8.44111196e-01],
       [-4.58989660e-01],
       [-3.75720139e-01],
       [-3.34085378e-01],
       [-3.02859307e-01],
       [-7.38681238e-02],
       [ 1.92460039e+00],
       [-1.15502884e-01],
       [-3.86128829e-01],
       [ 1.03986172e+00],
       [-1.07310238e+00],
       [-8.54519886e-01],
       [-6.46346083e-01],
       [-5.30507434e-02],
       [-4.06946209e-01],
       [ 6.54740188e-01],
       [-7.60841675e-01],
       [-2.18246729e-02],
       [-1.26045880e+00],
       [-9.79424168e-01],
       [-8.54519886e-01],
       [-3.75720139e-01],
       [-5.73485252e-01],
       [ 4.06274681e-02],
       [ 7.18535386e-02],
       [ 1.44714370e-01],
       [-1.1

In [232]:
# Refit the same scaler on two columns at once and standardize both.
std_scaler.fit_transform(df.loc[:, ["Price", "MPG.city"]])

array([[-3.75720139e-01,  4.71312488e-01],
       [ 1.49784409e+00, -7.81032122e-01],
       [ 9.98226964e-01, -4.23219377e-01],
       [ 1.89337432e+00, -6.02125750e-01],
       [ 1.09190518e+00, -6.54066309e-02],
       [-3.96537519e-01, -6.54066309e-02],
       [ 1.34305680e-01, -6.02125750e-01],
       [ 4.36157694e-01, -1.13884487e+00],
       [ 7.06783639e-01, -6.02125750e-01],
       [ 1.58111361e+00, -1.13884487e+00],
       [ 2.14318288e+00, -1.13884487e+00],
       [-6.35937393e-01,  4.71312488e-01],
       [-8.44111196e-01,  4.71312488e-01],
       [-4.58989660e-01, -6.02125750e-01],
       [-3.75720139e-01, -2.44313004e-01],
       [-3.34085378e-01, -7.81032122e-01],
       [-3.02859307e-01, -1.31775124e+00],
       [-7.38681238e-02, -9.59938495e-01],
       [ 1.92460039e+00, -9.59938495e-01],
       [-1.15502884e-01, -4.23219377e-01],
       [-3.86128829e-01,  1.13499742e-01],
       [ 1.03986172e+00, -4.23219377e-01],
       [-1.07310238e+00,  1.18693798e+00],
       [-8.