In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('car_data.csv')

df.head()

Unnamed: 0,Car ID,Symboling,Car Name,Fuel Type,Aspiration,Door Number,Car Body,Drive Wheel,Engine Location,Wheel Base,...,Engine Size,Fuel System,Bore Ratio,Stroke,Compression Ratio,Horse Power,Peak RPM,City MPG,Highway MPG,Price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5K,21,27,13495
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5K,21,27,16500
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5K,19,26,16500
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [3]:
df.columns

Index(['Car ID', 'Symboling', 'Car Name', 'Fuel Type', 'Aspiration',
       'Door Number', 'Car Body', 'Drive Wheel', 'Engine Location',
       'Wheel Base', 'Car Length', 'Car Width', 'Car Height', 'Curb Weight',
       'Engine Type', 'Cylinder Number', 'Engine Size', 'Fuel System',
       'Bore Ratio', 'Stroke', 'Compression Ratio', 'Horse Power', 'Peak RPM',
       'City MPG', 'Highway MPG', 'Price'],
      dtype='object')

In [4]:
# Issues with the dataframe?
# Column Names
# In this case study we will focus on Car Name, Door Number and Peak RPM.

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Car ID             205 non-null    int64  
 1   Symboling          205 non-null    int64  
 2   Car Name           205 non-null    object 
 3   Fuel Type          205 non-null    object 
 4   Aspiration         205 non-null    object 
 5   Door Number        205 non-null    object 
 6   Car Body           205 non-null    object 
 7   Drive Wheel        205 non-null    object 
 8   Engine Location    205 non-null    object 
 9   Wheel Base         205 non-null    float64
 10  Car Length         205 non-null    float64
 11  Car Width          205 non-null    float64
 12  Car Height         205 non-null    float64
 13  Curb Weight        205 non-null    int64  
 14  Engine Type        205 non-null    object 
 15  Cylinder Number    205 non-null    object 
 16  Engine Size        205 non

### Fixing Column Names

In [6]:
col_names = [ col.lower().replace(' ', '_') for col in df.columns ]

col_names

['car_id',
 'symboling',
 'car_name',
 'fuel_type',
 'aspiration',
 'door_number',
 'car_body',
 'drive_wheel',
 'engine_location',
 'wheel_base',
 'car_length',
 'car_width',
 'car_height',
 'curb_weight',
 'engine_type',
 'cylinder_number',
 'engine_size',
 'fuel_system',
 'bore_ratio',
 'stroke',
 'compression_ratio',
 'horse_power',
 'peak_rpm',
 'city_mpg',
 'highway_mpg',
 'price']

In [7]:
df.columns = col_names

In [8]:
df.head()

Unnamed: 0,car_id,symboling,car_name,fuel_type,aspiration,door_number,car_body,drive_wheel,engine_location,wheel_base,...,engine_size,fuel_system,bore_ratio,stroke,compression_ratio,horse_power,peak_rpm,city_mpg,highway_mpg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5K,21,27,13495
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5K,21,27,16500
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5K,19,26,16500
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   car_id             205 non-null    int64  
 1   symboling          205 non-null    int64  
 2   car_name           205 non-null    object 
 3   fuel_type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   door_number        205 non-null    object 
 6   car_body           205 non-null    object 
 7   drive_wheel        205 non-null    object 
 8   engine_location    205 non-null    object 
 9   wheel_base         205 non-null    float64
 10  car_length         205 non-null    float64
 11  car_width          205 non-null    float64
 12  car_height         205 non-null    float64
 13  curb_weight        205 non-null    int64  
 14  engine_type        205 non-null    object 
 15  cylinder_number    205 non-null    object 
 16  engine_size        205 non

## As a part of this case study, we will focus only on `car_name`, `door_number` and `peak_rpm`

From df.info(), it is clear that `car_name`, `door_number` and `peak_rpm` are object type without any missing values

In [10]:
df.car_name.value_counts()

toyota corona           6
toyota corolla          6
peugeot 504             6
subaru dl               4
mitsubishi mirage g4    3
                       ..
mazda glc 4             1
mazda rx2 coupe         1
maxda glc deluxe        1
maxda rx3               1
volvo 246               1
Name: car_name, Length: 147, dtype: int64

In [11]:
df.door_number.value_counts()

four    114
two      90
?         1
Name: door_number, dtype: int64

In [12]:
df.peak_rpm.value_counts()

5500     36
4800     36
5K       27
5200     21
5400     10
6K        9
4500      7
5250      7
5800      7
5100      5
4150      5
4200      5
4750      4
4350      4
5,400     3
4250      3
5900      3
4400      3
5,200     2
6600      2
4900      1
5,500     1
5750      1
5600      1
4650      1
5300      1
Name: peak_rpm, dtype: int64

**Observations**

- `car_name` is a categorical column with 147 categories of cars
- It looks like `door_number` is containing one missing value 
- `peak_rpm` looks like an integer column with some abnormalities.

## Fixing `door_number` column

**Posible Fixes**
- Handle the missing value
- Possibility to conver to integer column

#### Crosstab - It compares the index and columns which yields the count 

In [13]:
df.columns

Index(['car_id', 'symboling', 'car_name', 'fuel_type', 'aspiration',
       'door_number', 'car_body', 'drive_wheel', 'engine_location',
       'wheel_base', 'car_length', 'car_width', 'car_height', 'curb_weight',
       'engine_type', 'cylinder_number', 'engine_size', 'fuel_system',
       'bore_ratio', 'stroke', 'compression_ratio', 'horse_power', 'peak_rpm',
       'city_mpg', 'highway_mpg', 'price'],
      dtype='object')

In [15]:
# pd.crosstab(index=df['door_number'], columns=df['car_body'])

pd.crosstab(df['door_number'], df['car_body'])

car_body,convertible,hardtop,hatchback,sedan,wagon
door_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
?,0,0,0,0,1
four,0,0,10,80,24
two,6,8,60,16,0


In [14]:
pd.crosstab(df['door_number'], df['fuel_type'])

fuel_type,diesel,gas
door_number,Unnamed: 1_level_1,Unnamed: 2_level_1
?,0,1
four,17,97
two,3,87


In [17]:
pd.crosstab(df['door_number'], df['drive_wheel'])

drive_wheel,4wd,fwd,rwd
door_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
?,0,1,0
four,7,68,39
two,2,51,37


In [16]:
pd.crosstab(df['door_number'], df['engine_location'])

engine_location,front,rear
door_number,Unnamed: 1_level_1,Unnamed: 2_level_1
?,1,0
four,114,0
two,87,3


#### Replace

In [18]:
# df.replace(to_replace='old', value='new', regex=False, inplace=True)
# df.replace(to_replace=r'regex', value='new', regex=True, inplace=True)

df['door_number'].replace(to_replace='?', value='four', regex=False, inplace=True)

df.door_number.value_counts()

four    115
two      90
Name: door_number, dtype: int64

In [19]:
df.head()

Unnamed: 0,car_id,symboling,car_name,fuel_type,aspiration,door_number,car_body,drive_wheel,engine_location,wheel_base,...,engine_size,fuel_system,bore_ratio,stroke,compression_ratio,horse_power,peak_rpm,city_mpg,highway_mpg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5K,21,27,13495
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5K,21,27,16500
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5K,19,26,16500
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [20]:
# df['door_number'].replace(to_replace='four', value='4', regex=False, inplace=True)
# df['door_number'].replace(to_replace='two', value='2', regex=False, inplace=True)

df['door_number'].apply(lambda x : 4 if x=='four' else 2)

0      2
1      2
2      2
3      4
4      4
      ..
200    4
201    4
202    4
203    4
204    4
Name: door_number, Length: 205, dtype: int64

In [21]:
df['door_number'].value_counts()

four    115
two      90
Name: door_number, dtype: int64

In [22]:
df['door_number'].apply(lambda x : 4 if x=='four' else 2).value_counts()

4    115
2     90
Name: door_number, dtype: int64

In [23]:
df['door_number'] = df['door_number'].apply(lambda x : 4 if x=='four' else 2)

In [24]:
df.head()

Unnamed: 0,car_id,symboling,car_name,fuel_type,aspiration,door_number,car_body,drive_wheel,engine_location,wheel_base,...,engine_size,fuel_system,bore_ratio,stroke,compression_ratio,horse_power,peak_rpm,city_mpg,highway_mpg,price
0,1,3,alfa-romero giulia,gas,std,2,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5K,21,27,13495
1,2,3,alfa-romero stelvio,gas,std,2,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5K,21,27,16500
2,3,1,alfa-romero Quadrifoglio,gas,std,2,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5K,19,26,16500
3,4,2,audi 100 ls,gas,std,4,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,5,2,audi 100ls,gas,std,4,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [25]:
df['door_number'] = df['door_number'].astype('int')

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   car_id             205 non-null    int64  
 1   symboling          205 non-null    int64  
 2   car_name           205 non-null    object 
 3   fuel_type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   door_number        205 non-null    int64  
 6   car_body           205 non-null    object 
 7   drive_wheel        205 non-null    object 
 8   engine_location    205 non-null    object 
 9   wheel_base         205 non-null    float64
 10  car_length         205 non-null    float64
 11  car_width          205 non-null    float64
 12  car_height         205 non-null    float64
 13  curb_weight        205 non-null    int64  
 14  engine_type        205 non-null    object 
 15  cylinder_number    205 non-null    object 
 16  engine_size        205 non

## Fixing `peak_rpm`

**Posible Fixes**
- `5K` should be replaced with `5000`
- `5,500` should be replaced with `5500`
- Convert to numerical column

In [27]:
## Run the code and understand the output

df['peak_rpm'].apply(lambda x : (int(x.replace('K', '')) * 1000) if 'K' in x else int(x))

ValueError: ignored

In [28]:
df['peak_rpm'].apply(lambda x : (int(x.replace('K', '')) * 1000) if 'K' in x else int(x.replace(',', '')) if ',' in x else int(x))

0      5000
1      5000
2      5000
3      5500
4      5500
       ... 
200    5400
201    5300
202    5500
203    4800
204    5400
Name: peak_rpm, Length: 205, dtype: int64

In [29]:
df['peak_rpm'] = df['peak_rpm'].apply(lambda x : (int(x.replace('K', '')) * 1000) if 'K' in x else int(x.replace(',', '')) if ',' in x else int(x))

In [30]:
df['peak_rpm'] = df['peak_rpm'].astype('int')

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   car_id             205 non-null    int64  
 1   symboling          205 non-null    int64  
 2   car_name           205 non-null    object 
 3   fuel_type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   door_number        205 non-null    int64  
 6   car_body           205 non-null    object 
 7   drive_wheel        205 non-null    object 
 8   engine_location    205 non-null    object 
 9   wheel_base         205 non-null    float64
 10  car_length         205 non-null    float64
 11  car_width          205 non-null    float64
 12  car_height         205 non-null    float64
 13  curb_weight        205 non-null    int64  
 14  engine_type        205 non-null    object 
 15  cylinder_number    205 non-null    object 
 16  engine_size        205 non

In [32]:
df.cylinder_number.value_counts()

four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: cylinder_number, dtype: int64

## Fixing `car_name`

**Posible Fixes**
- Extract Company Name from car name
- Observe problems in Company Names

In [33]:
df.head()

Unnamed: 0,car_id,symboling,car_name,fuel_type,aspiration,door_number,car_body,drive_wheel,engine_location,wheel_base,...,engine_size,fuel_system,bore_ratio,stroke,compression_ratio,horse_power,peak_rpm,city_mpg,highway_mpg,price
0,1,3,alfa-romero giulia,gas,std,2,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,2,3,alfa-romero stelvio,gas,std,2,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,3,1,alfa-romero Quadrifoglio,gas,std,2,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,4,2,audi 100 ls,gas,std,4,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,5,2,audi 100ls,gas,std,4,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [34]:
df.car_name

0            alfa-romero giulia
1           alfa-romero stelvio
2      alfa-romero Quadrifoglio
3                   audi 100 ls
4                    audi 100ls
                 ...           
200             volvo 145e (sw)
201                 volvo 144ea
202                 volvo 244dl
203                   volvo 246
204                 volvo 264gl
Name: car_name, Length: 205, dtype: object

In [35]:
df.car_name[:20]

0           alfa-romero giulia
1          alfa-romero stelvio
2     alfa-romero Quadrifoglio
3                  audi 100 ls
4                   audi 100ls
5                     audi fox
6                   audi 100ls
7                    audi 5000
8                    audi 4000
9          audi 5000s (diesel)
10                    bmw 320i
11                    bmw 320i
12                      bmw x1
13                      bmw x3
14                      bmw z4
15                      bmw x4
16                      bmw x5
17                      bmw x3
18            chevrolet impala
19       chevrolet monte carlo
Name: car_name, dtype: object

In [36]:
df.car_name.apply(lambda x : x.split())

0            [alfa-romero, giulia]
1           [alfa-romero, stelvio]
2      [alfa-romero, Quadrifoglio]
3                  [audi, 100, ls]
4                    [audi, 100ls]
                  ...             
200            [volvo, 145e, (sw)]
201                 [volvo, 144ea]
202                 [volvo, 244dl]
203                   [volvo, 246]
204                 [volvo, 264gl]
Name: car_name, Length: 205, dtype: object

In [37]:
df.car_name.apply(lambda x : x.split()[0])

0      alfa-romero
1      alfa-romero
2      alfa-romero
3             audi
4             audi
          ...     
200          volvo
201          volvo
202          volvo
203          volvo
204          volvo
Name: car_name, Length: 205, dtype: object

In [38]:
df['car_company'] = df.car_name.apply(lambda x : x.split()[0])

In [39]:
df.car_company.value_counts()

toyota         31
nissan         17
mazda          15
honda          13
mitsubishi     13
subaru         12
peugeot        11
volvo          11
volkswagen      9
dodge           9
buick           8
bmw             8
audi            7
plymouth        7
saab            6
isuzu           4
porsche         4
alfa-romero     3
chevrolet       3
jaguar          3
vw              2
maxda           2
renault         2
toyouta         1
vokswagen       1
Nissan          1
mercury         1
porcshce        1
Name: car_company, dtype: int64

**Observations**

Notice that some **car_company names are misspelled**  
- vw and vokswagen should be volkswagen 
- porcshce should be porsche 
- toyouta should be toyota
- Nissan should be nissan 
- maxda should be mazda

This is a data quality issue, let's solve it.

In [45]:
# replacing misspelled car_company names

# volkswagen
df.loc[(df['car_company'] == "vw") | (df['car_company'] == "vokswagen"), 'car_company'] = 'volkswagen'

# porsche
df.loc[df['car_company'] == "porcshce", 'car_company'] = 'porsche'

# toyota
df.loc[df['car_company'] == "toyouta", 'car_company'] = 'toyota'

# nissan
df.loc[df['car_company'] == "Nissan", 'car_company'] = 'nissan'

# mazda
df.loc[df['car_company'] == "maxda", 'car_company'] = 'mazda'

In [46]:
# df['car_company'].apply(lambda x : "volkswagen" if x == "vw" or x == "vokswagen" else 
#                        "porsche" if x == "porcshce" else
#                        "toyota" if x == "toyouta" else
#                        "nissan" if x == "Nissan" else
#                        "mazda" if x == "maxda" else x)

In [47]:
# company_dictionary = {
#                       "vw" : "volkswagen", 
#                       "vokswagen" : "volkswagen", 
#                       "porcshce" : "porsche", 
#                       "toyouta" : "toyota", 
#                       "Nissan" : "nissan",
#                       "maxda" : "mazda"
#                       }

# df['car_company'].apply(lambda x : company_dictionary[x] if x in company_dictionary.keys() else x).value_counts()

In [48]:
df.car_company.value_counts()

toyota         32
nissan         18
mazda          17
mitsubishi     13
honda          13
volkswagen     12
subaru         12
peugeot        11
volvo          11
dodge           9
buick           8
bmw             8
audi            7
plymouth        7
saab            6
porsche         5
isuzu           4
jaguar          3
chevrolet       3
alfa-romero     3
renault         2
mercury         1
Name: car_company, dtype: int64

#### Dropping a column

In [49]:
df.columns

Index(['car_id', 'symboling', 'car_name', 'fuel_type', 'aspiration',
       'door_number', 'car_body', 'drive_wheel', 'engine_location',
       'wheel_base', 'car_length', 'car_width', 'car_height', 'curb_weight',
       'engine_type', 'cylinder_number', 'engine_size', 'fuel_system',
       'bore_ratio', 'stroke', 'compression_ratio', 'horse_power', 'peak_rpm',
       'city_mpg', 'highway_mpg', 'price', 'car_company'],
      dtype='object')

In [50]:
df.drop('car_name', axis=1, inplace=True)

df.columns

Index(['car_id', 'symboling', 'fuel_type', 'aspiration', 'door_number',
       'car_body', 'drive_wheel', 'engine_location', 'wheel_base',
       'car_length', 'car_width', 'car_height', 'curb_weight', 'engine_type',
       'cylinder_number', 'engine_size', 'fuel_system', 'bore_ratio', 'stroke',
       'compression_ratio', 'horse_power', 'peak_rpm', 'city_mpg',
       'highway_mpg', 'price', 'car_company'],
      dtype='object')

## Filtering Categorical and Numerical Columns

In [None]:
categorical_df = df.select_dtypes(include=['object'])

categorical_df.head()

Unnamed: 0,fuel_type,aspiration,car_body,drive_wheel,engine_location,engine_type,cylinder_number,fuel_system,price,car_company
0,gas,std,convertible,rwd,front,dohc,four,mpfi,13495,alfa-romero
1,gas,std,convertible,rwd,front,dohc,four,mpfi,16500,alfa-romero
2,gas,std,hatchback,rwd,front,ohcv,six,mpfi,16500,alfa-romero
3,gas,std,sedan,fwd,front,ohc,four,mpfi,13950,audi
4,gas,std,sedan,4wd,front,ohc,five,mpfi,17450,audi


In [None]:
numerical_df = df.select_dtypes(include=['int64', 'float64'])

numerical_df.head()

Unnamed: 0,car_id,symboling,wheel_base,car_length,car_width,car_height,curb_weight,engine_size,bore_ratio,stroke,compression_ratio,horse_power,city_mpg,highway_mpg
0,1,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,21,27
1,2,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,21,27
2,3,1,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,19,26
3,4,2,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,24,30
4,5,2,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,18,22


In [None]:
print(len(categorical_df.columns))
print(len(numerical_df.columns))

10
14


In [None]:
print(len(df.columns))

26


In [None]:
df.dtypes

car_id                 int64
symboling              int64
fuel_type             object
aspiration            object
door_number            int32
car_body              object
drive_wheel           object
engine_location       object
wheel_base           float64
car_length           float64
car_width            float64
car_height           float64
curb_weight            int64
engine_type           object
cylinder_number       object
engine_size            int64
fuel_system           object
bore_ratio           float64
stroke               float64
compression_ratio    float64
horse_power            int64
peak_rpm               int32
city_mpg               int64
highway_mpg            int64
price                 object
car_company           object
dtype: object

In [None]:
for col in df.columns:
    if(df[col].dtype == "int32"):
        print(col)

door_number
peak_rpm


In [None]:
for col in df.columns:
    if(df[col].dtype == "int32"):
        df[col] = df[col].astype("int64")

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   car_id             205 non-null    int64  
 1   symboling          205 non-null    int64  
 2   fuel_type          205 non-null    object 
 3   aspiration         205 non-null    object 
 4   door_number        205 non-null    int64  
 5   car_body           205 non-null    object 
 6   drive_wheel        205 non-null    object 
 7   engine_location    205 non-null    object 
 8   wheel_base         205 non-null    float64
 9   car_length         205 non-null    float64
 10  car_width          205 non-null    float64
 11  car_height         205 non-null    float64
 12  curb_weight        205 non-null    int64  
 13  engine_type        205 non-null    object 
 14  cylinder_number    205 non-null    object 
 15  engine_size        205 non-null    int64  
 16  fuel_system        205 non

In [None]:
numerical_df = df.select_dtypes(include=['int64', 'float64'])

numerical_df.head()

Unnamed: 0,car_id,symboling,door_number,wheel_base,car_length,car_width,car_height,curb_weight,engine_size,bore_ratio,stroke,compression_ratio,horse_power,peak_rpm,city_mpg,highway_mpg
0,1,3,2,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27
1,2,3,2,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27
2,3,1,2,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26
3,4,2,4,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30
4,5,2,4,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22


## Can you fix the Price Column? 

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   car_id             205 non-null    int64  
 1   symboling          205 non-null    int64  
 2   fuel_type          205 non-null    object 
 3   aspiration         205 non-null    object 
 4   door_number        205 non-null    int64  
 5   car_body           205 non-null    object 
 6   drive_wheel        205 non-null    object 
 7   engine_location    205 non-null    object 
 8   wheel_base         205 non-null    float64
 9   car_length         205 non-null    float64
 10  car_width          205 non-null    float64
 11  car_height         205 non-null    float64
 12  curb_weight        205 non-null    int64  
 13  engine_type        205 non-null    object 
 14  cylinder_number    205 non-null    object 
 15  engine_size        205 non-null    int64  
 16  fuel_system        205 non