# Data I/O

In [1]:
# load cars dataset to pandas dataframe from url
import pandas as pd

# Location of the raw "imports-85" autos data file in the UCI repository.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'

# Read without a header row, so the columns get integer labels.
df = pd.read_csv(filepath_or_buffer=url, header=None)
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [2]:
# Let wide frames render up to 500 columns instead of being elided with "...".
pd.options.display.max_columns = 500

In [3]:
# Labels for the 26 columns of the data file, in file order.
column_names = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight',
    'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system',
    'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm',
    'city-mpg', 'highway-mpg', 'price',
]

# Read the file again, this time labelling the columns with `column_names`.
df = pd.read_csv(url, header=None, names=column_names)
df.head(n=5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


## Cleaning

In [4]:
import numpy as np
# The file uses '?' as its missing-value marker; turn every occurrence into
# NaN so pandas recognises it as missing. `df` is rebound to the result rather
# than mutated with inplace=True, which keeps the step chainable.
# NOTE(review): columns that contained '?' presumably remain object dtype
# after this step (several numeric-looking columns are absent from the later
# df.describe() output) - consider pd.to_numeric for them.
df = df.replace('?', np.nan)
df.head(1)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495


## Transforming

### num-of-doors

In [8]:
# Frequency of each door-count label (missing entries are not counted).
door_counts = df['num-of-doors'].value_counts()
door_counts

four    114
two      89
Name: num-of-doors, dtype: int64

In [5]:
# Summary statistics for the door-count column.
door_summary = df['num-of-doors'].describe()
door_summary

count      203
unique       2
top       four
freq       114
Name: num-of-doors, dtype: object

In [7]:
# Convert the spelled-out door counts to small integers.
#
# The column is rebuilt and assigned back instead of calling
# `df['num-of-doors'].replace(..., inplace=True)`: that form operates on an
# intermediate Series, which pandas does not guarantee to write back into `df`
# (under Copy-on-Write it never does), leaving the strings in place and making
# the integer cast fail.
door_label_to_count = {'four': 4, 'two': 2}
df = (
    df
    # Rows with a missing door count cannot be cast to an integer dtype.
    .dropna(subset=['num-of-doors'])
    .assign(**{
        'num-of-doors': lambda frame: (
            frame['num-of-doors'].replace(door_label_to_count).astype('int8')
        ),
    })
)


In [9]:
# Summary statistics for the door-count column.
df.loc[:, 'num-of-doors'].describe()

count    203.000000
mean       3.123153
std        0.994841
min        2.000000
25%        2.000000
50%        4.000000
75%        4.000000
max        4.000000
Name: num-of-doors, dtype: float64

In [11]:
# Show only the first row as a quick check of the frame.
df.iloc[:1]

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,2,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495


In [10]:
# Summary statistics for the columns pandas currently treats as numeric.
numeric_summary = df.describe()
numeric_summary

Unnamed: 0,symboling,num-of-doors,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg
count,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0
mean,0.837438,3.123153,98.781281,174.1133,65.915271,53.731527,2557.916256,127.073892,10.093202,25.172414,30.699507
std,1.250021,0.994841,6.040994,12.33909,2.150274,2.442526,522.557049,41.797123,3.888216,6.529812,6.874645
min,-2.0,2.0,86.6,141.1,60.3,47.8,1488.0,61.0,7.0,13.0,16.0
25%,0.0,2.0,94.5,166.55,64.1,52.0,2145.0,97.0,8.6,19.0,25.0
50%,1.0,4.0,97.0,173.2,65.5,54.1,2414.0,120.0,9.0,24.0,30.0
75%,2.0,4.0,102.4,183.3,66.9,55.5,2943.5,143.0,9.4,30.0,34.0
max,3.0,4.0,120.9,208.1,72.3,59.8,4066.0,326.0,23.0,49.0,54.0


### Categorical variables

In [14]:
# Columns to inspect as categorical variables.
categorical_columns = [
    'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels',
    'engine-location', 'engine-type', 'fuel-system',
]
# Print the frequency table of each listed column in turn.
for column in categorical_columns:
    level_counts = df[column].value_counts()
    print(level_counts)


toyota           32
nissan           18
mazda            16
mitsubishi       13
honda            13
volkswagen       12
subaru           12
peugot           11
volvo            11
mercedes-benz     8
dodge             8
bmw               8
audi              7
plymouth          7
saab              6
porsche           5
isuzu             4
jaguar            3
chevrolet         3
alfa-romero       3
renault           2
mercury           1
Name: make, dtype: int64
gas       184
diesel     19
Name: fuel-type, dtype: int64
std      167
turbo     36
Name: aspiration, dtype: int64
4    114
2     89
Name: num-of-doors, dtype: int64
sedan          94
hatchback      70
wagon          25
hardtop         8
convertible     6
Name: body-style, dtype: int64
fwd    118
rwd     76
4wd      9
Name: drive-wheels, dtype: int64
front    200
rear       3
Name: engine-location, dtype: int64
ohc      146
ohcf      15
ohcv      13
dohc      12
l         12
rotor      4
dohcv      1
Name: engine-type, dtype: int

In [16]:
# Show the first five rows.
df.head(n=5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,2,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,2,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,2,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,4,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,4,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [15]:
# Columns to store with the pandas 'category' dtype.
categorical_columns = [
    'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels',
    'engine-location', 'engine-type', 'fuel-system',
]
# Cast all of them in a single astype call; other columns are left as they are.
df = df.astype(dict.fromkeys(categorical_columns, 'category'))

### num-of-cylinders

In [16]:
# Frequency of each spelled-out cylinder count.
cylinder_counts = df['num-of-cylinders'].value_counts()
cylinder_counts

four      157
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: num-of-cylinders, dtype: int64

In [17]:
# Convert the spelled-out cylinder counts to small integers.
#
# The result is assigned back to the column instead of calling
# `df['num-of-cylinders'].replace(..., inplace=True)`: that form operates on
# an intermediate Series, which pandas does not guarantee to write back into
# `df` (under Copy-on-Write it never does).
cylinder_label_to_count = {
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'eight': 8,
    'twelve': 12,
}
df['num-of-cylinders'] = (
    df['num-of-cylinders']
    .replace(cylinder_label_to_count)
    # Explicit cast: the dtype left behind by replace() on an object column
    # differs between pandas versions.
    .astype('int8')
)

In [18]:
# Cast num-of-doors to int8.
# NOTE(review): this cell sits under the num-of-cylinders heading, while
# num-of-doors was already cast to int8 earlier and is also listed in
# categorical_columns - confirm which column was meant here.
df = df.astype({'num-of-doors': 'int8'})

In [19]:
# Show the first five rows.
df.iloc[:5]

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,2,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,4,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,2,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,4,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,2,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,6,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,4,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,4,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,4,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,5,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [25]:
# Write the frame to CSV in the working directory, leaving out the row index.
df.to_csv(path_or_buf='autos.csv', index=False)

In [26]:
# Write the frame to JSON in the working directory using pandas' default layout.
df.to_json(path_or_buf='autos.json')

In [27]:
# Pickle the frame to the working directory.
df.to_pickle(path='autos.pkl')