In [440]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import requests, zipfile
import io

In [441]:
# Download the imports-85 autos data file from the UCI archive.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

# Bound the wait on a stalled connection and stop on an HTTP error status:
# without these, an error page would be decoded and handed to read_csv as if
# it were the data set.
response = requests.get(url, timeout=30)
response.raise_for_status()
res = response.content

# The file is read with header=None, so the column names are assigned by hand.
columns = ["symboling", "normalized-losses", "make", "fuel-type", "aspiration", "num-of-doors", "body-style", "drive-wheels", "engine-location", "wheel-base",
           "length", "width", "height", "curb-weight", "engine-type", "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
           "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg", "price"]

auto = pd.read_csv(io.StringIO(res.decode("utf-8")), header=None)
auto.columns = columns

In [442]:
# Convert the "?" and "|" placeholder cells to real missing values, then keep
# only fully populated rows and renumber the index from 0.
# Why this differs from a plain chain of replaces: the replacement value must
# be np.nan itself (not the string "np.nan") for dropna to see it, and
# DataFrame.replace without regex=True only matches whole cell values, so a
# "," -> "" replacement cannot strip separators inside numbers.
auto = (
    auto
    .replace(["?", "|"], np.nan)
    .dropna(axis=0)
    .reset_index(drop=True)
)
auto.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
1,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
2,1,158,audi,gas,std,four,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,17710
3,1,158,audi,gas,turbo,four,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.4,8.3,140,5500,17,20,23875
4,2,192,bmw,gas,std,two,sedan,rwd,front,101.2,...,108,mpfi,3.5,2.8,8.8,101,5800,23,29,16430


In [443]:
# Columns that are expanded into indicator (dummy) variables further down.
onehotencoding = [
    "symboling",
    "make",
    "fuel-type",
    "aspiration",
    "num-of-doors",
    "body-style",
    "drive-wheels",
    "engine-location",
    "engine-type",
    "num-of-cylinders",
    "fuel-system",
]

In [444]:
# Replace each column named in `onehotencoding` with indicator columns.
# drop_first=True leaves out one level per encoded column, so the remaining
# indicators for that column are not perfectly collinear.
auto = pd.get_dummies(auto, columns=onehotencoding,
                      drop_first=True)


In [445]:
# Columns that were not one-hot encoded and therefore need a numeric cast.
# Built with a comprehension over `columns` so the result keeps the data
# set's column order on every run; a set difference would yield an
# arbitrary order that can change between kernel restarts.
floating = [col for col in columns if col not in onehotencoding]

In [446]:
# Check dtypes after encoding: in the output below, several numeric-looking
# columns (e.g. normalized-losses, bore, stroke, horsepower, peak-rpm) are
# still typed as object and need an explicit cast.
auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 59 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   normalized-losses       159 non-null    object 
 1   wheel-base              159 non-null    float64
 2   length                  159 non-null    float64
 3   width                   159 non-null    float64
 4   height                  159 non-null    float64
 5   curb-weight             159 non-null    int64  
 6   engine-size             159 non-null    int64  
 7   bore                    159 non-null    object 
 8   stroke                  159 non-null    object 
 9   compression-ratio       159 non-null    float64
 10  horsepower              159 non-null    object 
 11  peak-rpm                159 non-null    object 
 12  city-mpg                159 non-null    int64  
 13  highway-mpg             159 non-null    int64  
 14  price                   159 non-null    ob

In [447]:
# Cast every non-encoded column to float64 in a single astype call, using a
# {column: dtype} mapping instead of assigning column by column.
float_dtypes = dict.fromkeys(floating, np.float64)
auto = auto.astype(float_dtypes)

# Confirm that the former object columns are now float64.
auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 59 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   normalized-losses       159 non-null    float64
 1   wheel-base              159 non-null    float64
 2   length                  159 non-null    float64
 3   width                   159 non-null    float64
 4   height                  159 non-null    float64
 5   curb-weight             159 non-null    float64
 6   engine-size             159 non-null    float64
 7   bore                    159 non-null    float64
 8   stroke                  159 non-null    float64
 9   compression-ratio       159 non-null    float64
 10  horsepower              159 non-null    float64
 11  peak-rpm                159 non-null    float64
 12  city-mpg                159 non-null    float64
 13  highway-mpg             159 non-null    float64
 14  price                   159 non-null    fl

In [448]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA

In [449]:
# Pairwise correlation matrix over all columns of the encoded frame
# (numeric features and the dummy indicators alike).
auto.corr()

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,...,engine-type_ohcv,num-of-cylinders_five,num-of-cylinders_four,num-of-cylinders_six,num-of-cylinders_three,fuel-system_2bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi
normalized-losses,1.0,-0.060086,0.035541,0.109726,-0.413702,0.125858,0.20782,-0.031558,0.06333,-0.127259,...,0.186968,0.003516,-0.244995,0.288623,-0.000296,-0.14231,-0.110178,0.053429,0.233676,0.090593
wheel-base,-0.060086,1.0,0.871534,0.814991,0.555767,0.810181,0.649206,0.578159,0.167449,0.291431,...,0.014449,0.389531,-0.364894,0.220615,-0.152345,-0.459666,0.339752,-0.036513,0.419498,-0.109983
length,0.035541,0.871534,1.0,0.838338,0.499251,0.871291,0.725953,0.646318,0.121073,0.184814,...,0.151761,0.338955,-0.383195,0.275542,-0.216873,-0.483221,0.230757,0.005445,0.545299,-0.09308
width,0.109726,0.814991,0.838338,1.0,0.292706,0.870595,0.779253,0.572554,0.196619,0.258752,...,0.262856,0.513866,-0.513139,0.269691,-0.217457,-0.526301,0.312339,0.028371,0.467067,-0.078649
height,-0.413702,0.555767,0.499251,0.292706,1.0,0.367052,0.111083,0.254836,-0.091313,0.233308,...,-0.120793,0.20339,-0.019879,-0.085293,-0.024601,-0.147031,0.245546,-0.130131,0.141426,-0.243746
curb-weight,0.125858,0.810181,0.871291,0.870595,0.367052,1.0,0.888626,0.645792,0.173844,0.224724,...,0.347825,0.38645,-0.578496,0.426738,-0.161147,-0.567896,0.287617,0.057935,0.53056,-0.064262
engine-size,0.20782,0.649206,0.725953,0.779253,0.111083,0.888626,1.0,0.595737,0.299683,0.141097,...,0.5109,0.303305,-0.701183,0.609479,-0.152553,-0.502771,0.190333,0.096347,0.503662,-0.083234
bore,-0.031558,0.578159,0.646318,0.572554,0.254836,0.645792,0.595737,1.0,-0.102581,0.015119,...,0.131608,0.083877,-0.102459,0.085637,-0.116463,-0.321252,0.06121,0.089521,0.474115,-0.125847
stroke,0.06333,0.167449,0.121073,0.196619,-0.091313,0.173844,0.299683,-0.102581,1.0,0.243587,...,-0.029443,0.219586,-0.179062,0.089174,-0.055846,-0.241469,0.270438,0.179607,-0.090158,0.119926
compression-ratio,-0.127259,0.291431,0.184814,0.258752,0.233308,0.224724,0.141097,0.015119,0.243587,1.0,...,-0.084515,0.313685,-0.153997,-0.0215,-0.013566,-0.225656,0.987713,-0.064862,-0.276533,-0.121813


In [450]:
# Target: the price column. Features: every other column of the encoded frame.
y = auto["price"].to_numpy()
X = auto.drop(columns=["price"]).to_numpy()

In [451]:
# Project the feature matrix onto its first three principal components.
#
# The raw features are re-derived from `auto` here rather than read from `X`,
# because this cell rebinds `X` to the projected data: reading `X` as input
# would make a second run of the cell fit PCA on the already-reduced matrix.
# On a fresh top-to-bottom run the result is the same as fitting on the
# un-projected `X`.
features_raw = auto.drop(columns=["price"]).to_numpy()

pca = PCA(n_components=3)
pca.fit(features_raw)
X = pca.transform(features_raw)

In [452]:
# Random forest regressor with 100 trees; random_state is pinned so repeated
# runs build the same forest.
model = RandomForestRegressor(n_estimators=100,
                              random_state=0)


In [453]:
# 4-fold cross-validation of the forest on the PCA-projected features.
# NOTE(review): the PCA above was fitted on every row before this split, so
# each fold's projection has already seen its held-out rows; the scores may
# be optimistic. Consider fitting PCA inside the CV loop — confirm.
scores = cross_val_score(model, X, y, cv=4)


In [454]:
# One score per fold (four values, matching cv=4).
# Presumably R^2, the regressor's default scorer — TODO confirm.
print(scores)

[0.64080788 0.6717524  0.5529943  0.57987281]


In [455]:
# Average of the per-fold scores, used as the single summary figure.
print(scores.mean())

0.6113568451541496
