In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import requests, zipfile
import io

In [16]:
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
res = requests.get(url).content

auto = pd.read_csv(io.StringIO(res.decode("utf-8")), header=None)
columns =  ["symboling", "normalized-losses", "make", "fuel-type", "aspiration", "num-of-doors", "body-style", "drive-wheels", "engine-location", "wheel-base",
                "length", "width", "height", "curb-weight", "engine-type", "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
                "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg", "price"]
auto.columns = columns

In [17]:
auto = auto.replace("?", np.nan).dropna(axis=0).reset_index(drop=True)
auto.info()
auto.iloc[:, :10]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          159 non-null    int64  
 1   normalized-losses  159 non-null    object 
 2   make               159 non-null    object 
 3   fuel-type          159 non-null    object 
 4   aspiration         159 non-null    object 
 5   num-of-doors       159 non-null    object 
 6   body-style         159 non-null    object 
 7   drive-wheels       159 non-null    object 
 8   engine-location    159 non-null    object 
 9   wheel-base         159 non-null    float64
 10  length             159 non-null    float64
 11  width              159 non-null    float64
 12  height             159 non-null    float64
 13  curb-weight        159 non-null    int64  
 14  engine-type        159 non-null    object 
 15  num-of-cylinders   159 non-null    object 
 16  engine-size        159 non

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base
0,2,164,audi,gas,std,four,sedan,fwd,front,99.8
1,2,164,audi,gas,std,four,sedan,4wd,front,99.4
2,1,158,audi,gas,std,four,sedan,fwd,front,105.8
3,1,158,audi,gas,turbo,four,sedan,fwd,front,105.8
4,2,192,bmw,gas,std,two,sedan,rwd,front,101.2
...,...,...,...,...,...,...,...,...,...,...
154,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1
155,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1
156,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1
157,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1


In [18]:
auto = auto[["price", "horsepower", "width", "length", "height", "curb-weight"]]
auto.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   price        159 non-null    object 
 1   horsepower   159 non-null    object 
 2   width        159 non-null    float64
 3   length       159 non-null    float64
 4   height       159 non-null    float64
 5   curb-weight  159 non-null    int64  
dtypes: float64(3), int64(1), object(2)
memory usage: 7.6+ KB


In [19]:
auto["price"] = auto["price"].astype(np.int64)
auto["horsepower"] = auto["horsepower"].astype(np.int64)

auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   price        159 non-null    int64  
 1   horsepower   159 non-null    int64  
 2   width        159 non-null    float64
 3   length       159 non-null    float64
 4   height       159 non-null    float64
 5   curb-weight  159 non-null    int64  
dtypes: float64(3), int64(3)
memory usage: 7.6 KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  auto["price"] = auto["price"].astype(np.int64)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  auto["horsepower"] = auto["horsepower"].astype(np.int64)


In [20]:
auto.corr()

Unnamed: 0,price,horsepower,width,length,height,curb-weight
price,1.0,0.759874,0.843371,0.760952,0.244836,0.893639
horsepower,0.759874,1.0,0.681872,0.672063,0.034317,0.790095
width,0.843371,0.681872,1.0,0.838338,0.292706,0.870595
length,0.760952,0.672063,0.838338,1.0,0.499251,0.871291
height,0.244836,0.034317,0.292706,0.499251,1.0,0.367052
curb-weight,0.893639,0.790095,0.870595,0.871291,0.367052,1.0


In [21]:
auto.shape

(159, 6)

In [22]:
X = auto.drop(["price"], axis=1).values
y = auto["price"].values

In [23]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

In [24]:
std = StandardScaler()
X = std.fit_transform(X)

In [25]:
model = RandomForestRegressor(n_estimators=100, random_state=0)

In [26]:
scores = cross_val_score(model, X, y, cv=10, scoring="neg_mean_squared_error")


In [27]:
print(scores)

[-13928620.94881458  -2146913.48178503  -9277094.6120102
  -3894347.73706177  -7667507.86850156  -4169298.81764369
  -1420823.91624276  -6355185.63967578  -3703747.52014657
  -8750445.62168074]


In [28]:
print(scores.mean())

-6131398.6163562685
