In [1]:
import numpy as np
import numpy.random as random
import scipy as sp
import pandas as pd
from pandas import Series, DataFrame

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

import sklearn

%precision 3

'%.3f'

In [2]:
import requests, zipfile
import io

In [3]:
# Download the automobile price data set (UCI "imports-85").
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
# A timeout keeps the cell from hanging forever if the server does not answer.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of parsing an error page as CSV later on.
response.raise_for_status()
# Raw bytes of the CSV payload; decoded by the cells that build the DataFrame.
res = response.content

In [21]:
# Parse the downloaded bytes as a header-less CSV into a DataFrame
csv_buffer = io.StringIO(res.decode('utf-8'))
auto = pd.read_csv(csv_buffer, header=None)

In [22]:
# The file has no header row, so the 26 column names are supplied by hand
column_names = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
    'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
    'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg',
    'price',
]
auto.columns = column_names

In [25]:
# Print the frame's dimensions, then preview its first rows.
# NOTE(review): the recorded output for this cell shows only 4 columns, although
# the frame has 26 named columns at this point in the notebook — it looks like the
# cell was executed out of order; confirm with Restart Kernel -> Run All.
n_rows, n_cols = auto.shape
print((n_rows, n_cols))
auto.head(5)

(205, 4)


Unnamed: 0,price,horsepower,width,height
0,13495,111,64.1,48.8
1,16500,111,64.1,48.8
2,16500,154,65.5,52.4
3,13950,102,66.2,54.3
4,17450,115,66.4,54.3


In [26]:
# Keep the target and the candidate features, then count the '?' placeholders per column
columns_of_interest = ['price', 'horsepower', 'width', 'height']
auto = auto[columns_of_interest]
placeholder_mask = auto.isin(['?'])
placeholder_mask.sum()

price         4
horsepower    2
width         0
height        0
dtype: int64

In [27]:
# Turn every '?' into NaN, then discard the rows that contain a NaN
auto_with_nan = auto.replace('?', np.nan)
auto = auto_with_nan.dropna()
auto.shape

(199, 4)

In [28]:
# Inspect the column dtypes before any numeric conversion is applied
auto.dtypes

price          object
horsepower     object
width         float64
height        float64
dtype: object

In [29]:
# Convert the string-typed columns to numeric dtypes, then re-check the dtypes
numeric_columns = {
    name: pd.to_numeric(auto[name]) for name in ('price', 'horsepower')
}
auto = auto.assign(**numeric_columns)
auto.dtypes

price           int64
horsepower      int64
width         float64
height        float64
dtype: object

In [30]:
# Check the pairwise correlations between the columns
auto.corr()

Unnamed: 0,price,horsepower,width,height
price,1.0,0.810533,0.753871,0.13499
horsepower,0.810533,1.0,0.615315,-0.087407
width,0.753871,0.615315,1.0,0.309223
height,0.13499,-0.087407,0.309223,1.0


In [32]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# X: explanatory variables, y: target variable
y = auto['price']
X = auto.drop(columns='price')

# Hold out half of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

# fit() returns the estimator itself, so the fitted model can be bound directly
model = LinearRegression().fit(X_train, y_train)
model

LinearRegression()

In [35]:
# Print the model score on the training split first, then on the test split
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(model.score(features, target))

0.7333575683901375
0.7370688738125756


In [37]:
# Report the fitted coefficients labelled by feature name, followed by the intercept
coefficients = pd.Series(model.coef_, index=X.columns)
print('\n回帰係数\n{}'.format(coefficients))
print('切片：{:.3f}'.format(model.intercept_))


回帰係数
horsepower      81.651078
width         1829.174506
height         229.510077
dtype: float64
切片：-128409.046


In [39]:
# Rebuild the full DataFrame from the downloaded bytes for the second model
raw_text = res.decode('utf-8')
auto = pd.read_csv(io.StringIO(raw_text), header=None)

In [40]:
# Name the 26 columns of the header-less file, grouped here by theme
identity_columns = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
                    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location']
dimension_columns = ['wheel-base', 'length', 'width', 'height', 'curb-weight']
engine_columns = ['engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore',
                  'stroke', 'compression-ratio', 'horsepower', 'peak-rpm']
outcome_columns = ['city-mpg', 'highway-mpg', 'price']
auto.columns = identity_columns + dimension_columns + engine_columns + outcome_columns

In [41]:
# Restrict the frame to price, width and engine-size, then count '?' entries per column
kept_columns = ['price', 'width', 'engine-size']
auto = auto[kept_columns]
is_placeholder = auto.isin(['?'])
is_placeholder.sum()

price          4
width          0
engine-size    0
dtype: int64

In [42]:
# Mark each '?' as missing (NaN) and keep only the rows without missing values
auto_marked = auto.replace('?', np.nan)
auto = auto_marked.dropna()

In [44]:
# Convert price to a numeric dtype, then display the resulting dtypes
auto = auto.assign(price=lambda frame: pd.to_numeric(frame['price']))
auto.dtypes

price            int64
width          float64
engine-size      int64
dtype: object

In [45]:
# X: explanatory variables (every column except price), y: target variable (price)
X = auto.drop(columns=['price'])
y = auto['price']

# Same 50/50 split and seed as the first model
split = train_test_split(X, y, test_size=0.5, random_state=0)
X_train, X_test, y_train, y_test = split

# fit() returns the estimator itself; the trailing expression displays it
model = LinearRegression().fit(X_train, y_train)
model

LinearRegression()

In [46]:
# Score the fitted model on both splits and print train first, then test
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(train_score)
print(test_score)

0.783188597901612
0.778292193907783


In [47]:
# Build the coefficient table (indexed by feature name) and the intercept line, then print both
coef_by_feature = pd.Series(model.coef_, index=X.columns)
coef_report = '\n回帰係数\n{}'.format(coef_by_feature)
intercept_report = '切片：{:.3f}'.format(model.intercept_)
print(coef_report)
print(intercept_report)


回帰係数
width          1261.735518
engine-size     109.526787
dtype: float64
切片：-84060.643
