# 二手車價預測
## [Auto MPG 資料集](https://archive.ics.uci.edu/ml/datasets/auto+mpg)

## 載入相關套件

In [76]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## 1. 載入資料集

In [77]:
# Load the dataset from the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
names = ["mpg", "cylinders", "displacement", "horsepower", "weight",
         "acceleration", "model_year", "origin", "car_name"]
df = pd.read_csv(url, names=names, delim_whitespace=True)

# Display the first 5 rows of the dataframe
df.head()


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [78]:
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

In [79]:
# 方法 2
df = pd.read_csv(url, names=names, delim_whitespace=True, na_values='?')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


## 2. 資料清理、資料探索與分析

In [80]:
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

In [81]:
df = df.dropna()

In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    float64
 5   acceleration  392 non-null    float64
 6   model_year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   car_name      392 non-null    object 
dtypes: float64(5), int64(3), object(1)
memory usage: 30.6+ KB


In [83]:
for col in df.columns:
    if df[col].dtype == 'object':
        print(col)
        print(df[col].unique())

car_name
['chevrolet chevelle malibu' 'buick skylark 320' 'plymouth satellite'
 'amc rebel sst' 'ford torino' 'ford galaxie 500' 'chevrolet impala'
 'plymouth fury iii' 'pontiac catalina' 'amc ambassador dpl'
 'dodge challenger se' "plymouth 'cuda 340" 'chevrolet monte carlo'
 'buick estate wagon (sw)' 'toyota corona mark ii' 'plymouth duster'
 'amc hornet' 'ford maverick' 'datsun pl510'
 'volkswagen 1131 deluxe sedan' 'peugeot 504' 'audi 100 ls' 'saab 99e'
 'bmw 2002' 'amc gremlin' 'ford f250' 'chevy c20' 'dodge d200' 'hi 1200d'
 'chevrolet vega 2300' 'toyota corona' 'plymouth satellite custom'
 'ford torino 500' 'amc matador' 'pontiac catalina brougham'
 'dodge monaco (sw)' 'ford country squire (sw)' 'pontiac safari (sw)'
 'amc hornet sportabout (sw)' 'chevrolet vega (sw)' 'pontiac firebird'
 'ford mustang' 'mercury capri 2000' 'opel 1900' 'peugeot 304' 'fiat 124b'
 'toyota corolla 1200' 'datsun 1200' 'volkswagen model 111'
 'plymouth cricket' 'toyota corona hardtop' 'dodge colt hard

In [84]:
df.car_name.nunique()

301

In [85]:
df = df.drop('car_name', axis=1)

In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    float64
 5   acceleration  392 non-null    float64
 6   model_year    392 non-null    int64  
 7   origin        392 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 27.6 KB


In [87]:
y = df.mpg
X = df.drop('mpg', axis=1)

## 3. 不須進行特徵工程

## 4. 資料分割

In [88]:
# 指定X，並轉為 Numpy 陣列
X = X.values

# 資料分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

# 查看陣列維度
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((313, 7), (79, 7), (313,), (79,))

## 特徵縮放

In [89]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

## 5. 選擇演算法

In [90]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

## 6. 模型訓練

In [91]:
model.fit(X_train_std, y_train)

## 7. 模型評估

In [92]:
y_pred = model.predict(X_test_std)
y_pred

array([34.591842  , 30.72688449, 16.4297567 , 23.25578021, 18.51034139,
       28.77286093, 14.84856629, 29.39225069, 13.49839868, 29.15508698,
       31.46386333, 13.77944237, 26.00095961, 29.14153641, 24.13470199,
       25.99840378, 26.72993906, 29.91218521, 19.78227061, 30.86673992,
       17.27159997, 20.67758297, 19.96535969, 10.87736295, 12.61108533,
       25.54817185, 25.68850415, 26.94991119, 15.52805841, 25.46665403,
       19.95169275, 16.57072051, 19.63821458, 32.80230354, 24.09525063,
       12.83702446, 28.16518239, 17.08579252, 29.01990335,  6.42800614,
       26.2414252 , 13.52851528, 22.93544922, 26.72381304, 23.62362795,
       25.85646091, 22.09203071, 22.24863391, 33.20913057, 19.10853334,
       31.72311368, 36.13583256, 24.1429739 , 23.53700951, 11.57827176,
       28.95814325, 10.18462873, 13.55139799, 12.86096079, 25.52530476,
       26.29075594, 29.45843724, 25.21190301, 21.16519821, 28.5008907 ,
       22.70011568, 32.69197453, 27.8489495 , 30.84765923, 15.00

In [93]:
# R2、MSE、MAE
y_pred = model.predict(X_test_std)
print(f'R2 = {r2_score(y_test, y_pred):.2f}') 
print(f'MSE = {mean_squared_error(y_test, y_pred)}') 
print(f'MAE = {mean_absolute_error(y_test, y_pred)}') 

R2 = 0.82
MSE = 11.357689553898508
MAE = 2.4665461712857972
