# 연비 예측하기
- 데이터셋 : auto-mpg.csv
- 목적 : 연비예측
- 피쳐 : cylinders, displacement, horse power, weight, acceleration
- 타겟 : MPG
- 학습방법 : 지도학습 -> 회귀
- 알고리즘 : LinearRegression


In [50]:
import pandas as pd
import matplotlib.pyplot as plt
# 데이터준비
FILE = '../data/auto_mpg.csv'
autodf=pd.read_csv(FILE)
autodf




Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [51]:
# pd.to_numeric(..., errors='coerce')를 사용하여 ? 값을 NaN으로 변환
# ? 값을 NaN으로 변환
autodf['cylinders'] = pd.to_numeric(autodf['cylinders'], errors='coerce')
autodf['displacement'] = pd.to_numeric(autodf['displacement'], errors='coerce')
autodf['horsepower'] = pd.to_numeric(autodf['horsepower'], errors='coerce')
autodf['weight'] = pd.to_numeric(autodf['weight'], errors='coerce')
autodf['acceleration'] = pd.to_numeric(autodf['acceleration'], errors='coerce')

# NaN 값을 가진 행을 제거
autodf.dropna(subset=['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration'], inplace=True)

# 정리된 데이터 확인
print(autodf.head())
target=autodf['mpg']*0.425144
autodf.info()

    mpg  cylinders  displacement  horsepower  weight  acceleration  \
0  18.0          8         307.0       130.0    3504          12.0   
1  15.0          8         350.0       165.0    3693          11.5   
2  18.0          8         318.0       150.0    3436          11.0   
3  16.0          8         304.0       150.0    3433          12.0   
4  17.0          8         302.0       140.0    3449          10.5   

   model year  origin                   car name  
0          70       1  chevrolet chevelle malibu  
1          70       1          buick skylark 320  
2          70       1         plymouth satellite  
3          70       1              amc rebel sst  
4          70       1                ford torino  
<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2

In [52]:
target.shape

(392,)

In [53]:
feature = pd.concat([autodf['cylinders'], autodf['displacement'], autodf['horsepower'], autodf['weight'], autodf['acceleration']], axis=1)
feature.shape

(392, 5)

In [54]:
from sklearn.model_selection import train_test_split

In [55]:
x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.3, random_state=38)

In [56]:
print(f"X_train: {x_train.shape}, {x_train.ndim}D")
print(f"y_tarin: {y_train.shape}, {y_train.ndim}D")
print()
print(f"X_test: {x_test.shape}, {x_test.ndim}D")
print(f"y_test: {y_test.shape}, {y_test.ndim}D")

X_train: (274, 5), 2D
y_tarin: (274,), 1D

X_test: (118, 5), 2D
y_test: (118,), 1D


In [58]:
# 비율
print('X_train ratio: ', len(x_train)/len(autodf))
print('X_test ratio: ', len(x_test)/len(autodf))

X_train ratio:  0.6989795918367347
X_test ratio:  0.3010204081632653


In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
model = LinearRegression()
model.fit(feature, target)

In [None]:
print(f'기울기:{model.coef_}, 절편: {model.intercept_}')

기울기:[-3.97928359e-01 -8.31301222e-05 -4.52570802e-02 -5.18691735e-03
 -2.91047140e-02], 절편: 46.264307852571136


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
score = model.score(feature, target)
print(f'score : {score}')

score : 0.7076926326770516


In [None]:
# 예측결과
pre_feature= model.predict(feature)
print(pre_feature)

[18.64772466 16.09437724 18.12348372 18.11110358 18.52450703 11.27686094
 10.24080171 10.70065382  9.61705734 14.23259315 16.58330487 16.85906149
 16.47457598 16.56233967 27.62383944 24.41519541 24.63062624 26.12902926
 29.21175544 32.46806514 26.35730848 27.56433477 27.53626507 27.59710334
 25.61552974  8.97559235 10.86942298 10.42847331  9.23806286 29.21175544
 28.39351504 28.39986025 25.29104192 20.81710818 21.61187624 22.29500797
 21.82603619 13.40337553 11.63853665 14.18800074 14.64190884  8.86688805
 10.38755901  8.11762817 23.12044849 28.35935989 21.87021371 23.16958021
 28.84792114 29.17052258 30.17282302 30.0927382  31.97558794 32.65348798
 31.88331    30.75996391 28.09685731 29.5217564  29.84537599 27.53018009
 28.74403785 13.0662259  12.03375077 14.42506671 14.33222603 16.88598269
  9.28060556 12.29252248 12.30483527 11.14842627 28.19633583 15.71575616
 15.50848633 13.98148595 14.71135556 23.95849576 27.67476327 24.70588413
 29.66382847 27.88200118 28.13843239 26.8522488  29

In [None]:
# 오차 계산 함수들임 -> 정답과 예측값(predcit)을 줘야함
mse = mean_squared_error(target, pre_feature)
rmse = mean_squared_error(target, pre_feature, squared=False)
mae = mean_absolute_error(target, pre_feature)

# 얼마나 정답에 가깝게 값을 예측했느냐를 나타내는 지표.
r2 = r2_score(target, pre_feature)

In [None]:
print(mse)
# 평균 제곱 오차는 예측 값과 실제 값 간의 차이를 제곱하여 평균을 구한 값
# # 값이 작을 수록 모델의 성능이 좋음을 나타냄
print(rmse)


print(mae)
# 평균 절대 오차는 예측 값과 실제 값 간의 차이의 절대값을 평균한 값
# 값이 작을 수록 모델의 성능이 좋음을 나타냄

print(r2)
# 결정 계수 : 모델이 데이터의 변동성을 얼마나 설명하는지 지표임.
# 1에 가까울 수록 모델이 데이터를 잘 설명함
# 회귀 모델을 평가할 때 사용

17.761396105406217
4.214427138462145
3.2223001384505623
0.7076926326770516
