In [330]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import all_estimators
from tensorflow.keras import Sequential   # 모델 담는 애
from tensorflow.keras.layers import Dense
import tensorflow as tf
import pandas as pd
import numpy as np

## [1] 데이터 로딩

In [331]:
# re(정규표현식) => \s : 화이트 스페이스(공백)
df=pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data', header=None, sep='\s+')
df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.00,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.00,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.00,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.00,2625.0,18.6,82,1,ford ranger


In [332]:
df.columns=['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year','origin', 'car_name']

In [333]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.00,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.00,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.00,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.00,2625.0,18.6,82,1,ford ranger


In [334]:
df.info()
# horsepower => float

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car_name      398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


## [2] 데이터 전처리

In [335]:
df['horsepower'].unique()

array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',
       '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',
       '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',
       '193.0', '?', '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',
       '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',
       '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)

In [336]:
# horsepower의 ?를 결측치로 변경
df['horsepower']=df['horsepower'].replace('?', np.NaN)

In [337]:
df['horsepower'].unique()

array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',
       '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',
       '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',
       '193.0', nan, '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',
       '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',
       '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)

In [338]:
# horsepower의 자료형 변환
df['horsepower']=df['horsepower'].astype('float64')

In [294]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car_name      398 non-null    object 
dtypes: float64(5), int64(3), object(1)
memory usage: 28.1+ KB


In [295]:
df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

In [296]:
df['horsepower'].mean()

104.46938775510205

In [339]:
# horsepower의 결측치를 평균으로 대치
df['horsepower'].fillna(df['horsepower'].mean(), inplace=True)

In [340]:
df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

In [341]:
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,38.199187,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,76.0,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,95.0,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,125.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [342]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger


In [343]:
# mpg => km/L 단위 변환한 컬럼 추가
df.insert(1, 'kmL', df['mpg']*0.425)

In [344]:
df

Unnamed: 0,mpg,kmL,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,7.650,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,6.375,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,7.650,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,6.800,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,7.225,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...,...
393,27.0,11.475,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
394,44.0,18.700,4,97.0,52.0,2130.0,24.6,82,2,vw pickup
395,32.0,13.600,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage
396,28.0,11.900,4,120.0,79.0,2625.0,18.6,82,1,ford ranger


In [345]:
# mpg 대신 kmL을 종속변수로 사용할 거라서 mpg 열 drop
df.drop(columns=['mpg'], inplace=True)

In [346]:
df

Unnamed: 0,kmL,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,7.650,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,6.375,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,7.650,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,6.800,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,7.225,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,11.475,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
394,18.700,4,97.0,52.0,2130.0,24.6,82,2,vw pickup
395,13.600,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage
396,11.900,4,120.0,79.0,2625.0,18.6,82,1,ford ranger


In [347]:
len(df['car_name'].unique())

305

In [348]:
# car_name의 unique한 값이 너무 많으므로 열 삭제
df.drop(columns=['car_name'], inplace=True)

In [349]:
df

Unnamed: 0,kmL,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,7.650,8,307.0,130.0,3504.0,12.0,70,1
1,6.375,8,350.0,165.0,3693.0,11.5,70,1
2,7.650,8,318.0,150.0,3436.0,11.0,70,1
3,6.800,8,304.0,150.0,3433.0,12.0,70,1
4,7.225,8,302.0,140.0,3449.0,10.5,70,1
...,...,...,...,...,...,...,...,...
393,11.475,4,140.0,86.0,2790.0,15.6,82,1
394,18.700,4,97.0,52.0,2130.0,24.6,82,2
395,13.600,4,135.0,84.0,2295.0,11.6,82,1
396,11.900,4,120.0,79.0,2625.0,18.6,82,1


In [308]:
df.corr()

Unnamed: 0,kmL,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
kmL,1.0,-0.775396,-0.804203,-0.771437,-0.831741,0.420289,0.579267,0.56345
cylinders,-0.775396,1.0,0.950721,0.838939,0.896017,-0.505419,-0.348746,-0.562543
displacement,-0.804203,0.950721,1.0,0.893646,0.932824,-0.543684,-0.370164,-0.609409
horsepower,-0.771437,0.838939,0.893646,1.0,0.860574,-0.684259,-0.411651,-0.453669
weight,-0.831741,0.896017,0.932824,0.860574,1.0,-0.417457,-0.306564,-0.581024
acceleration,0.420289,-0.505419,-0.543684,-0.684259,-0.417457,1.0,0.288137,0.205873
model_year,0.579267,-0.348746,-0.370164,-0.411651,-0.306564,0.288137,1.0,0.180662
origin,0.56345,-0.562543,-0.609409,-0.453669,-0.581024,0.205873,0.180662,1.0


In [309]:
df.describe()

Unnamed: 0,kmL,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,9.993693,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005,1.572864
std,3.321793,1.701004,104.269838,38.199187,846.841774,2.757689,3.697627,0.802055
min,3.825,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,7.4375,4.0,104.25,76.0,2223.75,13.825,73.0,1.0
50%,9.775,4.0,148.5,95.0,2803.5,15.5,76.0,1.0
75%,12.325,8.0,262.0,125.0,3608.0,17.175,79.0,2.0
max,19.805,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


## [3] 데이터 분리

In [350]:
# 종속변수 분리
data=df.iloc[:, 1:]
target=df.iloc[:, 1]

In [351]:
# train, test 세트 나누기
x_train, x_test, y_train, y_test=train_test_split(data, target, test_size=0.2, random_state=42)

In [352]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(318, 7) (80, 7) (318,) (80,)


## [4] 스케일링 - MinMaxScaler

In [353]:
# 스케일링
scaler=MinMaxScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

## [5] 모델 - RandomForestRegressor

In [354]:
# 모델 만들기
rfmodel=RandomForestRegressor()

In [355]:
rfmodel.fit(x_train, y_train)

RandomForestRegressor()

In [356]:
print(rfmodel.score(x_train, y_train))
print(rfmodel.score(x_test, y_test))

0.9996402134224107
0.9996057304277643


### 최적의 모델 찾기

In [321]:
# all_estimator
models=all_estimators(type_filter='regressor')

In [322]:
scores=[]

for name, model in models:
    try:
        # 모델 객체 생성
        md=model()
        # 학습
        md.fit(x_train, y_train)
        # 평가
        train_result=md.score(x_train, y_train)
        result=md.score(x_test, y_test)
        scores.append((name, train_result, result))
        
    except:
        pass

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lars())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LarsCV())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in

In [323]:
scores

[('ARDRegression', 1.0, 1.0),
 ('AdaBoostRegressor', 0.9966665851366037, 1.0),
 ('BaggingRegressor', 0.999155534901273, 0.9970944309927361),
 ('BayesianRidge', 1.0, 1.0),
 ('CCA', 1.0, 1.0),
 ('DecisionTreeRegressor', 1.0, 1.0),
 ('DummyRegressor', 0.0, -0.004585776294147026),
 ('ElasticNet', 0.04260437172113907, 0.038213969544857784),
 ('ElasticNetCV', 0.9998071529172983, 0.9997891789127219),
 ('ExtraTreeRegressor', 1.0, 1.0),
 ('ExtraTreesRegressor', 1.0, 0.9999935431799839),
 ('GammaRegressor', 0.4224720561300088, 0.4226249025540497),
 ('GaussianProcessRegressor', 0.9999999999998531, 0.9999992496154754),
 ('GradientBoostingRegressor', 0.9999999992944921, 0.9999999992912568),
 ('HistGradientBoostingRegressor', 0.99937910385469, 0.99990591337026),
 ('HuberRegressor', 1.0, 1.0),
 ('KNeighborsRegressor', 0.9919109132648248, 0.9959644874899112),
 ('KernelRidge', 0.9222067851860705, 0.9329712746460481),
 ('Lars', 1.0, 1.0),
 ('LarsCV', 1.0, 1.0),
 ('Lasso', 0.0, -0.004585776294147026),
 (

## [6] 예측

In [357]:
pred=np.array([4, 130, 80, 2600, 20, 80, 2])

In [358]:
rfmodel.predict(scaler.transform(pred.reshape(1, -1)))



array([4.])

In [359]:
rfmodel.predict([x_train[0]])

array([8.])

In [363]:
len(df.columns)

8

---

# 딥러닝

In [388]:
model=Sequential(name='auto_mpg')

In [389]:
model.add(Dense(10, activation='relu', input_shape=(7, )))

In [390]:
model.add(Dense(30, activation='relu'))

In [391]:
model.add(Dense(1, activation='linear'))

In [392]:
model.summary()

Model: "auto_mpg"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 10)                80        
                                                                 
 dense_7 (Dense)             (None, 30)                330       
                                                                 
 dense_8 (Dense)             (None, 1)                 31        
                                                                 
Total params: 441
Trainable params: 441
Non-trainable params: 0
_________________________________________________________________


In [393]:
model.compile(optimizer='adam', loss='mean_squared_error',
             metrics=['mean_squared_error'])

In [394]:
model.fit(x_train, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100


Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x222f8a77670>

In [395]:
model.evaluate(x_test, y_test)



[0.03760173171758652, 0.03760173171758652]

### 머신러닝한 랜덤포레스트 모델 MSE 값

In [385]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, rfmodel.predict(x_test))

0.0012212499999999966