In [1]:
# Data handling, plotting and scikit-learn building blocks used by the cells below.
import mglearn
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
# Use the 'Malgun Gothic' font family — presumably so Hangul labels render in figures.
matplotlib.rcParams['font.family']='Malgun Gothic'
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
matplotlib.rcParams['axes.unicode_minus'] = False
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.simplefilter('ignore')

In [2]:
# Read the 'train' and 'test' sheets of the same workbook, in that order.
train_df, test_df = (
    pd.read_excel('data4/hyundaiCar.xlsx', sheet_name=sheet)
    for sheet in ('train', 'test')
)

In [3]:
# Target: the '가격' column. Features: every column after the first one.
y_train, y_test = train_df['가격'], test_df['가격']
x_train, x_test = train_df.iloc[:, 1:], test_df.iloc[:, 1:]

In [4]:
# Report the shape of each split; labels are padded so the values line up.
for label, data in (
    ('x_train shape:', x_train),
    ('x_test shape: ', x_test),
    ('y_train shape:', y_train),
    ('y_test shape: ', y_test),
):
    print(f'{label} {data.shape} ')

x_train shape: (71, 10) 
x_test shape:  (31, 10) 
y_train shape: (71,) 
y_test shape:  (31,) 


## 문자열 encoding

- Label encoding
- OneHot encoding
- pd.get_dummies
- replace
- make_column_transformer

In [5]:
# Preview the first seven feature rows.
x_train.head(7)

Unnamed: 0,년식,종류,연비,마력,토크,연료,하이브리드,배기량,중량,변속기
0,2015,준중형,11.8,172,21.0,가솔린,0,1999,1300,자동
1,2015,준중형,12.3,204,27.0,가솔린,0,1591,1300,자동
2,2015,소형,15.0,100,13.6,가솔린,0,1368,1035,수동
3,2014,소형,14.0,140,17.0,가솔린,0,1591,1090,자동
4,2015,대형,9.6,175,46.0,디젤,0,2497,1990,자동
5,2015,중형,12.4,200,44.5,디젤,0,2199,1864,자동
6,2015,중형,13.8,200,44.5,디젤,0,2199,1799,자동


#### Label encoding

In [6]:
# Learn the distinct values of '종류', then map each row to its integer code.
# NOTE(review): scikit-learn's docs describe LabelEncoder as meant for target
# labels; it is used on a feature column here for demonstration.
lbl = LabelEncoder().fit(x_train['종류'])
x_trainLabel = lbl.transform(x_train['종류'])
# Codes follow the sorted class order:
# 0: 대형 (large), 1: 소형 (small), 2: 준중형 (compact), 3: 중형 (mid-size)
x_trainLabel

array([2, 2, 1, 1, 0, 3, 3, 1, 3, 1, 2, 3, 2, 0, 1, 0, 0, 0, 3, 0, 0, 3,
       2, 0, 3, 3, 3, 1, 1, 2, 0, 0, 0, 0, 1, 0, 2, 0, 2, 1, 1, 1, 0, 0,
       0, 2, 1, 0, 2, 1, 3, 3, 0, 2, 2, 0, 3, 2, 0, 0, 2, 0, 1, 0, 0, 1,
       3, 1, 2, 3, 0])

In [7]:
# Class labels learned by the encoder; a label's position is its integer code.
lbl.classes_

array(['대형', '소형', '준중형', '중형'], dtype=object)

#### OneHot encoding

In [8]:
# OneHotEncoder expects 2-D input, so the single column is reshaped to (n_rows, 1).
kind_2d = x_train['종류'].to_numpy().reshape(-1, 1)
oneH = OneHotEncoder().fit(kind_2d)
# The result is a SciPy sparse matrix with one indicator column per category.
x_trianOne = oneH.transform(kind_2d)
x_trianOne

<71x4 sparse matrix of type '<class 'numpy.float64'>'
	with 71 stored elements in Compressed Sparse Row format>

In [9]:
# Convert the sparse one-hot matrix to a dense NumPy array so the 0/1 values are visible.
x_trianOne.toarray()

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],


In [10]:
# Categories found per input column; their order matches the one-hot output columns.
oneH.categories_

[array(['대형', '소형', '준중형', '중형'], dtype=object)]

#### pd.get_dummies

In [11]:
# One indicator column per distinct value of the single '종류' column.
pd.get_dummies( x_train['종류'] )

Unnamed: 0,대형,소형,준중형,중형
0,0,0,1,0
1,0,0,1,0
2,0,1,0,0
3,0,1,0,0
4,1,0,0,0
...,...,...,...,...
66,0,0,0,1
67,0,1,0,0
68,0,0,1,0
69,0,0,0,1


In [12]:
# Applied to the whole frame: the string columns are expanded into indicator
# columns, while the numeric columns are kept unchanged.
pd.get_dummies( x_train )

Unnamed: 0,년식,연비,마력,토크,하이브리드,배기량,중량,종류_대형,종류_소형,종류_준중형,종류_중형,연료_LPG,연료_가솔린,연료_디젤,변속기_수동,변속기_자동
0,2015,11.8,172,21.0,0,1999,1300,0,0,1,0,0,1,0,0,1
1,2015,12.3,204,27.0,0,1591,1300,0,0,1,0,0,1,0,0,1
2,2015,15.0,100,13.6,0,1368,1035,0,1,0,0,0,1,0,1,0
3,2014,14.0,140,17.0,0,1591,1090,0,1,0,0,0,1,0,0,1
4,2015,9.6,175,46.0,0,2497,1990,1,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2015,8.5,290,34.8,0,3342,1901,0,0,0,1,0,1,0,0,1
67,2012,13.3,108,13.9,0,1396,1040,0,1,0,0,0,1,0,0,1
68,2015,12.8,186,41.0,0,1995,1665,0,0,1,0,0,0,1,0,1
69,2015,17.7,156,19.3,1,1999,1585,0,0,0,1,0,1,0,0,1


In [13]:
# Encode only the listed columns; '종류' is left as its original string values.
pd.get_dummies( x_train, columns=['연료', '변속기'] )

Unnamed: 0,년식,종류,연비,마력,토크,하이브리드,배기량,중량,연료_LPG,연료_가솔린,연료_디젤,변속기_수동,변속기_자동
0,2015,준중형,11.8,172,21.0,0,1999,1300,0,1,0,0,1
1,2015,준중형,12.3,204,27.0,0,1591,1300,0,1,0,0,1
2,2015,소형,15.0,100,13.6,0,1368,1035,0,1,0,1,0
3,2014,소형,14.0,140,17.0,0,1591,1090,0,1,0,0,1
4,2015,대형,9.6,175,46.0,0,2497,1990,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2015,중형,8.5,290,34.8,0,3342,1901,0,1,0,0,1
67,2012,소형,13.3,108,13.9,0,1396,1040,0,1,0,0,1
68,2015,준중형,12.8,186,41.0,0,1995,1665,0,0,1,0,1
69,2015,중형,17.7,156,19.3,1,1999,1585,0,1,0,0,1


#### replace

In [14]:
# Hand-written ordinal mapping from each category to an integer code.
x_train['종류'].replace({'대형': 3, '중형': 4, '준중형': 5, '소형': 6})

0     5
1     5
2     6
3     6
4     3
     ..
66    4
67    6
68    5
69    4
70    3
Name: 종류, Length: 71, dtype: int64

#### make_column_transformer

In [15]:
# One-hot encode the three string columns in a single transformer.
# make_column_transformer defaults to remainder='drop', so every column not
# listed here (the numeric ones) is omitted from the output.
myt = make_column_transformer(
    (OneHotEncoder(), ['종류', '연료', '변속기']),
)
myt.fit(x_train)
res = myt.transform(x_train)
res

array([[0., 0., 1., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 0., 1., 0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [0., 0., 1., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [1., 0., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 1., 0., 0., 1.],
       [0.

#### Train

In [16]:
# Pipeline: one-hot encode the categorical columns (myt), standardise the
# encoded matrix, then fit a Ridge regressor.
# (A leftover throwaway pipeline and a bare `model.fit` attribute access that
# was never called used to precede this line; both were dead code.)
# NOTE(review): myt only one-hot encodes '종류', '연료' and '변속기', and
# make_column_transformer defaults to remainder='drop', so the numeric columns
# of x_train never reach the model — confirm that is intended, since it likely
# limits the achievable score.
model = make_pipeline( myt, StandardScaler(), Ridge( ) )
# Candidate regularisation strengths; 'ridge' is the step name that
# make_pipeline derives from the Ridge class.
param_value = { 'ridge__alpha': [0.001, 0.01, 0.1, 1, 2, 3] }
# No scoring/cv arguments are passed, so GridSearchCV uses its defaults.
gridS = GridSearchCV( model, param_grid=param_value )
gridS.fit(x_train, y_train)
# Best alpha and the mean cross-validated score it achieved.
print( gridS.best_params_ )
print( gridS.best_score_ )

{'ridge__alpha': 3}
0.043512662036927566


In [17]:
# Predict test-set prices; with the default refit=True, GridSearchCV.predict
# delegates to the best estimator refit on the full training data.
gridS.predict(x_test)

array([1869.6491851 , 1636.6664639 , 2714.51889147, 2643.12591126,
       2407.75612938,  916.15227143, 1461.91015705, 2643.12591126,
       2000.01710133, 2182.42434952, 3435.03308394, 3435.03308394,
        916.15227143, 2000.01710133, 3901.74704801,  378.04532715,
       1461.91015705, 1636.6664639 , 1636.6664639 , 2182.42434952,
       1098.55951962, 2182.42434952, 3901.74704801, 2720.5312938 ,
       2720.5312938 , 2720.5312938 , 3901.74704801, 2643.12591126,
       3181.23285554, 1636.6664639 , 2176.41194719])