In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.datasets import load_boston, load_iris

from sklearn.linear_model import Ridge,Lasso,ElasticNet, LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import mglearn
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.family']='Malgun Gothic'
matplotlib.rcParams['axes.unicode_minus'] = False

import warnings
warnings.simplefilter('ignore')

In [2]:
train_df = pd.read_excel('../data4/hyundaiCar.xlsx',sheet_name = 'train')
test_df = pd.read_excel('../data4/hyundaiCar.xlsx',sheet_name = 'test')

In [4]:
# 가격: label
# 나머지 : feature
x_train = train_df.iloc[:,1:]
y_train = train_df['가격']

In [5]:
x_train

Unnamed: 0,년식,종류,연비,마력,토크,연료,하이브리드,배기량,중량,변속기
0,2015,준중형,11.8,172,21.0,가솔린,0,1999,1300,자동
1,2015,준중형,12.3,204,27.0,가솔린,0,1591,1300,자동
2,2015,소형,15.0,100,13.6,가솔린,0,1368,1035,수동
3,2014,소형,14.0,140,17.0,가솔린,0,1591,1090,자동
4,2015,대형,9.6,175,46.0,디젤,0,2497,1990,자동
...,...,...,...,...,...,...,...,...,...,...
66,2015,중형,8.5,290,34.8,가솔린,0,3342,1901,자동
67,2012,소형,13.3,108,13.9,가솔린,0,1396,1040,자동
68,2015,준중형,12.8,186,41.0,디젤,0,1995,1665,자동
69,2015,중형,17.7,156,19.3,가솔린,1,1999,1585,자동


In [6]:
x_test = test_df.iloc[:,1:]
y_test = test_df['가격']

In [9]:
x_train.shape

(71, 10)

In [10]:
x_test.shape

(31, 10)

## 문자열 encoding
- Label Encoding
- OneHot Encoding

In [13]:
x_train[:5]

Unnamed: 0,년식,종류,연비,마력,토크,연료,하이브리드,배기량,중량,변속기
0,2015,준중형,11.8,172,21.0,가솔린,0,1999,1300,자동
1,2015,준중형,12.3,204,27.0,가솔린,0,1591,1300,자동
2,2015,소형,15.0,100,13.6,가솔린,0,1368,1035,수동
3,2014,소형,14.0,140,17.0,가솔린,0,1591,1090,자동
4,2015,대형,9.6,175,46.0,디젤,0,2497,1990,자동


In [12]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   년식      71 non-null     int64  
 1   종류      71 non-null     object 
 2   연비      71 non-null     float64
 3   마력      71 non-null     int64  
 4   토크      71 non-null     float64
 5   연료      71 non-null     object 
 6   하이브리드   71 non-null     int64  
 7   배기량     71 non-null     int64  
 8   중량      71 non-null     int64  
 9   변속기     71 non-null     object 
dtypes: float64(2), int64(5), object(3)
memory usage: 5.7+ KB


In [22]:
x_train['하이브리드'].unique()

array([0, 1], dtype=int64)

## Label Encoder

In [18]:
# 숫자로 바뀌어진 ndarray를 반환한다
lbl = LabelEncoder()
x_trainLabel = lbl.fit_transform( x_train['종류'] )
x_trainLabel

array([2, 2, 1, 1, 0, 3, 3, 1, 3, 1, 2, 3, 2, 0, 1, 0, 0, 0, 3, 0, 0, 3,
       2, 0, 3, 3, 3, 1, 1, 2, 0, 0, 0, 0, 1, 0, 2, 0, 2, 1, 1, 1, 0, 0,
       0, 2, 1, 0, 2, 1, 3, 3, 0, 2, 2, 0, 3, 2, 0, 0, 2, 0, 1, 0, 0, 1,
       3, 1, 2, 3, 0])

In [23]:
# 문자열을 종류를 보여준다
lbl.classes_

array(['대형', '소형', '준중형', '중형'], dtype=object)

In [24]:
lbl.transform(['대형', '소형', '준중형', '중형'])

array([0, 1, 2, 3], dtype=int64)

## OneHot Encoder (0 과 1로)

In [31]:
oneH = OneHotEncoder()
x_trainOne = oneH.fit_transform( x_train['종류'].values.reshape(-1,1) )
x_trainOne.toarray()

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],


In [63]:
x_train['종류'].values.reshape(-1,1)

array([['준중형'],
       ['준중형'],
       ['소형'],
       ['소형'],
       ['대형'],
       ['중형'],
       ['중형'],
       ['소형'],
       ['중형'],
       ['소형'],
       ['준중형'],
       ['중형'],
       ['준중형'],
       ['대형'],
       ['소형'],
       ['대형'],
       ['대형'],
       ['대형'],
       ['중형'],
       ['대형'],
       ['대형'],
       ['중형'],
       ['준중형'],
       ['대형'],
       ['중형'],
       ['중형'],
       ['중형'],
       ['소형'],
       ['소형'],
       ['준중형'],
       ['대형'],
       ['대형'],
       ['대형'],
       ['대형'],
       ['소형'],
       ['대형'],
       ['준중형'],
       ['대형'],
       ['준중형'],
       ['소형'],
       ['소형'],
       ['소형'],
       ['대형'],
       ['대형'],
       ['대형'],
       ['준중형'],
       ['소형'],
       ['대형'],
       ['준중형'],
       ['소형'],
       ['중형'],
       ['중형'],
       ['대형'],
       ['준중형'],
       ['준중형'],
       ['대형'],
       ['중형'],
       ['준중형'],
       ['대형'],
       ['대형'],
       ['준중형'],
       ['대형'],
       ['소형'],
       ['대형'],
       ['대형'],
       ['소형

In [34]:
oneH.categories_

[array(['대형', '소형', '준중형', '중형'], dtype=object)]

##  판다스의 dummy함수

In [37]:
pd.get_dummies(x_train['년식'])

Unnamed: 0,2011,2012,2013,2014,2015
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,1,0
4,0,0,0,0,1
...,...,...,...,...,...
66,0,0,0,0,1
67,0,1,0,0,0
68,0,0,0,0,1
69,0,0,0,0,1


In [38]:
pd.get_dummies(x_train)

Unnamed: 0,년식,연비,마력,토크,하이브리드,배기량,중량,종류_대형,종류_소형,종류_준중형,종류_중형,연료_LPG,연료_가솔린,연료_디젤,변속기_수동,변속기_자동
0,2015,11.8,172,21.0,0,1999,1300,0,0,1,0,0,1,0,0,1
1,2015,12.3,204,27.0,0,1591,1300,0,0,1,0,0,1,0,0,1
2,2015,15.0,100,13.6,0,1368,1035,0,1,0,0,0,1,0,1,0
3,2014,14.0,140,17.0,0,1591,1090,0,1,0,0,0,1,0,0,1
4,2015,9.6,175,46.0,0,2497,1990,1,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2015,8.5,290,34.8,0,3342,1901,0,0,0,1,0,1,0,0,1
67,2012,13.3,108,13.9,0,1396,1040,0,1,0,0,0,1,0,0,1
68,2015,12.8,186,41.0,0,1995,1665,0,0,1,0,0,0,1,0,1
69,2015,17.7,156,19.3,1,1999,1585,0,0,0,1,0,1,0,0,1


In [39]:
# 원하는 컬럼만 불러와서 인코딩 할수있음
pd.get_dummies( x_train, columns = ['연료','변속기'])

Unnamed: 0,년식,종류,연비,마력,토크,하이브리드,배기량,중량,연료_LPG,연료_가솔린,연료_디젤,변속기_수동,변속기_자동
0,2015,준중형,11.8,172,21.0,0,1999,1300,0,1,0,0,1
1,2015,준중형,12.3,204,27.0,0,1591,1300,0,1,0,0,1
2,2015,소형,15.0,100,13.6,0,1368,1035,0,1,0,1,0
3,2014,소형,14.0,140,17.0,0,1591,1090,0,1,0,0,1
4,2015,대형,9.6,175,46.0,0,2497,1990,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2015,중형,8.5,290,34.8,0,3342,1901,0,1,0,0,1
67,2012,소형,13.3,108,13.9,0,1396,1040,0,1,0,0,1
68,2015,준중형,12.8,186,41.0,0,1995,1665,0,0,1,0,1
69,2015,중형,17.7,156,19.3,1,1999,1585,0,1,0,0,1


## replace

In [40]:
x_train['종류'].replace(['대형','중형','준중형','소형'],
                      [0,1,2,3])

0     2
1     2
2     3
3     3
4     0
     ..
66    1
67    3
68    2
69    1
70    0
Name: 종류, Length: 71, dtype: int64

# make_column_transformer

In [7]:
from sklearn.compose import make_column_transformer

In [8]:
# 원핫인코딩 여러개 한걸 한번에 묶어두는 느낌
#  (transformer, column(s))
myt = make_column_transformer((OneHotEncoder(),['종류','연료','변속기']))
result = myt.fit_transform( x_train )
result

array([[0., 0., 1., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 0., 1., 0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [0., 0., 1., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [1., 0., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 1., 0., 0., 1.],
       [0.

### 학습

In [47]:
model = make_pipeline( myt, Ridge(alpha = 1) )
model.fit(x_train, y_train)
print(model.score(x_test,y_test))

0.2798606477101978


In [56]:
model = make_pipeline(myt,Ridge() )
# 파이프라인에서 클래스가 2개이상일때
# 클래스이름을 소문자로 쓰고 어느 클래스의 파라미터인지 입력해줘야한다
param_value = {'ridge__alpha':[0.001,0.1,0.01,1,2,3]}
gridS = GridSearchCV( model, param_grid=param_value,scoring = 'r2' )
gridS.fit(x_train, y_train)
print( gridS.best_params_ )
print( gridS.best_score_ )

{'ridge__alpha': 3}
0.10261383885816258
