In [1]:
%load_ext watermark
%watermark -u -d -p numpy,pandas,matplotlib,sklearn

last updated: 2020-04-01 

numpy 1.18.1
pandas 1.0.3
matplotlib 3.2.1
sklearn 0.22.2.post1


In [2]:
import numpy as np
import pandas as pd
from io import StringIO

csv_data = \
'''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [3]:
# 셀(cell)이 누락된 데이터를 가지고 있는지 확인하기
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [4]:
# 누락된 값이 있는 행 삭제
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [5]:
# 누락된 값이 있는 열 삭제
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [6]:
# 모든 열이 NaN일 때만 행을 삭제
# (여기서는 모든 값이 NaN인 행이 없기 때문에 전체 배열이 반환됩니다)
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [7]:
# 실수 값이 네 개보다 작은 행을 삭제
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [8]:
# 특정 열에 NaN이 있는 행만 삭제
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


### 4.1.3 누락된 값 대체
 가장 흔한 보간 기법 중 하나는 평균으로 대체하는 것

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer

In [10]:
help(SimpleImputer)
simr = SimpleImputer(missing_values=np.nan, strategy='mean')
simr = simr.fit(df.values)
imputed_data = simr.transform(df.values)
imputed_data

Help on class SimpleImputer in module sklearn.impute._base:

class SimpleImputer(_BaseImputer)
 |  SimpleImputer(missing_values=nan, strategy='mean', fill_value=None, verbose=0, copy=True, add_indicator=False)
 |  
 |  Imputation transformer for completing missing values.
 |  
 |  Read more in the :ref:`User Guide <impute>`.
 |  
 |  Parameters
 |  ----------
 |  missing_values : number, string, np.nan (default) or None
 |      The placeholder for the missing values. All occurrences of
 |      `missing_values` will be imputed.
 |  
 |  strategy : string, default='mean'
 |      The imputation strategy.
 |  
 |      - If "mean", then replace missing values using the mean along
 |        each column. Can only be used with numeric data.
 |      - If "median", then replace missing values using the median along
 |        each column. Can only be used with numeric data.
 |      - If "most_frequent", then replace missing using the most frequent
 |        value along each column. Can be used wi

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [11]:
ftr_simr = FunctionTransformer(lambda X: simr.fit_transform(X.T).T, validate=False)
imputed_data = ftr_simr.fit_transform(df.values)
imputed_data

array([[ 1.        ,  2.        ,  3.        ,  4.        ],
       [ 5.        ,  6.        ,  6.33333333,  8.        ],
       [10.        , 11.        , 12.        , 11.        ]])

## 4.2 범주형 데이터 다루기
### 4.2.1 순서가 있는 특성과 순서가 없는 특성
 - 범주형  : 순서가 있는 특성 (ex: 티셔츠 XL > L > M)
 - 비범주형: 순서가 없는 특성 (ex: 색상)

In [12]:
df = pd.DataFrame([
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1']])
df.columns = ['color', 'size', 'price', 'classlabel']
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


### 4.2.2 순서 특성 매핑
 학습 알고리즘이 순서 특성을 올바르게 인식하려면 범주형의 문자열 값을 정수로 바꿔야 함.

In [13]:
size_mapping = {
    'XL': 3,
    'L': 2,
    'M': 1
}

In [14]:
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [15]:
inv_size_mapping = {v: k for k, v in size_mapping.items()}

In [16]:
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

### 4.2.3 클래스 레이블 인코딩

In [17]:
class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classlabel']))}
class_mapping

{'class1': 0, 'class2': 1}

In [18]:
df['classlabel'] = df['classlabel'].map(class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [19]:
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)
y

array([0, 1, 0])

In [22]:
class_le.inverse_transform(y)

array(['class1', 'class2', 'class1'], dtype=object)

### 4.2.4 순서가 없는 특성에 원-핫 인코딩 적용

In [23]:
X = df[['color', 'size', 'price']].values
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [24]:
from sklearn.preprocessing import OneHotEncoder

In [25]:
oh_enc = OneHotEncoder(categories='auto')
col_trans = ColumnTransformer([('oh_enc', oh_enc, [0])], remainder='passthrough')
col_trans.fit_transform(X)

NameError: name 'ColumnTransformer' is not defined

In [26]:
pd.get_dummies(df[['price', 'color', 'size']])

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0


In [27]:
pd.get_dummies(df[['price', 'color', 'size']], drop_first=True)

Unnamed: 0,price,size,color_green,color_red
0,10.1,1,1,0
1,13.5,2,0,1
2,15.3,3,0,0


## 4.3 데이터셋을 훈련 세트와 데스트 세트로 나누기

In [28]:
df_wine = pd.read_csv('./data/wine.data', header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                   'Proline']
print('클래스 레이블', np.unique(df_wine['Class label']))
df_wine.head()

클래스 레이블 [1 2 3]


Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [34]:
# 데이터셋을 랜덤한 훈련 세트와 테스트 세트로 나누기
from sklearn.model_selection import train_test_split

X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    random_state=0, 
                                                    stratify=y)

In [38]:
print(df_wine.iloc[:, 1:].values)
print(df_wine.iloc[:, 0].values)

[[1.423e+01 1.710e+00 2.430e+00 ... 1.040e+00 3.920e+00 1.065e+03]
 [1.320e+01 1.780e+00 2.140e+00 ... 1.050e+00 3.400e+00 1.050e+03]
 [1.316e+01 2.360e+00 2.670e+00 ... 1.030e+00 3.170e+00 1.185e+03]
 ...
 [1.327e+01 4.280e+00 2.260e+00 ... 5.900e-01 1.560e+00 8.350e+02]
 [1.317e+01 2.590e+00 2.370e+00 ... 6.000e-01 1.620e+00 8.400e+02]
 [1.413e+01 4.100e+00 2.740e+00 ... 6.100e-01 1.600e+00 5.600e+02]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]


## 4.4 특성 스케일 맞추기

In [39]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)

In [40]:
X_test_norm

array([[ 0.69005848,  0.22924901,  0.64052288,  0.30645161,  0.55555556,
         0.69655172,  0.51687764,  0.52      ,  0.39873418,  0.40497336,
         0.69148936,  0.60805861,  0.78245364],
       [ 0.22222222,  0.14031621,  0.54248366,  0.40860215,  0.41975309,
         0.3137931 ,  0.29746835,  0.64      ,  0.19303797,  0.10746004,
         1.03191489,  0.35164835,  0.05492154],
       [ 0.87134503,  0.22332016,  0.49019608,  0.17204301,  0.32098765,
         0.52413793,  0.45991561,  0.34      ,  0.49367089,  0.31172291,
         0.57446809,  0.84615385,  0.72182596],
       [ 0.52339181,  0.2055336 ,  0.2875817 , -0.01075269,  0.37037037,
         0.57586207,  0.51054852,  0.26      ,  0.2721519 ,  0.23445826,
         0.60638298,  0.78021978,  0.55064194],
       [ 0.28070175,  0.06521739,  0.26143791,  0.38172043,  0.22222222,
         0.87586207,  0.71940928,  0.22      ,  0.48417722,  0.24511545,
         0.59574468,  0.54945055,  0.2724679 ],
       [ 0.30409357,  0.452569

In [41]:
from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

In [42]:
X_test_std

array([[ 8.94437367e-01, -3.88117877e-01,  1.10073064e+00,
        -8.12017114e-01,  1.13201117e+00,  1.09807851e+00,
         7.12041017e-01,  1.81013423e-01,  6.62804643e-02,
         5.12859235e-01,  7.96297849e-01,  4.48295020e-01,
         1.90593792e+00],
       [-1.04879931e+00, -7.72993966e-01,  5.41190056e-01,
        -2.40938809e-01,  3.49414498e-01, -7.07219221e-01,
        -3.08121293e-01,  6.76138376e-01, -1.03520519e+00,
        -9.06567274e-01,  2.24570604e+00, -5.61881713e-01,
        -1.22874035e+00],
       [ 1.64744158e+00, -4.13776283e-01,  2.42768413e-01,
        -1.56343594e+00, -2.19746720e-01,  2.84881333e-01,
         4.47191186e-01, -5.61674005e-01,  5.74658458e-01,
         6.79643589e-02,  2.98063782e-01,  1.38631627e+00,
         1.64471473e+00],
       [ 2.02159300e-01, -4.90751501e-01, -9.13615453e-01,
        -2.58536553e+00,  6.48338887e-02,  5.28840485e-01,
         6.82613258e-01, -8.91757306e-01, -6.11556861e-01,
        -3.00662824e-01,  4.33945800e

In [43]:
ex = np.array([0, 1, 2, 3, 4, 5])

print('표준화:', (ex - ex.mean()) / ex.std())

# 판다스는 기본적으로 ddof=1(샘플 표준 편차)을 사용합니다.
# 넘파이의 std 메서드와 사이킷런의 StandardScaler는 ddof=0(모집단 표준 편차)를 사용합니다.

print('정규화:', (ex - ex.min()) / (ex.max() - ex.min()))

표준화: [-1.46385011 -0.87831007 -0.29277002  0.29277002  0.87831007  1.46385011]
정규화: [0.  0.2 0.4 0.6 0.8 1. ]


## 4.5 유용한 특성 선택
### 4.5.3 L1 규제를 사용한 희소성

In [49]:
from sklearn.linear_model import LogisticRegression

In [50]:
help(LogisticRegression)
LogisticRegression(solver='liblinear', penalty='l1')

Help on class LogisticRegression in module sklearn.linear_model._logistic:

class LogisticRegression(sklearn.base.BaseEstimator, sklearn.linear_model._base.LinearClassifierMixin, sklearn.linear_model._base.SparseCoefMixin)
 |  LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
 |  
 |  Logistic Regression (aka logit, MaxEnt) classifier.
 |  
 |  In the multiclass case, the training algorithm uses the one-vs-rest (OvR)
 |  scheme if the 'multi_class' option is set to 'ovr', and uses the
 |  cross-entropy loss if the 'multi_class' option is set to 'multinomial'.
 |  (Currently the 'multinomial' option is supported only by the 'lbfgs',
 |  'sag', 'saga' and 'newton-cg' solvers.)
 |  
 |  This class implements regularized logistic regression using the
 |  'liblinear' library, 'newton-cg', 'sag'

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [51]:
# 표준화 처리된 Wine 데이터에 L1 규제가 있는 로지스틱 회귀를 적용
lr = LogisticRegression(solver='liblinear', multi_class='auto', 
                        penalty='l1', C=1.0, random_state=42)

# C=1.0이 기본값입니다. 작을 수록 정규화가 더 강력합니다.
lr.fit(X_train_std, y_train)
print('훈련 정확도:', lr.score(X_train_std, y_train))
print('테스트 정확도:', lr.score(X_test_std, y_test))

훈련 정확도: 1.0
테스트 정확도: 1.0
