# encoding categorical variables

In [2]:
# label encoding
from sklearn.preprocessing import LabelEncoder

items = ['사과', '바나나', '조개']

encoder = LabelEncoder()
encoder.fit(items)
labels = encoder.transform(items)
print(labels)
# decoding
print(encoder.inverse_transform(labels))

[1 0 2]
['사과' '바나나' '조개']


In [4]:
# one-hot encoding은 sklearn이용하려면 labelencoder로 숫자로 변경한 후
# one-hot encodingh을 해주어야 가능
from sklearn.preprocessing import OneHotEncoder
import numpy as np

items = ['사과', '바나나', '배']

encoder = LabelEncoder()
labels = encoder.fit_transform(items)

# 2차원 데이터로 변환해주어야 함
labels = labels.reshape(-1,1)

one_encoder = OneHotEncoder()
one_labels = one_encoder.fit_transform(labels)

print(one_labels.toarray())

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]]


# Feature scaling

In [9]:
# feature scaling
# Standard scaler 는 평균을 0 분산이 1로 만들어줌
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
import pandas as pd

iris = load_iris()
iris_x = iris.data
iris_df = pd.DataFrame(data=iris_x, columns=iris.feature_names)

scaler = StandardScaler()
iris_scaled = scaler.fit_transform(iris_df)
iris_scaled_df = pd.DataFrame(data=iris_scaled,
                             columns=iris.feature_names)
iris_scaled_df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444


In [10]:
# MinMaxScaler는 최소값을 0 최대값을 1로 하고 사이의 값으로 scaling
# 원본 feature값에 음수가 있으면 최소값은 -1로 설정함
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
iris_scaled = scaler.fit_transform(iris_x)
iris_scaled_df = pd.DataFrame(data=iris_scaled,
                             columns=iris.feature_names)
iris_scaled_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.5,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667


# Titanic Example

In [13]:
import os
os.chdir('/Users/younghun/Desktop/inflearn강의자료/머신러닝강의/PerfectGuid수정ver01/1장/titanic/')

In [16]:
import numpy as np
import pandas as pd

df = pd.read_csv('train.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [20]:
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Cabin'].fillna('N', inplace=True)
df['Cabin'] = df['Cabin'].str[:1]
df['Embarked'].fillna('N', inplace=True)

In [21]:
# encoding하는 함수 정의해주기
from sklearn import preprocessing

def encode_features(row):
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = preprocessing.LabelEncoder()
        row[feature] = le.fit_transform(row[feature])
    
    return row

encoded_df = encode_features(df)
encoded_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,7,3
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,2,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,7,3
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,2,3
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,7,3


In [30]:
cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Cabin','Embarked']
data_x = encoded_df[cols]
data_y = encoded_df['Survived']

In [31]:
data_x.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked'], dtype='object')

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

train_x, test_x, train_y, test_y = train_test_split(data_x,
                                                   data_y,
                                                   test_size=0.2,
                                                   random_state=42)

models = [DecisionTreeClassifier, RandomForestClassifier,
         LogisticRegression]

for m in models:
    model = m()
    model = model.fit(train_x ,train_y)
    pred = model.predict(test_x)
    acc = accuracy_score(test_y, pred)
    
    print(f" 모델명 {m} - 정확도 : {acc}")

 모델명 <class 'sklearn.tree._classes.DecisionTreeClassifier'> - 정확도 : 0.7821229050279329
 모델명 <class 'sklearn.ensemble._forest.RandomForestClassifier'> - 정확도 : 0.7988826815642458
 모델명 <class 'sklearn.linear_model._logistic.LogisticRegression'> - 정확도 : 0.8212290502793296


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
