In [1]:
import numpy as np
import pandas as pd
from io import StringIO 
import warnings
warnings.filterwarnings('ignore')

## Creating data

In [2]:
csv_data = '''
A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
9.0,10.0,11.0,
13.0,14.0,15.0,16.0
'''
#missing value가 들어있는 데이터셋

In [3]:
df = pd.read_csv(StringIO(csv_data))

In [4]:
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,9.0,10.0,11.0,
3,13.0,14.0,15.0,16.0


In [5]:
type(df)

pandas.core.frame.DataFrame

isnull method returns a dataframe with boolean values that indicate whether a cell contians a nemuric value or if data is missing.

In [5]:
#missing value찾아주는 함수(true로 표시)
df.isnull() 

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True
3,False,False,False,False


The number of missing values per column

In [6]:
df.isnull().sum(axis=0) #열별로 확인

A    0
B    0
C    1
D    1
dtype: int64

The number of missing values per sample

In [8]:
df.isnull().sum(axis=1) #행별로 확인 

0    0
1    1
2    1
3    0
dtype: int64

## 1. Eliminating samples or features with missing values

Remove missing samples

In [9]:
df.dropna(axis=0) #행을 삭제

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
3,13.0,14.0,15.0,16.0


Remove missing columns

In [10]:
df.dropna(axis=1) #열을 삭제

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,9.0,10.0
3,13.0,14.0


Remove rows where all columns are NaN

In [11]:
df.dropna(how='all') #모든 column이 missing value인 것만 삭제 

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,9.0,10.0,11.0,
3,13.0,14.0,15.0,16.0


Remove rows that have less than 4 real avlues

In [12]:
df.dropna(thresh=4) #최소 real value 개수

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
3,13.0,14.0,15.0,16.0


Remove rows that have less than 3 real avlues

In [13]:
df.dropna(thresh=3)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,9.0,10.0,11.0,
3,13.0,14.0,15.0,16.0


Only remove rows where NaN appear in specific columns

In [14]:
df.dropna(subset=['C']) #C컬럼의 missing value만 삭제

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,9.0,10.0,11.0,
3,13.0,14.0,15.0,16.0


## 2. Imputing missing values

In [7]:
from sklearn.impute import SimpleImputer

In [8]:
imr = SimpleImputer(missing_values=np.nan, strategy='mean')

In [10]:
imr = imr.fit(df.values)

In [12]:
imputed_data = imr.transform(df.values)

In [13]:
imputed_data

array([[ 1.        ,  2.        ,  3.        ,  4.        ],
       [ 5.        ,  6.        ,  9.66666667,  8.        ],
       [ 9.        , 10.        , 11.        ,  9.33333333],
       [13.        , 14.        , 15.        , 16.        ]])

In [14]:
df.mean(skipna=True) #missing value 를 제외하고 mean 구하기

A    7.000000
B    8.000000
C    9.666667
D    9.333333
dtype: float64

## Creating data

In [15]:
df = pd.DataFrame([
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1']
])

In [16]:
df.columns =['color', 'size', 'price', 'classlabel']

In [17]:
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


## 3. Mappling ordinal features

In [18]:
#카테고리 값들 숫자에 mapping-> dictionary 이용하기
size_mapping = {
    'XL': 3,
    'L': 2,
    'M': 1
}

In [19]:
df['size'] = df['size'].map(size_mapping)

In [20]:
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [21]:
#숫자를 다시 카테고리 값으로 돌리고 싶으면,
inv_size_mapping = {v: k for k, v in size_mapping.items()}

In [22]:
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

## 4. Encoding class labels

In [29]:
class_mapping = {label:idx for idx, label in enumerate(np.unique(df['classlabel']))}

In [30]:
class_mapping

{'class1': 0, 'class2': 1}

In [31]:
df['classlabel'] = df['classlabel'].map(class_mapping)

In [32]:
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [33]:
inv_class_mapping = {v: k for k, v in class_mapping.items()}

In [34]:
df['classlabel'] = df['classlabel'].map(inv_class_mapping)

In [35]:
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


### scikit-learn LabelEncoder

In [36]:
from sklearn.preprocessing import LabelEncoder

In [37]:
class_le = LabelEncoder()

fit_transform method is a shortcut for calling fit and transform separately


In [38]:
y = class_le.fit_transform(df['classlabel'].values)

In [39]:
y

array([0, 1, 0])

## 5. Performing one-hot encoding on nominal features

###  LabelEncoder (not recommended)

In [40]:
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [24]:
X = df[['color', 'size', 'price']].values
X

array([['green', 1, 10.1],
       ['red', 2, 13.5],
       ['blue', 3, 15.3]], dtype=object)

In [42]:
color_le= LabelEncoder()

In [43]:
X[:, 0] = color_le.fit_transform(X[:, 0])

In [44]:
X #숫자에 따라 대소관계가,,, 

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [45]:
color_le.classes_

array(['blue', 'green', 'red'], dtype=object)

### Using scikit-learn OneHotEncoder

In [28]:
from sklearn.preprocessing import OneHotEncoder

In [29]:
X = df[['color', 'size', 'price']].values

In [30]:
color_ohe = OneHotEncoder()

Convert sparse matrix representation into a regular(dense) NumPy array

In [31]:
#지금 입력값이 vector이기 때문에 matrix 로 transform해주기 위해,
#reshape가 꼭 필요함 
color_ohe.fit_transform(X[:,0].reshape(-1, 1)).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [32]:
color_ohe.categories_

[array(['blue', 'green', 'red'], dtype=object)]

In [33]:
color_ohe.get_feature_names(['color'])

array(['color_blue', 'color_green', 'color_red'], dtype=object)

### Using scikit-learn ColumnTransformer to selectively transform columns

In [34]:
#column별로 어떤 transform할건지 한꺼번에 할 수 있음!!
from sklearn.compose import ColumnTransformer

In [35]:
X = df[['color', 'size', 'price']].values

Leave the other two columns untouched via the 'passthrough' argument.

In [54]:
c_transf = ColumnTransformer([
    ('onehot', OneHotEncoder(), [0]),
    ('nothing', 'passthrough', [1, 2])
])
#[0]은 0번째 column이라는 뜻

In [55]:
c_transf.fit_transform(X).astype(float)

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

### Using pandas get_dummies

get_dummies method creates dummy features via one-hot encoding, it only converts string columns.

In [36]:
#scikit-learn대신에 pandas 사용해서 one-hot encoding 가능!!
pd.get_dummies(df[['color', 'size', 'price']])

Unnamed: 0,size,price,color_blue,color_green,color_red
0,1,10.1,0,1,0
1,2,13.5,0,0,1
2,3,15.3,1,0,0


To address multicollinearity, use drop_first

In [37]:
pd.get_dummies(df[['color', 'size', 'price']], drop_first=True)

Unnamed: 0,size,price,color_green,color_red
0,1,10.1,1,0
1,2,13.5,0,1
2,3,15.3,0,0


## 6. Partitioning a dataset into separate training and test sets

In [38]:
from sklearn.model_selection import train_test_split
X, y = np.arange(10).reshape((5, 2)), range(5)
X, y

(array([[0, 1],
        [2, 3],
        [4, 5],
        [6, 7],
        [8, 9]]),
 range(0, 5))

In [42]:
#순서 제대로 써주기 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train

array([[4, 5],
       [0, 1],
       [6, 7]])

In [43]:
X_test

array([[2, 3],
       [8, 9]])

In [44]:
y_train

[2, 0, 3]

In [45]:
y_test

[1, 4]

## 7. Bringing features onto the samle scale

### Min-max scaling

In [46]:
ex = np.array([0, 1, 2, 3, 4, 5])

In [47]:
(ex-ex.min())/(ex.max()-ex.min())

array([0. , 0.2, 0.4, 0.6, 0.8, 1. ])

### Standardization

In [48]:
(ex-ex.mean())/ex.std()

array([-1.46385011, -0.87831007, -0.29277002,  0.29277002,  0.87831007,
        1.46385011])

### scikit-learn MinMaxScaler

In [66]:
from sklearn.preprocessing import MinMaxScaler

In [67]:
data = [[-1, 2],  
        [-0.5, 6], 
        [0, 10], 
        [1, 18]]

In [68]:
mms = MinMaxScaler()

In [69]:
mms.fit_transform(data)

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

### scikit-learn StandardScaler

In [70]:
from sklearn.preprocessing import StandardScaler

In [71]:
stds = StandardScaler()

In [72]:
stds.fit_transform(data)

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

## 8. Loan example

In [50]:
loan = pd.read_csv('/Users/yujinkim/Desktop/3학년/1학기/응용머신러닝/workspace/loan.csv')

In [74]:
loan.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Target
0,LP001032,Male,No,0,Graduate,No,4950,0.0,125,360,1,Urban,Y
1,LP001824,Male,Yes,1,Graduate,No,2882,1843.0,123,480,1,Semiurban,Y
2,LP002928,Male,Yes,0,Graduate,No,3000,3416.0,56,180,1,Semiurban,Y
3,LP001814,Male,Yes,2,Graduate,No,9703,0.0,112,360,1,Urban,Y
4,LP002244,Male,Yes,0,Graduate,No,2333,2417.0,136,360,1,Urban,Y


In [51]:
len(loan), loan.dtypes

(480,
 Loan_ID               object
 Gender                object
 Married               object
 Dependents            object
 Education             object
 Self_Employed         object
 ApplicantIncome        int64
 CoapplicantIncome    float64
 LoanAmount             int64
 Loan_Amount_Term       int64
 Credit_History         int64
 Property_Area         object
 Target                object
 dtype: object)

In [76]:
for c in loan.columns:
    print(loan[c].value_counts())

LP001404    1
LP002544    1
LP002755    1
LP002250    1
LP001900    1
           ..
LP002287    1
LP002776    1
LP002842    1
LP001164    1
LP001011    1
Name: Loan_ID, Length: 480, dtype: int64
Male      394
Female     86
Name: Gender, dtype: int64
Yes    311
No     169
Name: Married, dtype: int64
0     274
2      85
1      80
3+     41
Name: Dependents, dtype: int64
Graduate        383
Not Graduate     97
Name: Education, dtype: int64
No     414
Yes     66
Name: Self_Employed, dtype: int64
2500    7
4583    6
2600    5
6000    4
6250    4
       ..
5821    1
2750    1
3775    1
2755    1
3073    1
Name: ApplicantIncome, Length: 405, dtype: int64
0.0       216
1666.0      5
2500.0      4
5625.0      3
2083.0      3
         ... 
1881.0      1
1255.0      1
1644.0      1
1260.0      1
1483.0      1
Name: CoapplicantIncome, Length: 232, dtype: int64
110    13
120    13
100    13
187    12
128     9
       ..
216     1
218     1
228     1
230     1
9       1
Name: LoanAmount, Length: 186

In [52]:
X = loan.iloc[:, 1:-1]
y = loan.iloc[:, -1]

In [53]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,4950,0.0,125,360,1,Urban
1,Male,Yes,1,Graduate,No,2882,1843.0,123,480,1,Semiurban
2,Male,Yes,0,Graduate,No,3000,3416.0,56,180,1,Semiurban
3,Male,Yes,2,Graduate,No,9703,0.0,112,360,1,Urban
4,Male,Yes,0,Graduate,No,2333,2417.0,136,360,1,Urban


In [54]:
y.head()

0    Y
1    Y
2    Y
3    Y
4    Y
Name: Target, dtype: object

kNN without preprocessing

In [55]:
from sklearn.neighbors import KNeighborsClassifier

In [58]:
knn = KNeighborsClassifier()

In [57]:
#knn.fit(X, y)

### 1. Mapping ordinal features

In [83]:
dependents_mapping = {
    '3+': 4,
    '2': 3,
    '1': 2,
    '0':1
}

In [84]:
X['Dependents'] = X['Dependents'].map(dependents_mapping)

In [85]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,1,Graduate,No,4950,0.0,125,360,1,Urban
1,Male,Yes,2,Graduate,No,2882,1843.0,123,480,1,Semiurban
2,Male,Yes,1,Graduate,No,3000,3416.0,56,180,1,Semiurban
3,Male,Yes,3,Graduate,No,9703,0.0,112,360,1,Urban
4,Male,Yes,1,Graduate,No,2333,2417.0,136,360,1,Urban


### 2. Encoding class labels

In [86]:
class_le = LabelEncoder()

In [87]:
class_le.fit(y.values)

LabelEncoder()

In [88]:
y = class_le.transform(y.values)

In [89]:
y

array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,

### 3. Performing one-hot encoding on nominal features

In [90]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,1,Graduate,No,4950,0.0,125,360,1,Urban
1,Male,Yes,2,Graduate,No,2882,1843.0,123,480,1,Semiurban
2,Male,Yes,1,Graduate,No,3000,3416.0,56,180,1,Semiurban
3,Male,Yes,3,Graduate,No,9703,0.0,112,360,1,Urban
4,Male,Yes,1,Graduate,No,2333,2417.0,136,360,1,Urban


In [61]:
nominal_features = ['Gender', 'Married', 'Education','Self_Employed', 'Property_Area']

In [62]:
X_dummies = pd.get_dummies(X[nominal_features], drop_first=True)

In [63]:
X_dummies.head()

Unnamed: 0,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,1,0,0,0,0,1
1,1,1,0,0,1,0
2,1,1,0,0,1,0
3,1,1,0,0,0,1
4,1,1,0,0,0,1


In [94]:
X = pd.concat([X, X_dummies],axis=1)

In [95]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,Male,No,1,Graduate,No,4950,0.0,125,360,1,Urban,1,0,0,0,0,1
1,Male,Yes,2,Graduate,No,2882,1843.0,123,480,1,Semiurban,1,1,0,0,1,0
2,Male,Yes,1,Graduate,No,3000,3416.0,56,180,1,Semiurban,1,1,0,0,1,0
3,Male,Yes,3,Graduate,No,9703,0.0,112,360,1,Urban,1,1,0,0,0,1
4,Male,Yes,1,Graduate,No,2333,2417.0,136,360,1,Urban,1,1,0,0,0,1


In [96]:
X = X.drop(columns = nominal_features)

In [97]:
X.head()

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,1,4950,0.0,125,360,1,1,0,0,0,0,1
1,2,2882,1843.0,123,480,1,1,1,0,0,1,0
2,1,3000,3416.0,56,180,1,1,1,0,0,1,0
3,3,9703,0.0,112,360,1,1,1,0,0,0,1
4,1,2333,2417.0,136,360,1,1,1,0,0,0,1


### 4. Imputing missing values

In [64]:
X.isnull().sum(axis=0)

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

### 5. Bringing features onto the samle scale

In [65]:
sc = StandardScaler()

NameError: name 'StandardScaler' is not defined

In [100]:
sc.fit(X)

StandardScaler()

In [101]:
X_sc = sc.transform(X)

### 6. Partitioning a dataset into training and test sets

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X_sc, y, 
                                                    test_size=0.2, 
                                                    random_state=0, 
                                                    stratify=y)

In [103]:
len(X_train), len(X_test)

(384, 96)

### 7. Model training and prediction

In [104]:
from sklearn.metrics import accuracy_score

In [105]:
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [106]:
accuracy_score(y_test, knn.predict(X_test))

0.8229166666666666