In [1]:
import pandas as pd

# Read the iris sample that contains missing values and name the row index
# 'record' so that it appears as the index header when the frame is printed.
df = pd.read_csv('./data/iris_missing_values.csv').rename_axis('record')
print(df.head(5))

        sepal length in cm  sepal width in cm  petal length in cm  \
record                                                              
0                      NaN                3.5                 1.4   
1                      4.9                3.0                 1.4   
2                      NaN                3.2                 1.3   
3                      4.6                3.1                 1.5   
4                      5.0                3.6                 1.4   

        petal width in cm species  
record                             
0                     0.2  setosa  
1                     0.2  setosa  
2                     0.2  setosa  
3                     0.2  setosa  
4                     0.2  setosa  


In [2]:
# Boolean mask: True for every record whose 'sepal length in cm' is NaN.
# Kept in a variable so the two checks below reuse it instead of recomputing
# the mask (a bare expression in the middle of a cell is evaluated and thrown
# away, so it neither displays nor stores anything).
sepal_length_missing = df['sepal length in cm'].isnull()

# check whether the column has any missing values
print(sepal_length_missing.any())

# count the missing values in the column (True counts as 1)
print(sepal_length_missing.sum())

True
6


In [3]:
# Replace every NaN in the sepal-length column with the placeholder string
# 'example'. The result is a Series (a single column), not a DataFrame; the
# printed output shows its dtype as object once strings are mixed in.
df_example = df.loc[:, 'sepal length in cm'].fillna(value='example')
print(df_example.head(5))

record
0    example
1        4.9
2    example
3        4.6
4          5
Name: sepal length in cm, dtype: object


In [4]:
# Keep only the rows (records) that have no missing value in any column.
df_dropped = df.dropna(axis='index', how='any')
print(df_dropped.head(5))

        sepal length in cm  sepal width in cm  petal length in cm  \
record                                                              
1                      4.9                3.0                 1.4   
3                      4.6                3.1                 1.5   
4                      5.0                3.6                 1.4   
6                      4.6                3.4                 1.4   
7                      5.0                3.4                 1.5   

        petal width in cm species  
record                             
1                     0.2  setosa  
3                     0.2  setosa  
4                     0.2  setosa  
6                     0.3  setosa  
7                     0.2  setosa  


In [5]:
# Keep only the columns that have no missing value in any row.
# NOTE: this rebinds df_dropped, replacing the row-wise result assigned in
# the previous cell.
df_dropped = df.dropna(axis='columns', how='any')
print(df_dropped.head(5))

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


## 결측값 대체하기

In [6]:
# Import the imputer class from scikit-learn (and NumPy for the NaN marker).
import numpy as np
from sklearn.impute import SimpleImputer

# Instantiate an imputer that treats NaN as "missing" and uses the 'mean'
# strategy to fill it.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# Columns to impute on (the four numeric measurement columns).
cols = [
    'sepal length in cm',
    'sepal width in cm',
    'petal length in cm',
    'petal width in cm',
]


In [9]:
# Fit the imputer on the numeric columns and transform them, filling the
# missing values.
out_imp = imputer.fit_transform(df[cols])

# fit_transform hands back a bare NumPy array, so rebuild the frame on df's
# own index. Without index=df.index the new frame gets a fresh 0..n-1 index:
# the 'record' index name is lost, and the concat below -- which aligns rows
# by index label -- would pair the wrong rows if df's index were ever
# anything other than 0..n-1.
df_new = pd.DataFrame(data=out_imp, columns=cols, index=df.index)

# Re-attach the non-numeric 'species' column next to the imputed features.
df_new = pd.concat([df_new, df[['species']]], axis=1)
print(df_new.head())

   sepal length in cm  sepal width in cm  petal length in cm  \
0            5.870139                3.5                 1.4   
1            4.900000                3.0                 1.4   
2            5.870139                3.2                 1.3   
3            4.600000                3.1                 1.5   
4            5.000000                3.6                 1.4   

   petal width in cm species  
0                0.2  setosa  
1                0.2  setosa  
2                0.2  setosa  
3                0.2  setosa  
4                0.2  setosa  


## MinMaxScaler()

In [10]:
# Load the min-max scaler class.
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the numeric columns, then normalize them; out_scaled is a
# NumPy array.
# NOTE(review): the input is df[cols], not the imputed df_new, so the missing
# values are still NaN in the printed result -- confirm this is intended.
scaler = MinMaxScaler().fit(df[cols])
out_scaled = scaler.transform(df[cols])
print(out_scaled[:10])

[[       nan 0.68181818 0.06779661 0.04166667]
 [0.16666667 0.45454545 0.06779661 0.04166667]
 [       nan 0.54545455 0.05084746 0.04166667]
 [0.08333333 0.5        0.08474576 0.04166667]
 [0.19444444 0.72727273 0.06779661 0.04166667]
 [       nan 0.86363636 0.11864407 0.125     ]
 [0.08333333 0.63636364 0.06779661 0.08333333]
 [0.19444444 0.63636364 0.08474576 0.04166667]
 [0.02777778 0.40909091 0.06779661 0.04166667]
 [       nan        nan        nan        nan]]


## StandardScaler()

In [11]:
# Load the standard scaler class.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the numeric columns, then standardize them; out_scaled is
# a NumPy array. This rebinds both `scaler` and `out_scaled` from the
# MinMaxScaler cell.
# NOTE(review): the input is df[cols], not the imputed df_new, so the missing
# values are still NaN in the printed result -- confirm this is intended.
scaler = StandardScaler().fit(df[cols])
out_scaled = scaler.transform(df[cols])
print(out_scaled[:10])

[[        nan  1.0873808  -1.3749907  -1.34813339]
 [-1.17061344 -0.10017475 -1.3749907  -1.34813339]
 [        nan  0.37484747 -1.4322009  -1.34813339]
 [-1.53260701  0.13733636 -1.3177805  -1.34813339]
 [-1.04994892  1.32489192 -1.3749907  -1.34813339]
 [        nan  2.03742525 -1.2033601  -1.08336972]
 [-1.53260701  0.84986969 -1.3749907  -1.21575155]
 [-1.04994892  0.84986969 -1.3177805  -1.34813339]
 [-1.77393605 -0.33768587 -1.3749907  -1.34813339]
 [        nan         nan         nan         nan]]
