# Handling Dataset

## Missing Values

In [1]:
import pandas as pd
import numpy as np

예제 데이터 만들기

In [2]:
# Seed NumPy's global generator so that Restart & Run All rebuilds the same
# example frame (and later np.random calls pick the same rows).
SEED = 42
np.random.seed(SEED)

# One array of 30 random values per example column.
data = [
    np.random.choice(['A', 'B', 'C'], size=30),      # col1: group label
    np.random.choice(['Male', 'Female'], size=30),   # col2: second label
    np.random.randint(100, size=30),                 # col3: integers in [0, 100)
    np.random.rand(30),                              # col4: floats in [0, 1)
    np.random.uniform(10, 20, size=30),              # col5: floats in [10, 20)
    np.random.randint(1000, 20000, size=30),         # col6: integers in [1000, 20000)
]

# zip(*data) turns the per-column arrays into per-row tuples; the columns are
# named col1..colN after their position in `data`.
df = pd.DataFrame(data=zip(*data), columns=['col{}'.format(i+1) for i in range(len(data))])
df.head()

Unnamed: 0,col1,col2,col3,col4,col5,col6
0,A,Male,17,0.126836,18.177113,18120
1,B,Male,66,0.099365,11.794493,1929
2,B,Male,92,0.933195,14.338665,5175
3,A,Female,1,0.985366,12.510384,10087
4,B,Female,48,0.348517,17.407604,15230


예제 데이터 마지막 변수에 결측치 삽입 및 확인

In [3]:
# Blank out the last column on five distinct, randomly chosen rows.
rows_to_blank = np.random.choice(df.index, size=5, replace=False)
df.iloc[rows_to_blank, -1] = np.nan

# Keep the index of the rows that are now missing, then display those rows.
last_col_missing = df.iloc[:, -1].isnull()
idx = df[last_col_missing].index
df[last_col_missing]

Unnamed: 0,col1,col2,col3,col4,col5,col6
7,B,Female,34,0.607565,15.604187,
12,A,Female,49,0.668603,10.201707,
14,B,Female,44,0.425936,15.385249,
23,C,Male,45,0.954287,12.371229,
29,A,Male,20,0.620673,19.389421,


In [4]:
# Mean of col6 within each col1 group, shown as a one-column frame.
df.groupby('col1')[['col6']].mean()

Unnamed: 0_level_0,col6
col1,Unnamed: 1_level_1
A,11301.5
B,11829.222222
C,8524.333333


dataframe.groupby를 사용하여 조건부 평균으로 결측치 처리

In [5]:
# Fill each missing col6 value with the mean of its col1 group.
# transform('mean') returns a Series aligned to df's index, so fillna picks the
# matching group mean for every missing row.
group_mean = df.groupby(by='col1')['col6'].transform('mean')

# Assign the result back to the column instead of calling
# fillna(inplace=True) on df['col6']: that chained in-place form is deprecated
# and leaves df unchanged once pandas Copy-on-Write is active.
df['col6'] = df['col6'].fillna(group_mean)

# idx holds index labels, so select with the label-based .loc rather than .iloc.
df.loc[idx, :]

Unnamed: 0,col1,col2,col3,col4,col5,col6
7,B,Female,34,0.607565,15.604187,11829.222222
12,A,Female,49,0.668603,10.201707,11301.5
14,B,Female,44,0.425936,15.385249,11829.222222
23,C,Male,45,0.954287,12.371229,8524.333333
29,A,Male,20,0.620673,19.389421,11301.5


다중 조건으로 처리

In [6]:
# Blank out the last column again on five distinct random rows, this time to
# demonstrate filling by more than one grouping key.
rows_to_blank = np.random.choice(df.index, size=5, replace=False)
df.iloc[rows_to_blank, -1] = np.nan

# Keep the index of the missing rows and display them.
last_col_missing = df.iloc[:, -1].isnull()
idx = df[last_col_missing].index
df[last_col_missing]

Unnamed: 0,col1,col2,col3,col4,col5,col6
6,C,Female,37,0.591396,12.981837,
10,C,Female,0,0.972729,18.909436,
18,B,Female,52,0.778195,18.785425,
20,B,Male,86,0.303224,18.019758,
28,B,Male,81,0.137864,19.902223,


In [7]:
# Mean of col6 for every (col1, col2) combination, shown as a one-column frame.
df.groupby(['col1', 'col2'])[['col6']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,col6
col1,col2,Unnamed: 2_level_1
A,Female,9585.25
A,Male,13017.75
B,Female,14474.361111
B,Male,6109.5
C,Female,11667.0
C,Male,7514.666667


In [8]:
# Fill each missing col6 value with the mean of its (col1, col2) group.
# The string alias 'mean' is used instead of passing np.mean: handing a NumPy
# callable to groupby.transform is deprecated in recent pandas.
group_mean = df.groupby(['col1', 'col2'])['col6'].transform('mean')

# Assign back to the column; a chained fillna(inplace=True) on df['col6'] is
# deprecated and does not update df once pandas Copy-on-Write is active.
df['col6'] = df['col6'].fillna(group_mean)

# idx holds index labels, so select with the label-based .loc rather than .iloc.
df.loc[idx, :]

Unnamed: 0,col1,col2,col3,col4,col5,col6
6,C,Female,37,0.591396,12.981837,11667.0
10,C,Female,0,0.972729,18.909436,11667.0
18,B,Female,52,0.778195,18.785425,14474.361111
20,B,Male,86,0.303224,18.019758,6109.5
28,B,Male,81,0.137864,19.902223,6109.5


범주형 데이터 빈도로 결측치 처리

In [9]:
# Blank out col2 on five distinct rows. replace=False matters here: sampling
# with replacement can draw the same label twice and leave fewer than five
# gaps, and the earlier col6 cells already sample without replacement.
missing_labels = np.random.choice(df.index, size=5, replace=False)
df.loc[missing_labels, 'col2'] = np.nan

# Keep the index of the rows that are now missing, then display those rows.
idx = df[df['col2'].isnull()].index
df[df['col2'].isnull()]

Unnamed: 0,col1,col2,col3,col4,col5,col6
1,B,,66,0.099365,11.794493,1929.0
5,A,,8,0.25089,15.287146,10899.0
12,A,,49,0.668603,10.201707,11301.5
15,A,,73,0.240882,16.550941,8146.0
17,A,,82,0.912706,15.229143,9550.0


In [10]:
def most_common(values):
    """Return the first label listed by ``values.value_counts()``."""
    return values.value_counts().index[0]


# Most frequent col2 label within each col1 group.
df.groupby('col1')['col2'].agg(most_common).to_frame()

Unnamed: 0_level_0,col2
col1,Unnamed: 1_level_1
A,Male
B,Male
C,Female


In [11]:
# Fill each missing col2 value with the most frequent col2 label of its col1
# group. value_counts() lists the most frequent label first, and transform
# broadcasts that single label back to every row of the group.
group_mode = df.groupby(by='col1')['col2'].transform(lambda x: x.value_counts().index[0])

# Assign back to the column; a chained fillna(inplace=True) on df['col2'] is
# deprecated and does not update df once pandas Copy-on-Write is active.
df['col2'] = df['col2'].fillna(group_mode)

# idx holds index labels, so select with the label-based .loc rather than .iloc.
df.loc[idx, :]

Unnamed: 0,col1,col2,col3,col4,col5,col6
1,B,Male,66,0.099365,11.794493,1929.0
5,A,Male,8,0.25089,15.287146,10899.0
12,A,Male,49,0.668603,10.201707,11301.5
15,A,Male,73,0.240882,16.550941,8146.0
17,A,Male,82,0.912706,15.229143,9550.0


결측치가 없는 경우, pd.Series.mode가 빠름

In [12]:
# Same per-group most-frequent label as above, computed with pd.Series.mode.
# NOTE(review): Series.mode returns every tied value, so a group with a tie
# would not reduce to a single label — confirm how agg handles that case.
df.groupby('col1')['col2'].agg(pd.Series.mode).to_frame()

Unnamed: 0_level_0,col2
col1,Unnamed: 1_level_1
A,Male
B,Male
C,Female


범주형 변수 합치기

In [13]:
# Join the two label columns into one combined key such as "A+Male".
combined_label = df['col1'].str.cat(df['col2'], sep='+')
df['new'] = combined_label
df.head()

Unnamed: 0,col1,col2,col3,col4,col5,col6,new
0,A,Male,17,0.126836,18.177113,18120.0,A+Male
1,B,Male,66,0.099365,11.794493,1929.0,B+Male
2,B,Male,92,0.933195,14.338665,5175.0,B+Male
3,A,Female,1,0.985366,12.510384,10087.0,A+Female
4,B,Female,48,0.348517,17.407604,15230.0,B+Female


object 형태를 category 형태로 변환하기: 데이터셋의 용량을 현저히 줄일 수 있다.

In [14]:
# Show dtypes and memory usage before the object columns are converted.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 7 columns):
col1    30 non-null object
col2    30 non-null object
col3    30 non-null int64
col4    30 non-null float64
col5    30 non-null float64
col6    30 non-null float64
new     30 non-null object
dtypes: float64(3), int64(1), object(3)
memory usage: 1.7+ KB


In [15]:
# Collect the names of all object-dtype columns; they are converted to
# category in the next step.
object_columns = df.select_dtypes(include=['object'])
cols = object_columns.columns
print(cols)

Index(['col1', 'col2', 'new'], dtype='object')


In [16]:
# Convert each previously selected object column to the category dtype, then
# show dtypes and memory usage again for comparison.
for column_name in cols:
    df[column_name] = df[column_name].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 7 columns):
col1    30 non-null category
col2    30 non-null category
col3    30 non-null int64
col4    30 non-null float64
col5    30 non-null float64
col6    30 non-null float64
new     30 non-null category
dtypes: category(3), float64(3), int64(1)
memory usage: 1.5 KB


In [17]:
# List the category labels stored for col2.
df['col2'].cat.categories

Index(['Female', 'Male'], dtype='object')

In [18]:
# Integer codes behind the first five col2 values; each code is a position in
# col2's category list.
df['col2'].cat.codes.head(5)

0    1
1    1
2    1
3    0
4    0
dtype: int8

pd.get_dummies로 더미 변수(원-핫 인코딩) 만들기. scikit-learn의 OneHotEncoder보다 빠르다고 알려져 있음.

In [19]:
# Build one indicator column per col2 category and preview them next to the
# original columns.
dummy = pd.get_dummies(df['col2'])
with_dummies = pd.concat([df, dummy], axis='columns')
with_dummies.head()

Unnamed: 0,col1,col2,col3,col4,col5,col6,new,Female,Male
0,A,Male,17,0.126836,18.177113,18120.0,A+Male,0,1
1,B,Male,66,0.099365,11.794493,1929.0,B+Male,0,1
2,B,Male,92,0.933195,14.338665,5175.0,B+Male,0,1
3,A,Female,1,0.985366,12.510384,10087.0,A+Female,1,0
4,B,Female,48,0.348517,17.407604,15230.0,B+Female,1,0
