# 자료 준비 및 손질(읽어오기, 정제, 변환, 정렬 등)

In [2]:
import pandas as pd
import numpy as np

In [3]:
# A small Series of fruit names with one missing entry (np.nan at position 2).
fruits = pd.Series(
    data=['바나나', '아보카도', np.nan, '키위'],
)
fruits

0     바나나
1    아보카도
2     NaN
3      키위
dtype: object

In [4]:
# Boolean mask: True where the value is missing.
fruits.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# None counts as missing too, so position 0 now reports True as well.
fruits[0] = None
fruits.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [6]:
# Inverse of isnull(): True where a value is present.
fruits.notnull()

0    False
1     True
2    False
3     True
dtype: bool

In [7]:
# Drop the missing entries; only labels 1 and 3 remain.
fruits.dropna()

1    아보카도
3      키위
dtype: object

In [8]:
# Same result as dropna(), obtained by boolean indexing with the notnull() mask.
fruits[fruits.notnull()]

1    아보카도
3      키위
dtype: object

In [9]:
# 4x3 float frame: row 0 is complete, rows 1 and 3 are partly NaN, row 2 is all NaN.
frame = pd.DataFrame(
    data=[
        [1., 2.5, 2.2],
        [1., np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.2, 4.8],
    ],
)
frame

Unnamed: 0,0,1,2
0,1.0,2.5,2.2
1,1.0,,
2,,,
3,,6.2,4.8


In [10]:
# With no arguments, every row containing at least one NaN is dropped (only row 0 survives).
frame.dropna()

Unnamed: 0,0,1,2
0,1.0,2.5,2.2


In [11]:
# how="all": drop only the rows in which every value is NaN (row 2).
frame.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,2.5,2.2
1,1.0,,
3,,6.2,4.8


In [12]:
# Add a new column labelled 3 that is NaN in every row.
frame[3] = np.nan
frame 

Unnamed: 0,0,1,2,3
0,1.0,2.5,2.2,
1,1.0,,,
2,,,,
3,,6.2,4.8,


In [13]:
# axis=1 applies the rule to columns: drop the columns that are entirely NaN (column 3).
frame.dropna(how='all', axis=1)

Unnamed: 0,0,1,2
0,1.0,2.5,2.2
1,1.0,,
2,,,
3,,6.2,4.8


In [15]:
frame.dropna(thresh=2) # keep rows with at least 2 non-NaN values (the threshold is inclusive)

Unnamed: 0,0,1,2,3
0,1.0,2.5,2.2,
3,,6.2,4.8,


In [16]:
# Show the frame with every NaN replaced by 0.
frame.fillna(0)

Unnamed: 0,0,1,2,3
0,1.0,2.5,2.2,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,6.2,4.8,0.0


In [17]:
# Per-column fill values (column 1 -> 0, column 2 -> 1, column 3 -> 2.5).
# Column 0 is not listed, so its NaNs stay; inplace=True modifies frame itself.
frame.fillna({1 : 0, 2:1, 3:2.5}, inplace=True)

In [18]:
# Inspect frame after the in-place fill.
frame

Unnamed: 0,0,1,2,3
0,1.0,2.5,2.2,2.5
1,1.0,0.0,1.0,2.5
2,,0.0,1.0,2.5
3,,6.2,4.8,2.5


In [19]:
# 6x3 frame of standard-normal draws (unseeded, so the values differ between runs).
frame = pd.DataFrame(np.random.randn(6, 3))
# Blank out column 1 from row 2 onward, then column 2 from row 4 onward.
for first_row, col in ((2, 1), (4, 2)):
    frame.iloc[first_row:, col] = np.nan
frame

Unnamed: 0,0,1,2
0,-0.307324,0.181312,1.009105
1,-0.731385,0.51369,0.727378
2,1.648516,,0.1183
3,0.468725,,-1.714764
4,-0.665607,,
5,-0.776388,,


In [20]:
# Forward-fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
frame.ffill()


Unnamed: 0,0,1,2
0,-0.307324,0.181312,1.009105
1,-0.731385,0.51369,0.727378
2,1.648516,0.51369,0.1183
3,0.468725,0.51369,-1.714764
4,-0.665607,0.51369,-1.714764
5,-0.776388,0.51369,-1.714764


In [21]:
# Forward-fill, but carry a value across at most 2 consecutive NaNs;
# later NaNs in the same run (column 1, rows 4-5) are left as they are.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
frame.ffill(limit=2)


Unnamed: 0,0,1,2
0,-0.307324,0.181312,1.009105
1,-0.731385,0.51369,0.727378
2,1.648516,0.51369,0.1183
3,0.468725,0.51369,-1.714764
4,-0.665607,,-1.714764
5,-0.776388,,-1.714764


In [22]:
# Fill each column's NaNs with that column's own mean (computed from its non-NaN values).
frame.fillna(frame.mean())

Unnamed: 0,0,1,2
0,-0.307324,0.181312,1.009105
1,-0.731385,0.51369,0.727378
2,1.648516,0.347501,0.1183
3,0.468725,0.347501,-1.714764
4,-0.665607,0.347501,0.035005
5,-0.776388,0.347501,0.035005


In [23]:
# Fill each column's NaNs with that column's own median instead of its mean.
frame.fillna(frame.median())

Unnamed: 0,0,1,2
0,-0.307324,0.181312,1.009105
1,-0.731385,0.51369,0.727378
2,1.648516,0.347501,0.1183
3,0.468725,0.347501,-1.714764
4,-0.665607,0.347501,0.422839
5,-0.776388,0.347501,0.422839


In [24]:
# Two-column frame whose last row (two, 4) repeats the row before it.
frame = pd.DataFrame({
    "k1": ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
frame

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [25]:
# True for a row that repeats an earlier row in full (row 6 repeats row 5).
frame.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [26]:
# Drop the repeated rows, keeping the first occurrence of each (row 6 is removed).
frame.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [27]:
# Add a column k3 = 0..6, which makes every full row distinct.
frame['k3'] = range(7)
frame

Unnamed: 0,k1,k2,k3
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [28]:
# Judge duplicates by column k1 only: the first 'one' row and the first 'two' row survive.
frame.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,k3
0,one,1,0
1,two,1,1


In [29]:
# Judge duplicates by (k1, k2) and keep the last occurrence: row 6 is kept instead of row 5.
frame.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,k3
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [30]:
# Float Series in which -999 and -1000 act as sentinel values.
s = pd.Series(
    data=[1., -999, 2, -999, -1000, 3],
)
s

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [31]:
# Replace the sentinel -999 with NaN; -1000 is left untouched.
s.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [32]:
# A list of targets with one replacement: every listed value becomes NaN.
s.replace([-999,-1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [33]:
# Two parallel lists pair up element-wise: -999 -> NaN, -1000 -> 0.
s.replace([-999,-1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [34]:
# The same mapping written as a dict of {old value: new value}.
s.replace({-999:np.nan, -1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [35]:
# 3x4 frame holding 0..11, with state names as the index and number words as columns.
frame = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
frame

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [36]:
def lam(x):
    """Return the first four characters of *x*, upper-cased."""
    return x[:4].upper()
# Apply lam to every index label; the result is an Index of the transformed labels.
frame.index.map(lam)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [37]:
# Assign the mapped labels back so the frame's index itself is replaced.
# Note: the third label is 'NEW ' with a trailing space (first four characters of 'New York').
frame.index = frame.index.map(lam)
frame

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [38]:
# rename accepts functions: title-case the index labels and upper-case the column names.
# The result is only displayed; frame keeps its current labels.
frame.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [39]:
# Dict form renames only the listed labels; inplace=True modifies frame itself.
frame.rename(index={'OHIO':'CALIFORNIA'}, columns={'three':'five'}, inplace=True)

In [40]:
# Inspect frame after the in-place rename.
frame

Unnamed: 0,one,two,five,four
CALIFORNIA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [41]:
# Ages to be binned, and the bin edges that delimit four age groups.
age = [20, 22, 25, 27, 21, 23, 34, 37, 35, 60, 42, 66, 80]
dis = [18, 25, 35, 60, 100]

# Assign each age to its interval; intervals are closed on the right by default.
ge = pd.cut(x=age, bins=dis)
ge

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (35, 60], (35, 60], (60, 100], (60, 100]]
Length: 13
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [42]:
# The four interval bins, as an IntervalIndex.
ge.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [43]:
# Integer bin number assigned to each age (0 is the first interval).
ge.codes

array([0, 0, 0, 1, 0, 0, 1, 2, 1, 2, 2, 3, 3], dtype=int8)

In [44]:
# Number of ages that fall in each bin, listed in bin order.
ge.value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    2
Name: count, dtype: int64

In [45]:
# right=False closes each interval on the left instead, so age 25 now falls in [25, 35).
pd.cut(age, dis, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [35, 60), [60, 100), [35, 60), [60, 100), [60, 100)]
Length: 13
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [46]:
# One label per bin, in bin order; the result shows these labels instead of intervals.
names = ['청년', '중장년', '중년', '노년']
pd.cut(age, dis, labels=names)

['청년', '청년', '청년', '중장년', '청년', ..., '중장년', '중년', '중년', '노년', '노년']
Length: 13
Categories (4, object): ['청년' < '중장년' < '중년' < '노년']

In [47]:
# Seeded generator so the 20 standard-normal draws are reproducible.
rng = np.random.RandomState(0)
data = rng.randn(20)
# An integer bin count asks for that many equal-width bins over the data's range;
# precision=2 limits the displayed bin edges to two decimals.
c = pd.cut(data, bins=4, precision=2)
c

[(1.44, 2.24], (-0.17, 0.63], (0.63, 1.44], (1.44, 2.24], (1.44, 2.24], ..., (-0.17, 0.63], (1.44, 2.24], (-0.98, -0.17], (-0.17, 0.63], (-0.98, -0.17]]
Length: 20
Categories (4, interval[float64, right]): [(-0.98, -0.17] < (-0.17, 0.63] < (0.63, 1.44] < (1.44, 2.24]]

In [48]:
# Count the members of each bin, most frequent first.
# pd.value_counts() is deprecated; Series.value_counts() is the supported equivalent.
pd.Series(c).value_counts()


(-0.17, 0.63]     9
(1.44, 2.24]      5
(-0.98, -0.17]    3
(0.63, 1.44]      3
Name: count, dtype: int64

In [49]:
# 1000 further draws from the same seeded generator.
data = rng.randn(1000)

# qcut with 4 splits at the sample quartiles, so every bin holds the same number of points.
qc = pd.qcut(data, 4)
# pd.value_counts() is deprecated; Series.value_counts() is the supported equivalent.
pd.Series(qc).value_counts()


(-3.0469999999999997, -0.708]    250
(-0.708, -0.0601]                250
(-0.0601, 0.582]                 250
(0.582, 2.759]                   250
Name: count, dtype: int64

In [50]:
# Custom quantile edges (0-10%, 10-30%, 30-50%, 50-70%, 70-100%) give five bins of unequal size.
pd.qcut(data, [0, 0.1, 0.3, 0.5, 0.7, 1])

[(-3.0469999999999997, -1.307], (0.416, 2.759], (0.416, 2.759], (-1.307, -0.559], (0.416, 2.759], ..., (-1.307, -0.559], (-0.0601, 0.416], (-3.0469999999999997, -1.307], (-0.0601, 0.416], (-0.0601, 0.416]]
Length: 1000
Categories (5, interval[float64, right]): [(-3.0469999999999997, -1.307] < (-1.307, -0.559] < (-0.559, -0.0601] < (-0.0601, 0.416] < (0.416, 2.759]]

In [51]:
# Filtering outliers
# This cell itself only seeds Python's random module and takes one draw from it.
import random

random.seed(100)
random.random()

0.1456692551041303

In [52]:
# A second draw from the same seeded stream gives the next, different value.
random.random()

0.45492700451402135

In [53]:
# A different seed starts a different sequence.
random.seed(50)
random.random()

0.4975365687586023

In [54]:
# Re-seeding with 100 reproduces the value first drawn with that seed (0.1456692551041303).
random.seed(100)
random.random()

0.1456692551041303