In [2]:
import numpy as np
import pandas as pd

In [10]:
data = pd.DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


### 중복제거 함수
* `duplicated()` 중복된 행을 나타내는 Boolean Series를 반환
* `drop_duplicates()` 중복된 데이터를 DataFrame에서 제거

In [11]:
data.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [12]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [14]:
data['v1'] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


### drop_duplicates()

In [20]:
data.drop_duplicates('k1') # 특정 열에 중복된 행들을 제거
data.drop_duplicates(['k1', 'k2']) # 여러개의 열에서 중복된 행들을 제거할 
data.drop_duplicates(['k1', 'k2'], keep='last') # 중복된 행을 발견했을 때, 마지막으로 발견된 행을 제외한 나머지 제거

Unnamed: 0,k1,k2,v1
1,one,1,1
2,one,2,2
4,two,3,4
6,two,4,6


In [33]:
data = pd.DataFrame({'food': ['bacon', 'pork', 'bacon', 'beef', 'Bacon', 'ham'],
                     'ounces': [4, 3, 12, 6, 8, 3]})

data

Unnamed: 0,food,ounces
0,bacon,4
1,pork,3
2,bacon,12
3,beef,6
4,Bacon,8
5,ham,3


### 소문자 처리

In [34]:
data['food'] = data['food'].str.lower()

### 딕셔너리를 참조하여 새로운 열 생성

In [39]:
animal_type = {
    'bacon': 'pig',
    'pork': 'pig',
    'beef': 'cow',
    'ham': 'pig'
}

data['animal'] = data['food'].map(animal_type)
data

Unnamed: 0,food,ounces,animal
0,bacon,4,pig
1,pork,3,pig
2,bacon,12,pig
3,beef,6,cow
4,bacon,8,pig
5,ham,3,pig


In [41]:
data = pd.Series([1, -900, 2, -999, -1000, 3])
data

0       1
1    -900
2       2
3    -999
4   -1000
5       3
dtype: int64

In [42]:
data.replace(-900, np.nan)

0       1.0
1       NaN
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

### 인덱스 소문자 처리

In [52]:
data = pd.DataFrame(np.arange(12).reshape(3, 4),
                    index=['Seoul', 'Busan', 'Daejeon'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Seoul,0,1,2,3
Busan,4,5,6,7
Daejeon,8,9,10,11


In [45]:
data.index.map(lambda x: x.lower())

Index(['seoul', 'busan', 'daejeon'], dtype='object')

### 캐릭터 단위 문자열 처리

In [54]:
tf = lambda x: x[:7].upper()
data.index.map(tf)

Index(['SEOUL', 'BUSAN', 'DAEJEON'], dtype='object')

### 함수를 이용하여 행과 열 인덱스 문자열 처리

In [58]:
data.rename(index=str.title, columns=str.upper, inplace=True)
data

Unnamed: 0,ONE,TWO,THREE,FOUR
Seoul,0,1,2,3
Busan,4,5,6,7
Daejeon,8,9,10,11


## 구간 나누기

In [61]:
ages = [20, 22, 25, 27, 31, 21, 23, 37, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (25, 35], ..., (35, 60], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [63]:
cats.codes

array([0, 0, 0, 1, 1, 0, 0, 2, 3, 2, 2, 1], dtype=int8)

In [64]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [65]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [66]:
pd.cut(ages, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [25, 35), ..., [35, 60), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [67]:
pd.cut(ages, bins, labels=['youth', 'young', 'middle_aged', 'senior'], right=False)

[youth, youth, young, young, young, ..., middle_aged, senior, middle_aged, middle_aged, young]
Length: 12
Categories (4, object): [youth < young < middle_aged < senior]

In [85]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.016678,0.104579,0.067963,-0.007954
std,1.003874,1.012133,1.003527,0.99499
min,-3.393779,-3.918353,-3.931971,-2.989461
25%,-0.710713,-0.580253,-0.604024,-0.664885
50%,0.022635,0.164153,0.084648,0.015494
75%,0.697422,0.771331,0.73426,0.616535
max,3.654972,2.97373,3.265664,3.319336


### 절대값이 3이상인 행 추출
어느하나라도 참이라면 조건을 충족시켜주는 함수: `any()` 또는 `or()`

In [86]:
data[np.abs(data > 3).any(1)] # 어느 하나라도 조건을 충족한다면 -> 전체를 추출

Unnamed: 0,0,1,2,3
129,3.009854,-0.814129,-0.476299,-1.909768
343,3.476822,-0.693904,0.579766,2.079011
377,0.086937,-1.501809,3.104341,0.773532
403,-0.419586,2.073437,-0.000917,3.228143
523,0.470984,-0.943782,-1.799592,3.319336
714,3.654972,1.426012,-0.375844,1.374241
728,-0.484662,-0.60916,1.549726,3.255571
886,-0.741282,-1.009617,3.265664,0.245916


### 숫자 부호 추출
`np.sign()`: -1, 0, 1

In [87]:
# data에 저장된 값중 절대값이 3보다 큰 데이터는 3 아니라면 -3으로 변경

data[np.abs(data) > 3] = np.sign(data) * 3
data

Unnamed: 0,0,1,2,3
0,-1.535743,0.277116,0.613786,-0.440339
1,-1.507419,0.531296,1.556973,-0.830151
2,0.235260,2.263089,0.475366,0.145272
3,-0.307447,0.699940,-1.511203,-1.533048
4,0.405435,0.295572,2.684453,-1.425797
...,...,...,...,...
995,1.043175,-1.568235,1.331228,2.518241
996,-1.426819,0.029448,0.832642,0.580153
997,0.951354,-0.066035,-1.224121,-0.571625
998,0.084676,0.348561,1.615592,-0.662086


### 랜덤하게 데이터 추출

In [84]:
df = pd.DataFrame(np.arange(5 * 4).reshape(5, 4))
sampler = np.random.permutation(5)
sampler

array([2, 1, 0, 4, 3])

In [88]:
x = np.arange(10)
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [90]:
np.random.permutation(x)

array([0, 6, 2, 1, 3, 5, 7, 4, 8, 9])

In [93]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
4,16,17,18,19
1,4,5,6,7
2,8,9,10,11


In [95]:
ch = pd.Series([5, 6, -1, 4, 3])
ch

0    5
1    6
2   -1
3    4
4    3
dtype: int64

In [96]:
ch.sample(n=3)

2   -1
3    4
4    3
dtype: int64

#### 데이터 순서변경

### permutation vs. shuffle
* `shuffle()`의 경우 inplace지만 `permutation()`은 순서가 변경된 새로운 배열을 리턴 

In [103]:
np.random.seed(921)
x = np.random.rand(5)
x

array([0.9656181 , 0.29582799, 0.03620148, 0.59954018, 0.72301574])

In [104]:
np.random.shuffle(x)
x

array([0.59954018, 0.29582799, 0.03620148, 0.72301574, 0.9656181 ])

In [106]:
print(np.random.permutation(x))
print(x)

[0.59954018 0.9656181  0.72301574 0.29582799 0.03620148]
[0.59954018 0.29582799 0.03620148 0.72301574 0.9656181 ]


#### choice()
* `np.random.choice(a, size, replace, probability)`
    * `np.random.choice(5, 3, replace=True)`:  0 ~ 4사이의 배열: 3: 3개 추출, replace=True(디폴트, 복원)

In [117]:
print(np.random.choice(5, 5, replace=True))  # 복원: permutation with repetition
print(np.random.choice(5, 5, replace=False)) # 비복원: permutation

[1 2 0 2 1]
[2 4 3 1 0]
