## 데이터 조작

In [88]:
import numpy as np 
import pandas as pd

### 데이터 개수 세기

In [89]:
# Build a ten-element Series (0..9) and blank out the entry at label 3.
# The Series is created as float from the start: assigning NaN into an int64
# Series relies on silent upcasting, which recent pandas releases deprecate.
s = pd.Series(range(10), dtype=float)
s[3] = np.nan
s

0    0.0
1    1.0
2    2.0
3    NaN
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [90]:
# Number of non-NaN entries in s (the NaN at label 3 is not counted).
s.count()

9

In [91]:
# Reproducible 4x4 frame of floats drawn from the integers 0..4,
# with the cell at row 2 / column 3 replaced by NaN.
np.random.seed(2)
df = pd.DataFrame(np.random.randint(0, 5, size=(4, 4)).astype(float))
df.iat[2, 3] = np.nan
df

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


In [92]:
# Count the non-NaN values in each column.
df.count()      # axis=0 is the default

0    4
1    4
2    4
3    3
dtype: int64

In [93]:
# Count the non-NaN values in each row.
df.count(axis=1)

0    4
1    4
2    3
3    4
dtype: int64

In [94]:
import seaborn as sns 
# Load seaborn's 'titanic' example dataset and preview its first rows.
titanic = sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [95]:
# Count the non-missing values in each column of the Titanic passenger data.
titanic.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

In [96]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


### 카테고리 값 세기

In [97]:
# Draw 100 random integers from 0 to 5, wrap them in a Series
# and show the last few entries.
vec = np.random.randint(0, 6, size=100)
s2 = pd.Series(data=vec)
s2.tail()

95    3
96    0
97    0
98    2
99    5
dtype: int32

In [98]:
# Frequency of each distinct value in s2, most frequent first.
s2.value_counts()

2    22
0    18
3    17
1    15
5    15
4    13
dtype: int64

In [99]:
# Frequency of each value in the 'class' column.
titanic['class'].value_counts()

Third     491
First     216
Second    184
Name: class, dtype: int64

In [100]:
# Frequency of each value in the 'sibsp' column.
titanic['sibsp'].value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: sibsp, dtype: int64

### 정렬

In [101]:
# Same frequencies, ordered by the counted value (the index) instead of by count.
s2.value_counts().sort_index()

0    18
1    15
2    22
3    17
4    13
5    15
dtype: int64

In [102]:
# Sort by value; the NaN entry is placed last.
s.sort_values()     # ascending=True is the default

0    0.0
1    1.0
2    2.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
3    NaN
dtype: float64

In [103]:
# Sort by value in descending order; the NaN entry is still placed last.
s.sort_values(ascending=False)

9    9.0
8    8.0
7    7.0
6    6.0
5    5.0
4    4.0
2    2.0
1    1.0
0    0.0
3    NaN
dtype: float64

In [104]:
# 4x4 frame of uniform random floats in [0, 1).
df = pd.DataFrame(np.random.rand(4, 4))
df

Unnamed: 0,0,1,2,3
0,0.662202,0.387523,0.497074,0.414906
1,0.350872,0.550978,0.972911,0.112776
2,0.313259,0.041798,0.7384,0.657512
3,0.214636,0.416753,0.643842,0.661481


In [105]:
# Reorder the rows by the values in column 1 (ascending); the result is a new frame.
df.sort_values(by=1)

Unnamed: 0,0,1,2,3
2,0.313259,0.041798,0.7384,0.657512
0,0.662202,0.387523,0.497074,0.414906
3,0.214636,0.416753,0.643842,0.661481
1,0.350872,0.550978,0.972911,0.112776


In [106]:
# Display df again: its row order is unchanged by the sort_values call.
df

Unnamed: 0,0,1,2,3
0,0.662202,0.387523,0.497074,0.414906
1,0.350872,0.550978,0.972911,0.112776
2,0.313259,0.041798,0.7384,0.657512
3,0.214636,0.416753,0.643842,0.661481


### 행/열 합계

In [107]:
# Reproducible 4x8 frame of random integers from 0 to 9.
np.random.seed(1)
df2 = pd.DataFrame(np.random.randint(0, 10, size=(4, 8)))
df2

Unnamed: 0,0,1,2,3,4,5,6,7
0,5,8,9,5,0,0,1,7
1,6,9,2,4,5,2,4,2
2,4,7,7,9,1,7,0,6
3,9,9,7,6,9,1,0,1


In [108]:
# Sum of each column.
df2.sum()       # axis=0 is the default

0    24
1    33
2    25
3    24
4    15
5    10
6     5
7    16
dtype: int64

In [109]:
# Sum of each row.
df2.sum(axis=1)

0    35
1    34
2    41
3    42
dtype: int64

In [110]:
# Append the per-row totals as a new column (the label is Korean for "row sum").
df2['행의 합'] = df2.sum(axis=1)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,행의 합
0,5,8,9,5,0,0,1,7,35
1,6,9,2,4,5,2,4,2,34
2,4,7,7,9,1,7,0,6,41
3,9,9,7,6,9,1,0,1,42


In [111]:
# Append the per-column totals as a new row (the label is Korean for "column sum").
# The total also covers the row-sum column added earlier. After the new row is
# added, the values are shown as floats (e.g. 5 -> 5.0).
df2.loc['열의 합', :] = df2.sum()
df2

Unnamed: 0,0,1,2,3,4,5,6,7,행의 합
0,5.0,8.0,9.0,5.0,0.0,0.0,1.0,7.0,35.0
1,6.0,9.0,2.0,4.0,5.0,2.0,4.0,2.0,34.0
2,4.0,7.0,7.0,9.0,1.0,7.0,0.0,6.0,41.0
3,9.0,9.0,7.0,6.0,9.0,1.0,0.0,1.0,42.0
열의 합,24.0,33.0,25.0,24.0,15.0,10.0,5.0,16.0,152.0


### 연습문제

- 1) 타이타닉호 승객의 평균 나이
- 2) 타이타닉호 승객 중 여성 승객의 평균 나이
- 3) 타이타닉호 승객 중 1등실 여성 승객의 평균 나이


In [112]:
# 1) Mean of the 'age' column over all passengers, rounded to two decimals.
round(titanic['age'].mean(), 2)

29.7

In [113]:
# 2) Mean age of the female passengers, rounded to two decimals.
t2 = titanic.loc[titanic['sex'].eq('female')]
round(t2['age'].mean(), 2)

27.92

In [114]:
# 3) Mean age of the female passengers whose 'class' is 'First',
#    rounded to two decimals.
t3 = titanic.loc[titanic['sex'].eq('female') & titanic['class'].eq('First')]
round(t3['age'].mean(), 2)

34.61

### apply 변환

In [125]:
# Read one line and unpack exactly two whitespace-separated integers into x and y.
# Comma-separated input such as '10,20' stays a single token, so int() raises ValueError.
x, y = (int(token) for token in input("정수 2개 입력> ").split())

ValueError: invalid literal for int() with base 10: '10,20'

In [None]:
# Display the two integers read from the input prompt.
x, y

In [53]:
# Small integer frame with columns A, B and C; one inner list per row.
df3 = pd.DataFrame(
    [
        [1, 2, 1],
        [3, 3, 5],
        [4, 1, 2],
        [3, 2, 4],
        [4, 3, 4],
    ],
    columns=['A', 'B', 'C'],
)
df3

Unnamed: 0,A,B,C
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [54]:
def diff(x):
    """Return the spread of ``x``: its largest value minus its smallest.

    ``x`` is meant to be a pandas Series (for example one column or one row
    handed over by ``DataFrame.apply``); only ``max()`` and ``min()`` are
    called on it.
    """
    largest = x.max()
    smallest = x.min()
    return largest - smallest

In [55]:
# Call diff on each column of df3: per-column max minus min.
df3.apply(diff)

A    3
B    2
C    4
dtype: int64

In [56]:
# Same per-column max-minus-min, written with an inline lambda.
df3.apply(lambda x: x.max() - x.min())

A    3
B    2
C    4
dtype: int64

In [57]:
# axis=1 hands each row to the lambda: per-row max minus min.
df3.apply(lambda x: x.max() - x.min(), axis=1)

0    1
1    2
2    3
3    2
4    1
dtype: int64

In [58]:
# Label each passenger row 'adult' when age >= 20, otherwise 'child'.
# NOTE: a missing age (NaN) fails the `>= 20` comparison, so those rows are
# labelled 'child' as well.
titanic['adult/child'] = titanic.apply(lambda r: 'adult' if r.age >= 20 else 'child', axis=1)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,adult/child,category2
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,adult,male22
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,adult,female38
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,adult,female26
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,adult,female35
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,adult,male35


### fillna() method

In [59]:
# Count how often each value occurs in every column of df3. A value that never
# occurs in a column shows up as NaN there.
# pd.Series.value_counts is used because the top-level pd.value_counts function
# is deprecated in recent pandas releases.
df3.apply(pd.Series.value_counts)

Unnamed: 0,A,B,C
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


In [116]:
# Per-column value counts of df3, with the NaN gaps (value absent from a column)
# replaced by 0.
# pd.Series.value_counts is used because the top-level pd.value_counts function
# is deprecated in recent pandas releases.
df3.apply(pd.Series.value_counts).fillna(0.0)

Unnamed: 0,A,B,C
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


### astype() method

In [118]:
    df3.apply(pd.value_counts).fillna(0.0).astype(int)

Unnamed: 0,A,B,C
1,1,1,1
2,0,2,1
3,2,2,0
4,2,0,2
5,0,0,1


In [119]:
# Exercise 4.4.6: build 'category2' as the sex string followed by the integer
# age, e.g. 'male22'.
# NOTE(review): missing ages are replaced by 0 in the 'age' column itself so
# the int cast works; later statistics on 'age' will include those zeros -
# confirm this is intended.
titanic['age'] = titanic['age'].fillna(0)
titanic['category2'] = titanic['sex'].str.cat(titanic['age'].astype(int).astype(str))
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,category2
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,male22
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,female38
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,female26
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,female35
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,male35
