# 판다스(Pandas) 심화

## 데이터프레임 정렬

In [2]:
import numpy as np
import pandas as pd
# Demo frame with three columns; the index labels are not in sorted order
# and 'col2' contains one missing value (NaN).
data = dict(
    col1=[2, 1, 9, 8, 7, 4],
    col2=["A", "A", "B", np.nan, "D", "C"],
    col3=[0, 1, 9, 4, 2, 3],
)
index = [1, 3, 2, 5, 0, 4]
df = pd.DataFrame(data=data, index=index)
print("DataFrame: ")
df

DataFrame: 


Unnamed: 0,col1,col2,col3
1,2,A,0
3,1,A,1
2,9,B,9
5,8,,4
0,7,D,2
4,4,C,3


In [7]:
# Sort the rows by index label; axis="index" is the same as axis=0.
df = df.sort_index(axis="index")
df

Unnamed: 0,col1,col2,col3
0,7,D,2
1,2,A,0
2,9,B,9
3,1,A,1
4,4,C,3
5,8,,4


In [4]:
# Sort the columns by column label; axis="columns" is the same as axis=1.
df = df.sort_index(axis="columns")
df

Unnamed: 0,col1,col2,col3
1,2,A,0
3,1,A,1
2,9,B,9
5,8,,4
0,7,D,2
4,4,C,3


In [6]:
# Sort the rows by index label (axis=0) in descending order.
# Note: this sorts along axis=0 (rows), not axis=1 (columns).
df = df.sort_index(axis=0, ascending=False)  # ascending=False -> descending
df

Unnamed: 0,col1,col2,col3
5,8,,4
4,4,C,3
3,1,A,1
2,9,B,9
1,2,A,0
0,7,D,2


- ascending : 오름차순
- descending : 내림차순

In [5]:
# sort_values, example 1: order the rows by the values in 'col1', ascending.
sorted_df1 = df.sort_values(by="col1", ascending=True)
sorted_df1

Unnamed: 0,col1,col2,col3
3,1,A,1
1,2,A,0
4,4,C,3
0,7,D,2
5,8,,4
2,9,B,9


In [7]:
# sort_values, example 2: order by 'col2' ascending, and within equal
# 'col2' values by 'col1' descending.
sorted_df2 = df.sort_values(
    by=["col2", "col1"],
    ascending=[True, False],
)
sorted_df2

Unnamed: 0,col1,col2,col3
1,2,A,0
3,1,A,1
2,9,B,9
4,4,C,3
0,7,D,2
5,8,,4


## 데이터 분석용 함수

In [40]:
# 데이터 프레임의 index와 columns
import numpy as np
import pandas as pd
# Two numeric columns; 'math' contains one missing value (NaN).
data = dict(
    korean=[50, 60, 70, 30],
    math=[30, np.nan, 40, 20],
)
df = pd.DataFrame(data=data, index=list("abcd"))
# Equivalent two-step form:
#   index = ['a', 'b', 'c', 'd']
#   df = pd.DataFrame(data, index=index)

# Show the row labels, the column labels, and then the whole frame.
print(df.index, df.columns, df, sep="\n")

Index(['a', 'b', 'c', 'd'], dtype='object')
Index(['korean', 'math'], dtype='object')
   korean  math
a      50  30.0
b      60   NaN
c      70  40.0
d      30  20.0


In [34]:
# count(): number of non-missing values.
# First per column (axis="index", i.e. axis=0, which is the default and may
# be omitted), then per row (axis="columns", i.e. axis=1).
print(df.count(axis="index"), df.count(axis="columns"), sep="\n")

korean    4
math      3
dtype: int64
a    2
b    1
c    2
d    2
dtype: int64


In [35]:
# max(), min(), sum(), mean(): each is printed first per column (default
# axis) and then per row (axis="columns", i.e. axis=1).
print(df.max(), df.max(axis="columns"), sep="\n")
print(df.min(), df.min(axis="columns"), sep="\n")
print(df.sum(), df.sum(axis="columns"), sep="\n")
print(df.mean(), df.mean(axis="columns"), sep="\n")
# With skipna=False a row that contains NaN yields NaN instead of the mean
# of its remaining values.
print(df.mean(axis="columns", skipna=False))

korean    70.0
math      40.0
dtype: float64
a    50.0
b    60.0
c    70.0
d    30.0
dtype: float64
korean    30.0
math      20.0
dtype: float64
a    30.0
b    60.0
c    40.0
d    20.0
dtype: float64
korean    210.0
math       90.0
dtype: float64
a     80.0
b     60.0
c    110.0
d     50.0
dtype: float64
korean    52.5
math      30.0
dtype: float64
a    40.0
b    60.0
c    55.0
d    25.0
dtype: float64
a    40.0
b     NaN
c    55.0
d    25.0
dtype: float64


In [37]:
# Replace missing values with fillna: fill the NaN entries of 'math' with
# the column mean (mean() skips NaN by default).
avg = df.loc[:, "math"].mean()
df["math"] = df["math"].fillna(value=avg)
df

Unnamed: 0,korean,math
a,50,30.0
b,60,30.0
c,70,40.0
d,30,20.0


## 그룹으로 묶기 / groupby

In [62]:
import numpy as np
import pandas as pd

# Frame with a repeating 'key' column and two integer columns, used for the
# groupby examples.
df = pd.DataFrame(
    dict(
        key=list("ABCABC"),
        data1=[1, 2, 3] * 2,
        data2=[4, 4, 6, 0, 6, 1],
        # Optional string column (disabled):
        # data3=['안녕', 'a', 'b', '하세요', 'pple', '3'],
    )
)
print("DataFrame:")
df

DataFrame:


Unnamed: 0,key,data1,data2
0,A,1,4
1,B,2,4
2,C,3,6
3,A,1,0
4,B,2,6
5,C,3,1


In [63]:
# Sum every remaining column within each 'key' group.
# Note: if a string column (such as the commented-out 'data3') held the
# integer 3 instead of the string '3', sum() would fail with
# TypeError: can only concatenate str (not "int") to str
df.groupby(by="key").sum()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2,4
B,4,10
C,6,7


In [64]:
# Group on the ('key', 'data1') pair and sum the remaining column(s).
df.groupby(by=["key", "data1"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key,data1,Unnamed: 2_level_1
A,1,4
B,2,10
C,3,7


In [67]:
# Apply several reductions to every column of each 'key' group;
# .agg is an alias of .aggregate.
df.groupby(by="key").agg(["min", "median", "max"])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,1,1.0,1,0,2.0,4
B,2,2.0,2,4,5.0,6
C,3,3.0,3,1,3.5,6


In [69]:
# Apply a different reduction per column within each 'key' group:
# max for 'data1', sum for 'data2'. .agg is an alias of .aggregate.
df.groupby(by="key").agg({"data1": "max", "data2": "sum"})

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,1,4
B,2,10
C,3,7
