# pandas - DataFrame
- 2차원 행렬(표), Series를 묶어낸 자료형

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Build a DataFrame from a dict that maps column name -> column values.
# The scalar under 'four' is repeated for every row.
data = dict(
    one=list(range(1, 6)),
    two=list('가나다라마'),
    three=[1.23, 2.34, 3.45, 4.56, 5.67],
    four=True,
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,one,two,three,four
0,1,가,1.23,True
1,2,나,2.34,True
2,3,다,3.45,True
3,4,라,4.56,True
4,5,마,5.67,True


In [9]:
# Build a DataFrame from a list of dicts (one dict per row).
# A key that is absent from a row ('a' in the second dict) shows up as NaN.
data = [
    dict(a=1, b=2, c=3),
    dict(b=5, c=6),
    dict(a=7, b=8, c=9),
]

# Row labels are supplied at construction time instead of being assigned afterwards.
df = pd.DataFrame(data, index=['토끼', '판다', '코알라'])
print(df.index)

# Rename the columns by replacing the whole column index.
df.columns = ['협동심', '성실도', '인내심']
print(df.columns)
df

Index(['토끼', '판다', '코알라'], dtype='object')
Index(['협동심', '성실도', '인내심'], dtype='object')


Unnamed: 0,협동심,성실도,인내심
토끼,1.0,2,3
판다,,5,6
코알라,7.0,8,9


In [10]:
# Build a DataFrame from a 2-D ndarray; row and column labels are set
# directly in the constructor call.
arr = np.random.randn(2, 3)
df = pd.DataFrame(data=arr, columns=list('ABC'), index=list('ㄱㄴ'))
df

Unnamed: 0,A,B,C
ㄱ,-1.085395,-0.146031,-1.339903
ㄴ,-1.095149,-0.993352,-1.440397


In [11]:
# Small name / location / gender table used by the attribute examples below.
data = {
    '이름': ['토순', '곰돌', '멍냥'],
    '위치': ['독산', '종로', '천호'],
    '성별': ['F', 'F', 'M'],
}

df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,이름,위치,성별
0,토순,독산,F
1,곰돌,종로,F
2,멍냥,천호,M


In [12]:
# Transpose: the column labels become the row labels and vice versa.
df.T

Unnamed: 0,0,1,2
이름,토순,곰돌,멍냥
위치,독산,종로,천호
성별,F,F,M


### DataFrame 속성

In [15]:
# Core attributes: row labels, column labels, and the underlying data as a 2-D array.
print(df.index)
print(df.columns)
print(df.values)

RangeIndex(start=0, stop=3, step=1)
Index(['이름', '위치', '성별'], dtype='object')
[['토순' '독산' 'F']
 ['곰돌' '종로' 'F']
 ['멍냥' '천호' 'M']]


In [16]:
print(df.shape) # (number of rows, number of columns)
print(df.size) # total number of elements
print(df.ndim) # number of dimensions
print(df.dtypes) # dtype of each column

(3, 3)
9
2
이름    object
위치    object
성별    object
dtype: object


---
### DataFrame 메서드

In [45]:
# Sample bank-client table, written row by row (one inner list per client).
bank_client_df = pd.DataFrame(
    [
        [1, 'Aly', 35000, 4],
        [2, 'Steve', 3000, 7],
        [3, 'Nicole', 100000, 10],
        [4, 'Morris', 2000, 15],
    ],
    columns=['Client ID', 'Client Name', 'Net worth [$]', 'Years with bank'],
)

bank_client_df

Unnamed: 0,Client ID,Client Name,Net worth [$],Years with bank
0,1,Aly,35000,4
1,2,Steve,3000,7
2,3,Nicole,100000,10
3,4,Morris,2000,15


In [18]:
# head(n) / tail(n): the first / last n rows.
print(bank_client_df.head(2))
print(bank_client_df.tail(2))

   Client ID Client Name  Net worth [$]  Years with bank
0          1         Aly          35000                4
1          2       Steve           3000                7
   Client ID Client Name  Net worth [$]  Years with bank
2          3      Nicole         100000               10
3          4      Morris           2000               15


In [19]:
bank_client_df.info() # metadata: index range, per-column non-null counts and dtypes, memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Client ID        4 non-null      int64 
 1   Client Name      4 non-null      object
 2   Net worth [$]    4 non-null      int64 
 3   Years with bank  4 non-null      int64 
dtypes: int64(3), object(1)
memory usage: 260.0+ bytes


In [20]:
bank_client_df.describe() # only numeric columns are summarised; object columns are left out because mean, std, etc. cannot be computed for them

Unnamed: 0,Client ID,Net worth [$],Years with bank
count,4.0,4.0,4.0
mean,2.5,35000.0,9.0
std,1.290994,45963.753836,4.690416
min,1.0,2000.0,4.0
25%,1.75,2750.0,6.25
50%,2.5,19000.0,8.5
75%,3.25,51250.0,11.25
max,4.0,100000.0,15.0


In [50]:
# indexing/slicing
# iloc: look up by integer position, in [row, column] order
# loc: look up by label, in [row, column] order
print(bank_client_df.iloc[0]) # a single row comes back as a Series
print(type(bank_client_df.iloc[0]))
print(bank_client_df.loc[0].index) # that row Series is indexed by the column names
print(type(bank_client_df.loc[0].name)) # .name holds the row label (an int here)

Client ID              1
Client Name          Aly
Net worth [$]      35000
Years with bank        4
Name: 0, dtype: object
<class 'pandas.core.series.Series'>
Index(['Client ID', 'Client Name', 'Net worth [$]', 'Years with bank'], dtype='object')
<class 'int'>


In [24]:
# Slicing rows with iloc keeps the result a DataFrame.
bank_client_df.iloc[:2], type(bank_client_df.iloc[:2])

(   Client ID Client Name  Net worth [$]  Years with bank
 0          1         Aly          35000                4
 1          2       Steve           3000                7,
 pandas.core.frame.DataFrame)

In [29]:
# A list of positions (fancy indexing) also returns a DataFrame.
bank_client_df.iloc[[0,2]], type(bank_client_df.iloc[[0,2]])

(   Client ID Client Name  Net worth [$]  Years with bank
 0          1         Aly          35000                4
 2          3      Nicole         100000               10,
 pandas.core.frame.DataFrame)

In [38]:
# Fancy indexing returns a DataFrame even when only one row is selected.
# -> plain indexing returns a Series, i.e. it reduces (drops) one dimension
# -> returning a DataFrame means the dimension is kept
print(bank_client_df.iloc[[2]].shape)
print(bank_client_df.iloc[2].shape)
bank_client_df.iloc[[2]], type(bank_client_df.iloc[[2]])

(1, 4)
(4,)


(   Client ID Client Name  Net worth [$]  Years with bank
 2          3      Nicole         100000               10,
 pandas.core.frame.DataFrame)

In [40]:
# 2-D access: indexing an axis drops that dimension / slicing an axis keeps it
# bank_client_df.iloc[0,1]
bank_client_df.iloc[:2,1], type(bank_client_df.iloc[:2,1])

(0      Aly
 1    Steve
 Name: Client Name, dtype: object,
 pandas.core.series.Series)

In [46]:
bank_client_df

Unnamed: 0,Client ID,Client Name,Net worth [$],Years with bank
0,1,Aly,35000,4
1,2,Steve,3000,7
2,3,Nicole,100000,10
3,4,Morris,2000,15


In [52]:
# Replace the default integer index with the string labels client1 .. client4.
bank_client_df.index = [f'client{n}' for n in range(1, 5)]
bank_client_df

Unnamed: 0,Client ID,Client Name,Net worth [$],Years with bank
client1,1,Aly,35000,4
client2,2,Steve,3000,7
client3,3,Nicole,100000,10
client4,4,Morris,2000,15


In [54]:
bank_client_df.loc['client3'] # loc: look up by label

Client ID               3
Client Name        Nicole
Net worth [$]      100000
Years with bank        10
Name: client3, dtype: object

In [None]:
bank_client_df.loc['client2':'client4':2] # start label : stop label : step (the stop label is included)

Unnamed: 0,Client ID,Client Name,Net worth [$],Years with bank
client2,2,Steve,3000,7
client4,4,Morris,2000,15


In [57]:
# Label slice on the rows plus a single column label -> Series.
bank_client_df.loc['client2':'client4':2, 'Client Name'], type(bank_client_df.loc['client2':'client4':2, 'Client Name'])

(client2     Steve
 client4    Morris
 Name: Client Name, dtype: object,
 pandas.core.series.Series)

In [59]:
# Label slices on both rows and columns -> DataFrame (the last column label is included as well).
bank_client_df.loc['client2':'client4':2, 'Client Name': 'Net worth [$]'], type(bank_client_df.loc['client2':'client4':2, 'Client Name': 'Net worth [$]'])

(        Client Name  Net worth [$]
 client2       Steve           3000
 client4      Morris           2000,
 pandas.core.frame.DataFrame)

In [None]:
# Fancy indexing (a list of labels) on the column axis
bank_client_df.loc['client2':'client4':2, ['Client Name','Years with bank']]

Unnamed: 0,Client Name,Years with bank
client2,Steve,7
client4,Morris,15


In [63]:
bank_client_df

Unnamed: 0,Client ID,Client Name,Net worth [$],Years with bank
client1,1,Aly,35000,4
client2,2,Steve,3000,7
client3,3,Nicole,100000,10
client4,4,Morris,2000,15


In [None]:
# Show the rows whose 'Client Name' is 'Steve' by passing a boolean mask to loc.
# Open question from the notes: what is the difference between loc and iloc?
# (Per the earlier cells: loc looks up by label, iloc by integer position.)
bank_client_df.loc[bank_client_df['Client Name'] == 'Steve']

Unnamed: 0,Client ID,Client Name,Net worth [$],Years with bank
client2,2,Steve,3000,7


In [None]:
# Show only the client name
bank_client_df['Client Name'] # just the client names, as a Series

client1       Aly
client2     Steve
client3    Nicole
client4    Morris
Name: Client Name, dtype: object

In [None]:
# Show client name and net worth [$] -> selected with fancy indexing (a list of column labels)
bank_client_df[['Client Name','Net worth [$]']]

Unnamed: 0,Client Name,Net worth [$]
client1,Aly,35000
client2,Steve,3000
client3,Nicole,100000
client4,Morris,2000


In [None]:
# Gives the same result as the fancy indexing in the previous cell
bank_client_df.filter(items =['Client Name','Net worth [$]'])

Unnamed: 0,Client Name,Net worth [$]
client1,Aly,35000
client2,Steve,3000
client3,Nicole,100000
client4,Morris,2000


In [92]:
bank_client_df.filter(like='$', axis=1) # axis=1 targets the columns: keep those whose label contains '$'

Unnamed: 0,Net worth [$]
client1,35000
client2,3000
client3,100000
client4,2000


In [93]:
# axis=0 targets the rows: keep those whose index label contains '4'.
bank_client_df.filter(like='4', axis=0) 

Unnamed: 0,Client ID,Client Name,Net worth [$],Years with bank
client4,4,Morris,2000,15


---
### 행 추가 및 삭제

In [120]:
# Exam-score records, one dict per student, built from (name, midterm, final) tuples.
students = [
    dict(name=name, midterm=midterm, final=final)
    for name, midterm, final in (
        ('호랑이', 95, 85),
        ('늑대', 93, 90),
        ('양', 100, 10),
    )
]

df = pd.DataFrame(data=students)
df

Unnamed: 0,name,midterm,final
0,호랑이,95,85
1,늑대,93,90
2,양,100,10


In [117]:
# Append one row by assigning to a label that does not exist yet; the current
# row count is used as the new label.
df.loc[df.shape[0]] = ['하마', 88, 72]
df

Unnamed: 0,name,midterm,final
0,호랑이,95,85
1,늑대,93,90
2,양,100,10
3,하마,88,72


In [118]:
# Append a row with concat: wrap the new record in a one-row DataFrame whose
# column names match df.
add_student_df = pd.DataFrame({'name': ['곰'], 'midterm': [99], 'final': [24]})
# ignore_index=True renumbers the rows so the combined index does not get tangled.
df = pd.concat([df, add_student_df], ignore_index=True)
df

Unnamed: 0,name,midterm,final
0,호랑이,95,85
1,늑대,93,90
2,양,100,10
3,하마,88,72
4,곰,99,24


In [122]:
# Delete a row, addressed through the index
df.drop(df.index[[0]], inplace=True) # inplace=True mutates df directly, so the result does not have to be assigned back to df
# NOTE(review): because the drop happens in place, every re-run of this cell removes one more row.
df

Unnamed: 0,name,midterm,final
2,양,100,10


---
### 컬럼 추가

In [123]:
# Base table for the column-adding examples, written one row per person.
df = pd.DataFrame(
    [
        ['토순', '독산', 'F'],
        ['곰돌', '종로', 'F'],
        ['멍냥', '천호', 'M'],
    ],
    columns=['이름', '위치', '성별'],
)

In [127]:
# Add a column by assigning a list under a new column name.
# Unknown hobbies are stored as np.nan (a real missing value) rather than the
# string 'NaN', so that isna() / dropna() / fillna() treat them as missing.
df['취미'] = ['인공지능 공부', np.nan, np.nan]
df

Unnamed: 0,이름,위치,성별,취미
0,토순,독산,F,인공지능 공부
1,곰돌,종로,F,
2,멍냥,천호,M,


In [129]:
# Add a column from np.where(condition, value_if_true, value_if_false),
# which yields an ndarray with one entry per row.
df['성별(한글)'] = np.where(df['성별'].eq('M'), '남성', '여성')
df

Unnamed: 0,이름,위치,성별,취미,성별(한글)
0,토순,독산,F,인공지능 공부,여성
1,곰돌,종로,F,,여성
2,멍냥,천호,M,,남성


In [None]:
# Same people table, extended with height ('키') and weight ('체중') columns.
df = pd.DataFrame(
    data={
        '이름': ['토순', '곰돌', '멍냥'],
        '위치': ['독산', '종로', '천호'],
        '성별': ['F', 'F', 'M'],
        '키': [179, 165, 157],
        '체중': [50, 48.2, 51.3],
    }
)

In [None]:
# Add a column computed from existing columns.
# BMI = weight [kg] / (height [m]) ** 2.
# NOTE(review): '키' holds values such as 179, so it is presumably in
# centimetres and is divided by 100 to get metres -- confirm the unit.
df['BMI'] = df['체중'] / (df['키'] / 100) ** 2
df