# [Pandas] DataFrame

## 1. 개요

### 1.1 라이브러리 임포트

| **라이브러리명** | **용도** | **별칭** | **import 구문 예시** |
| :--- | :--- | :--- | :--- |
| \*numpy | 수치형 자료 계산 및 연산 | np | import numpy as np |
| \*pandas | 정형 데이터 전처리 및 분석 | pd | import pandas as pd |
| matplotlib | 데이터 차트 시각화 | plt | import matplotlib.pyplot as plt |
| seaborn | 보다 개선된 데이터 시각화 | sns | import seaborn as sns |
| beautifulsoup4(bs4) | 웹 데이터 추출(web scraping) |   | from bs4 import ~~ |
| nltk | 자연어 데이터 전처리 |   | import nltk |
| scikit-learn | 전통적인 머신러닝 알고리즘 |   | from sklearn import ~~ |

### 1.2 pandas
- 데이터타입
    - Series : index, value
    - DataFrame : index, value, column

In [2]:
# Pandas는 Numpy를 기반으로 만들어졌다.
import pandas as pd
import numpy as np

### 1.3 예제 데이터 생성

In [None]:
# random

import random as r

r.random() : 0 이상 1 미만의 부동소수점 숫자를 리턴
r.randint(a, b) : a부터 b까지의 임의의 정수를 리턴
r.uniform(a, b) : a부터 b까지의 부동소수점 숫자를 리턴
np.random.randn() : 가우시안 표준 정규분포 (random 모듈에는 randn이 없음 → r.gauss(0, 1) 사용)
r.randrange(a, b, n) : range(a,b,n)에서 랜덤하게 숫자를 리턴
r.shuffle(l) : 안에 있는 요소의 순서를 제자리에서(in-place) 섞음 (반환값은 None)
r.choice(l) : 안에 있는 요소 중 하나를 리턴
r.sample(l,n) : 안에 있는 요소 중 n개를 리턴
r.seed() : 시드값 설정

In [None]:
# numpy

import numpy as np

np.array()
np.arange()
np.linspace()
reshape()
np.ones()
np.zeros()
np.eye()
np.empty()
np.full()
ndarray.flatten() / np.ravel()

In [None]:
# pandas

pd.date_range("20220101", periods=5)

---

## 2. 데이터프레임
- 시리즈의 집합

### 2.1 데이터 세팅
#### 2.1.1 데이터 저장하기

In [None]:
# 데이터 저장
df.to_csv("../data/ooo.csv", sep=",", encoding="utf-8") # utf-8-sig도 가능

#### 2.1.2  데이터 불러오기

In [None]:
# df = pd.read_확장자('파일명')

df = pd.read_excel('ooo.xlsx')   
df = pd.read_csv('ooo.csv', encoding='utf-8')  

# 한글이 깨지거나 오류가 생기면 cp949 / euc-kr로 인코딩 시도

In [None]:
# 자료를 읽기 시작할 행(header) 지정
# 인덱스(index_col)로 설정할 컬럼 설정
# 읽어올 엑셀의 컬럼(usecols) 지정

df = pd.read_excel('ooo.xlsx', header=2, index_col=0, usecols="A:C")  # A열을 인덱스로, B~C열을 데이터로 읽기

In [None]:
# df = pd.read_확장자('파일명')
① df_2 = pd.read_csv('ooo.csv', encoding='utf-8', index_col='기준 열이름')   
② df_2 = pd.read_csv('ooo.csv', encoding='utf-8').set_index('기준 열이름')   

#### 2.1.3 데이터 프레임 직접 생성

In [4]:
# 딕셔너리 형태로 데이터프레임 생성
df = pd.DataFrame({
        'A': ['1','2','3','4'],
        'B': ['5','6','7','8'],
        'C': ['9','10','11','12']
})

df

Unnamed: 0,A,B,C
0,1,5,9
1,2,6,10
2,3,7,11
3,4,8,12


In [5]:
# 딕셔너리의 리스트 형태로 데이터프레임 생성
df = pd.DataFrame([
    {'A':'1', 'B':'5', 'C':'9'},
    {'A':'2', 'B':'6', 'C':'10'},
    {'A':'3', 'B':'7', 'C':'11'},
    {'A':'4', 'B':'8', 'C':'12'}
])

df

Unnamed: 0,A,B,C
0,1,5,9
1,2,6,10
2,3,7,11
3,4,8,12


In [6]:
# 예제 데이터 생성 1 : 등간격 수열(np.linspace, 난수 아님)로 데이터프레임 생성

data = np.linspace(0, 1, 25).reshape(5,5)
index_sample = ['idx_1','idx_2','idx_3','idx_4','idx_5']
columns_sample = ['col_1','col_2','col_3','col_4','col_5']

df = pd.DataFrame(data, index=index_sample, columns=columns_sample)
df

Unnamed: 0,col_1,col_2,col_3,col_4,col_5
idx_1,0.0,0.041667,0.083333,0.125,0.166667
idx_2,0.208333,0.25,0.291667,0.333333,0.375
idx_3,0.416667,0.458333,0.5,0.541667,0.583333
idx_4,0.625,0.666667,0.708333,0.75,0.791667
idx_5,0.833333,0.875,0.916667,0.958333,1.0


In [15]:
# 난수 생성 후 데이터프레임 생성 2
# pd.DataFrame(data, index=oo, columns=xx)
df = pd.DataFrame(np.random.randint(50, 100, size=(10, 5)),
     index = [['1학년' for i in range(5)]+
              ['2학년' for i in range(5)],
              [str(i)+'반' for i in range(1, 6)] + 
              [str(i)+'반' for i in range(1, 6)]],
     columns = ['국', '영', '수', '사', '과'])
df

Unnamed: 0,Unnamed: 1,국,영,수,사,과
1학년,1반,62,78,96,91,61
1학년,2반,58,62,82,70,71
1학년,3반,91,86,82,97,75
1학년,4반,71,72,78,72,94
1학년,5반,88,78,64,66,69
2학년,1반,85,70,93,93,68
2학년,2반,68,65,77,98,96
2학년,3반,67,96,95,80,63
2학년,4반,72,64,58,86,54
2학년,5반,85,93,96,62,95


---

## 2. 전처리 방향설정
### 2.1 데이터프레임 정보 탐색
> df.head()   
> df.tail()   
> df.info()   
> df.dtypes   
> df.describe()   

In [4]:
# 예제 데이터 생성

data = np.linspace(0, 1, 25).reshape(5,5)
index_sample = ['idx_1','idx_2','idx_3','idx_4','idx_5']
columns_sample = ['col_1','col_2','col_3','col_4','col_5']

df = pd.DataFrame(data, index=index_sample, columns=columns_sample)
df

Unnamed: 0,col_1,col_2,col_3,col_4,col_5
idx_1,0.0,0.041667,0.083333,0.125,0.166667
idx_2,0.208333,0.25,0.291667,0.333333,0.375
idx_3,0.416667,0.458333,0.5,0.541667,0.583333
idx_4,0.625,0.666667,0.708333,0.75,0.791667
idx_5,0.833333,0.875,0.916667,0.958333,1.0


In [5]:
# 데이터프레임 상단 n개 행 확인
df.head(2)

Unnamed: 0,col_1,col_2,col_3,col_4,col_5
idx_1,0.0,0.041667,0.083333,0.125,0.166667
idx_2,0.208333,0.25,0.291667,0.333333,0.375


In [6]:
# 데이터프레임 하단 n개 행 확인
df.tail(2)

Unnamed: 0,col_1,col_2,col_3,col_4,col_5
idx_4,0.625,0.666667,0.708333,0.75,0.791667
idx_5,0.833333,0.875,0.916667,0.958333,1.0


In [7]:
# 데이터프레임 열 타입, 결측값 개수 확인
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, idx_1 to idx_5
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   col_1   5 non-null      float64
 1   col_2   5 non-null      float64
 2   col_3   5 non-null      float64
 3   col_4   5 non-null      float64
 4   col_5   5 non-null      float64
dtypes: float64(5)
memory usage: 240.0+ bytes


In [11]:
df.dtypes

col_1    float64
col_2    float64
col_3    float64
col_4    float64
col_5    float64
dtype: object

In [8]:
# 데이터프레임 기술통계 정보 확인
df.describe()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5
count,5.0,5.0,5.0,5.0,5.0
mean,0.416667,0.458333,0.5,0.541667,0.583333
std,0.329404,0.329404,0.329404,0.329404,0.329404
min,0.0,0.041667,0.083333,0.125,0.166667
25%,0.208333,0.25,0.291667,0.333333,0.375
50%,0.416667,0.458333,0.5,0.541667,0.583333
75%,0.625,0.666667,0.708333,0.75,0.791667
max,0.833333,0.875,0.916667,0.958333,1.0


### 2.2 인덱스, 컬럼, 값 확인

In [12]:
df.index

Index(['idx_1', 'idx_2', 'idx_3', 'idx_4', 'idx_5'], dtype='object')

In [13]:
df.columns

Index(['col_1', 'col_2', 'col_3', 'col_4', 'col_5'], dtype='object')

In [None]:
# 해당 컬럼의 중복을 제외한 값들 확인
df['컬럼명'].unique()

---

## 3. 데이터 전처리
> 1. 데이터 선택하기   
> 2. 정렬   
> 3. 필터링   
> 4. 결측치 처리   
> 5. 깊은 복사   
> 6. 데이터프레임 복사   
> 7. 생성   
> 8. 변경   
> 9. 연산   
> 10. 삭제   
> 11. pivot, pivot_table   
> 12. 데이터프레임 합치기   
> 13. 멀티인덱스, 멀티컬럼   
> 14. 상관계수   
> 15. iterrows    

### 3.1 데이터 선택하기

### 3.1.1 열 선택하기
> df['A']   
> df.A   
> df[['A']]   
> df[['A','C']]   

In [547]:
# 실습 데이터 생성

data = np.linspace(0, 1, 15).reshape(5,3)
index_sample = pd.date_range("20220101", periods=5)
columns_sample = ['A','B','C']

df = pd.DataFrame(data, index=index_sample, columns=columns_sample)
df

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


In [548]:
# 한 개 컬럼 선택
df['A']

2022-01-01    0.000000
2022-01-02    0.214286
2022-01-03    0.428571
2022-01-04    0.642857
2022-01-05    0.857143
Freq: D, Name: A, dtype: float64

In [549]:
# 한 개 컬럼 선택
df.A

2022-01-01    0.000000
2022-01-02    0.214286
2022-01-03    0.428571
2022-01-04    0.642857
2022-01-05    0.857143
Freq: D, Name: A, dtype: float64

In [550]:
df[['A']]

Unnamed: 0,A
2022-01-01,0.0
2022-01-02,0.214286
2022-01-03,0.428571
2022-01-04,0.642857
2022-01-05,0.857143


In [551]:
df[['A','C']]

Unnamed: 0,A,C
2022-01-01,0.0,0.142857
2022-01-02,0.214286,0.357143
2022-01-03,0.428571,0.571429
2022-01-04,0.642857,0.785714
2022-01-05,0.857143,1.0


---

### 3.1.2 행 선택하기

> df.loc[행이름]    
> df.loc[[행이름]]   
> df.loc[[행이름1, 행이름2]]   
> df[n:m] : 위치(정수) 슬라이스는 n부터 m-1까지, 라벨(인덱스 이름) 슬라이스는 끝 라벨 m까지 포함   

In [552]:
df

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


In [553]:
# df.loc[행이름]
df.loc['20220101']

A    0.000000
B    0.071429
C    0.142857
Name: 2022-01-01 00:00:00, dtype: float64

In [554]:
# df.loc[[행이름]]
df.loc[['20220101']]

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857


In [555]:
# df.loc[[행이름1, 행이름2]]
df.loc[['20220101','20220105']]

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-05,0.857143,0.928571,1.0


In [556]:
# df[n:m]
df['20220101':'20220105']

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


---

### 3.1.3 특정 값 조회하기
> df.loc[]   
> df.iloc[]   
> df.at[]   

- df.loc[]

In [557]:
# 행열 전체
df.loc[:,:]

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


In [558]:
# 행 3일~5일까지, 열 B부터 C까지
df.loc['20220103':'20220105', 'B':'C']

Unnamed: 0,B,C
2022-01-03,0.5,0.571429
2022-01-04,0.714286,0.785714
2022-01-05,0.928571,1.0


In [559]:
# 행 3일~5일까지, 열 B와 C
df.loc['20220103':'20220105', ['B','C']]

Unnamed: 0,B,C
2022-01-03,0.5,0.571429
2022-01-04,0.714286,0.785714
2022-01-05,0.928571,1.0


In [560]:
# 행 3일과 5일, 열 B부터 C까지
df.loc[['20220103','20220105'], 'B':'C']

Unnamed: 0,B,C
2022-01-03,0.5,0.571429
2022-01-05,0.928571,1.0


In [561]:
# 행 3일과 5일, 열 B와 C
df.loc[['20220103','20220105'], ['B','C']]

Unnamed: 0,B,C
2022-01-03,0.5,0.571429
2022-01-05,0.928571,1.0


In [562]:
# 행:1월3일, 열:C
df.loc['20220103', 'C']

0.5714285714285714

- df.at

In [563]:
# df.loc['행이름', '열이름']의 결과와 같음
df.at['20220103', 'C']

0.5714285714285714

- df.iloc

In [564]:
df

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


In [565]:
# 인덱스 번호 기준 3번째 행
df.iloc[3]

A    0.642857
B    0.714286
C    0.785714
Name: 2022-01-04 00:00:00, dtype: float64

In [566]:
# 인덱스 번호 기준 3번째 행, 1번째 열
df.iloc[3,1]

0.7142857142857142

In [567]:
# 인덱스 번호 기준 0~2번째 행, 0~2번째 열
df.iloc[0:3, 0:3]

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429


In [568]:
# 인덱스 번호 기준 2번째 행, 0번째/2번째 열
df.iloc[2,[0,2]]

A    0.428571
C    0.571429
Name: 2022-01-03 00:00:00, dtype: float64

In [569]:
# 인덱스 번호 기준 전체 행, 0번째/2번째 열
df.iloc[:,[0,2]]

Unnamed: 0,A,C
2022-01-01,0.0,0.142857
2022-01-02,0.214286,0.357143
2022-01-03,0.428571,0.571429
2022-01-04,0.642857,0.785714
2022-01-05,0.857143,1.0


In [570]:
# 인덱스 번호 기준 0번째/2번째 행, 2번째 열
df.iloc[[0,2], 2]

2022-01-01    0.142857
2022-01-03    0.571429
Freq: 2D, Name: C, dtype: float64

In [571]:
# 인덱스 번호 기준 0번째/2번째 행, 전체 열
df.iloc[[0,2], :]

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-03,0.428571,0.5,0.571429


---

### 3.2 정렬

In [572]:
df

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


In [573]:
# 오름차순
df.sort_values(by='A', inplace=False) # inplace : 덮어쓰기 여부 

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


In [574]:
# 내림차순
df.sort_values(by='A', ascending=False, inplace=False) # inplace : 덮어쓰기 여부 

Unnamed: 0,A,B,C
2022-01-05,0.857143,0.928571,1.0
2022-01-04,0.642857,0.714286,0.785714
2022-01-03,0.428571,0.5,0.571429
2022-01-02,0.214286,0.285714,0.357143
2022-01-01,0.0,0.071429,0.142857


In [575]:
# 내부열을 기준열로 설정
df.set_index('A', inplace=False)

Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.071429,0.142857
0.214286,0.285714,0.357143
0.428571,0.5,0.571429
0.642857,0.714286,0.785714
0.857143,0.928571,1.0


In [None]:
# df = pd.read_확장자('파일명')
① df_2 = pd.read_csv('ooo.csv', encoding='utf-8', index_col='기준 열 이름')   
② df_2 = pd.read_csv('ooo.csv', encoding='utf-8').set_index('기준 열 이름') 

---

### 3.3 필터링

> df[조건] or df.loc[조건]      
> df.isin()   
> str.contains()   
> 조건을 만족하는 갯수   
> df.isnull()   
> df.isnull().sum()   
> df.notnull()   
> df.notnull().sum()  

- df[조건] or df.loc[조건]

In [577]:
df

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


In [578]:
# 시리즈에 조건을 주면, boolean으로 반환
df['A'] > 0.5

2022-01-01    False
2022-01-02    False
2022-01-03    False
2022-01-04     True
2022-01-05     True
Freq: D, Name: A, dtype: bool

In [579]:
# 필터링
df[df['A'] > 0.5]

Unnamed: 0,A,B,C
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


In [580]:
df.loc[df['A'] > 0.5, :]

Unnamed: 0,A,B,C
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


In [581]:
# 데이터프레임에 조건을 주면, boolean으로 반환
df[['A','C']] > 0.5

Unnamed: 0,A,C
2022-01-01,False,False
2022-01-02,False,False
2022-01-03,False,True
2022-01-04,True,True
2022-01-05,True,True


In [582]:
# 필터링
df[df[['A','C']] > 0.5] # NaN : Not a Number

Unnamed: 0,A,B,C
2022-01-01,,,
2022-01-02,,,
2022-01-03,,,0.571429
2022-01-04,0.642857,,0.785714
2022-01-05,0.857143,,1.0


- df.isin()

In [583]:
df['E'] = ['가', '가', '나', '다', '라']
df

Unnamed: 0,A,B,C,E
2022-01-01,0.0,0.071429,0.142857,가
2022-01-02,0.214286,0.285714,0.357143,가
2022-01-03,0.428571,0.5,0.571429,나
2022-01-04,0.642857,0.714286,0.785714,다
2022-01-05,0.857143,0.928571,1.0,라


In [584]:
df['E'].isin(["가"])

2022-01-01     True
2022-01-02     True
2022-01-03    False
2022-01-04    False
2022-01-05    False
Freq: D, Name: E, dtype: bool

In [585]:
df['E'].isin(["가", "다"])

2022-01-01     True
2022-01-02     True
2022-01-03    False
2022-01-04     True
2022-01-05    False
Freq: D, Name: E, dtype: bool

In [586]:
# 필터링
df[df['E'].isin(["가", "다"])]

Unnamed: 0,A,B,C,E
2022-01-01,0.0,0.071429,0.142857,가
2022-01-02,0.214286,0.285714,0.357143,가
2022-01-04,0.642857,0.714286,0.785714,다


In [587]:
# 필터링
df.loc[df['E'].isin(["가", "다"]),:]

Unnamed: 0,A,B,C,E
2022-01-01,0.0,0.071429,0.142857,가
2022-01-02,0.214286,0.285714,0.357143,가
2022-01-04,0.642857,0.714286,0.785714,다


- str.contains() / sum / value_counts()

In [588]:
df

Unnamed: 0,A,B,C,E
2022-01-01,0.0,0.071429,0.142857,가
2022-01-02,0.214286,0.285714,0.357143,가
2022-01-03,0.428571,0.5,0.571429,나
2022-01-04,0.642857,0.714286,0.785714,다
2022-01-05,0.857143,0.928571,1.0,라


In [589]:
df['E'].str.contains('가')

2022-01-01     True
2022-01-02     True
2022-01-03    False
2022-01-04    False
2022-01-05    False
Freq: D, Name: E, dtype: bool

In [590]:
df[df['E'].str.contains('가')]

Unnamed: 0,A,B,C,E
2022-01-01,0.0,0.071429,0.142857,가
2022-01-02,0.214286,0.285714,0.357143,가


In [591]:
df.loc[df['E'].str.contains('가'), :]

Unnamed: 0,A,B,C,E
2022-01-01,0.0,0.071429,0.142857,가
2022-01-02,0.214286,0.285714,0.357143,가


- 조건을 만족하는 갯수

In [592]:
df

Unnamed: 0,A,B,C,E
2022-01-01,0.0,0.071429,0.142857,가
2022-01-02,0.214286,0.285714,0.357143,가
2022-01-03,0.428571,0.5,0.571429,나
2022-01-04,0.642857,0.714286,0.785714,다
2022-01-05,0.857143,0.928571,1.0,라


In [593]:
# 컬럼 E의 값의 갯수
df['E'].value_counts()

가    2
라    1
다    1
나    1
Name: E, dtype: int64

In [594]:
# '가'를 요소로 하는 것의 행의 갯수
sum(df['E'].str.contains('가'))

2

In [595]:
# 원상복구
del df['E']
df

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


- df.isnull() / df.isnull().sum()

In [596]:
df['D'] = np.nan
df.iloc[0,3] = 0.275414
df.loc['2022-01-03','D'] = 0.156769
df

Unnamed: 0,A,B,C,D
2022-01-01,0.0,0.071429,0.142857,0.275414
2022-01-02,0.214286,0.285714,0.357143,
2022-01-03,0.428571,0.5,0.571429,0.156769
2022-01-04,0.642857,0.714286,0.785714,
2022-01-05,0.857143,0.928571,1.0,


In [597]:
df.isnull()

Unnamed: 0,A,B,C,D
2022-01-01,False,False,False,False
2022-01-02,False,False,False,True
2022-01-03,False,False,False,False
2022-01-04,False,False,False,True
2022-01-05,False,False,False,True


In [598]:
df.isnull().sum()

A    0
B    0
C    0
D    3
dtype: int64

- df.notnull() / df.notnull().sum()

In [599]:
df.notnull()

Unnamed: 0,A,B,C,D
2022-01-01,True,True,True,True
2022-01-02,True,True,True,False
2022-01-03,True,True,True,True
2022-01-04,True,True,True,False
2022-01-05,True,True,True,False


In [600]:
df.notnull().sum()

A    5
B    5
C    5
D    2
dtype: int64

---

### 3.4 결측치 처리

> pd.to_numeric()   
> df.dropna()   
> df.fillna()    

In [None]:
# 'ignore' -> 만약 숫자로 변경할 수 없는 데이터라면 숫자로 변경하지 않고 원본 데이터를 그대로 반환합니다.
# 'coerce' -> 만약 숫자로 변경할 수 없는 데이터라면 기존 데이터를 지우고 NaN으로 설정하여 반환합니다.
# 'raise' -> 만약 숫자로 변경할 수 없는 데이터라면 에러를 일으키며 코드를 중단합니다.

pd.to_numeric(숫자로 변경할 대상, errors='coerce')

In [601]:
df

Unnamed: 0,A,B,C,D
2022-01-01,0.0,0.071429,0.142857,0.275414
2022-01-02,0.214286,0.285714,0.357143,
2022-01-03,0.428571,0.5,0.571429,0.156769
2022-01-04,0.642857,0.714286,0.785714,
2022-01-05,0.857143,0.928571,1.0,


In [602]:
df.dropna()

Unnamed: 0,A,B,C,D
2022-01-01,0.0,0.071429,0.142857,0.275414
2022-01-03,0.428571,0.5,0.571429,0.156769


In [603]:
df.fillna(1)

Unnamed: 0,A,B,C,D
2022-01-01,0.0,0.071429,0.142857,0.275414
2022-01-02,0.214286,0.285714,0.357143,1.0
2022-01-03,0.428571,0.5,0.571429,0.156769
2022-01-04,0.642857,0.714286,0.785714,1.0
2022-01-05,0.857143,0.928571,1.0,1.0


---

### 3.5 깊은 복사

In [604]:
# 단순 대입 (복사가 아니라 같은 객체를 참조) - 엄밀한 얕은 복사는 df.copy(deep=False)   
shallow_copy = df
shallow_copy

Unnamed: 0,A,B,C,D
2022-01-01,0.0,0.071429,0.142857,0.275414
2022-01-02,0.214286,0.285714,0.357143,
2022-01-03,0.428571,0.5,0.571429,0.156769
2022-01-04,0.642857,0.714286,0.785714,
2022-01-05,0.857143,0.928571,1.0,


In [605]:
del shallow_copy['D']
shallow_copy

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


In [606]:
# 원본 데이터에도 영향을 줌
df

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


In [607]:
# 깊은 복사 (deep copy)
deep_copy = df.copy()

In [608]:
del deep_copy['C']
deep_copy

Unnamed: 0,A,B
2022-01-01,0.0,0.071429
2022-01-02,0.214286,0.285714
2022-01-03,0.428571,0.5
2022-01-04,0.642857,0.714286
2022-01-05,0.857143,0.928571


In [609]:
# 원본 데이터에 영향을 미치지 않음
df

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


---

### 3.5 데이터프레임 복사
- df의 컬럼을 df_2에 복사할 경우

In [610]:
df

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


In [611]:
# 실습 데이터 생성

data = np.zeros(10).reshape(5,2)
index_sample = pd.date_range("20220101", periods=5)
columns_sample = ['가','나']

df_2 = pd.DataFrame(data, index=index_sample, columns=columns_sample)
df_2

Unnamed: 0,가,나
2022-01-01,0.0,0.0
2022-01-02,0.0,0.0
2022-01-03,0.0,0.0
2022-01-04,0.0,0.0
2022-01-05,0.0,0.0


In [612]:
# 인덱스가 같았을 때 사용 가능
copy_columns=['A','B']
df_2[copy_columns] = df[copy_columns]
df_2

Unnamed: 0,가,나,A,B
2022-01-01,0.0,0.0,0.0,0.071429
2022-01-02,0.0,0.0,0.214286,0.285714
2022-01-03,0.0,0.0,0.428571,0.5
2022-01-04,0.0,0.0,0.642857,0.714286
2022-01-05,0.0,0.0,0.857143,0.928571


In [613]:
# 컬럼명을 바꾸어 복사를 할때
df_2[['C','D']] = df[['A','B']]
df_2

Unnamed: 0,가,나,A,B,C,D
2022-01-01,0.0,0.0,0.0,0.071429,0.0,0.071429
2022-01-02,0.0,0.0,0.214286,0.285714,0.214286,0.285714
2022-01-03,0.0,0.0,0.428571,0.5,0.428571,0.5
2022-01-04,0.0,0.0,0.642857,0.714286,0.642857,0.714286
2022-01-05,0.0,0.0,0.857143,0.928571,0.857143,0.928571


In [614]:
# 원본데이터에 영향 없음
df

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


---

### 3.6 변경

> 컬럼명, 인덱스명 변경   
> 값 변경   
> 인덱스 초기화   
> 열의 순서 바꾸기   

- 컬럼명, 인덱스명 변경
- df.rename(columns={'before_1':'after_1'}, index={'before_2':'after_2'}, inplace=True)

In [651]:
# 원본 데이터 손실 없도록 깊은 복사
df_change = df.copy()
df_change

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


In [652]:
df_change.rename(columns={'A':'a', 'B':'b', 'C':'c'}, inplace=False)

Unnamed: 0,a,b,c
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


In [653]:
df_change.rename(index= 
    {df.index[0]:'1일차',
     df.index[1]:'2일차',
     df.index[2]:'3일차',
     df.index[3]:'4일차',
     df.index[4]:'5일차',
    }, inplace=False)

Unnamed: 0,A,B,C
1일차,0.0,0.071429,0.142857
2일차,0.214286,0.285714,0.357143
3일차,0.428571,0.5,0.571429
4일차,0.642857,0.714286,0.785714
5일차,0.857143,0.928571,1.0


- 값 변경
    - 데이터타입 : astype()
    - 데이터 변경 : replace()

In [654]:
df_change = df_change.astype(int)
df_change

Unnamed: 0,A,B,C
2022-01-01,0,0,0
2022-01-02,0,0,0
2022-01-03,0,0,0
2022-01-04,0,0,0
2022-01-05,0,0,1


In [655]:
df_change['C'] = df_change['C'].replace(1, 100)
df_change

Unnamed: 0,A,B,C
2022-01-01,0,0,0
2022-01-02,0,0,0
2022-01-03,0,0,0
2022-01-04,0,0,0
2022-01-05,0,0,100


- 인덱스 초기화

In [656]:
# drop=False : 기존 인덱스를 버리지 않고 'index' 컬럼으로 보존
df_change.reset_index(drop=False, inplace=False)

Unnamed: 0,index,A,B,C
0,2022-01-01,0,0,0
1,2022-01-02,0,0,0
2,2022-01-03,0,0,0
3,2022-01-04,0,0,0
4,2022-01-05,0,0,100


In [657]:
# drop=True : 기존 인덱스를 버리고 0부터 시작하는 새 인덱스로 교체
df_change.reset_index(drop=True, inplace=False)

Unnamed: 0,A,B,C
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,100


- 열의 순서 바꾸기

In [666]:
df_change

Unnamed: 0,A,B,C
2022-01-01,0,0,0
2022-01-02,0,0,0
2022-01-03,0,0,0
2022-01-04,0,0,0
2022-01-05,0,0,100


In [667]:
df_change = df_change[['B','A','C']]
df_change

Unnamed: 0,B,A,C
2022-01-01,0,0,0
2022-01-02,0,0,0
2022-01-03,0,0,0
2022-01-04,0,0,0
2022-01-05,0,0,100


In [668]:
df_change = pd.DataFrame(df_change, columns=['C','B','A'])
df_change

Unnamed: 0,C,B,A
2022-01-01,0,0,0
2022-01-02,0,0,0
2022-01-03,0,0,0
2022-01-04,0,0,0
2022-01-05,100,0,0


---

### 3.7 생성

#### 3.7.1 컬럼 만들기

In [620]:
df

Unnamed: 0,A,B,C
2022-01-01,0.0,0.071429,0.142857
2022-01-02,0.214286,0.285714,0.357143
2022-01-03,0.428571,0.5,0.571429
2022-01-04,0.642857,0.714286,0.785714
2022-01-05,0.857143,0.928571,1.0


In [621]:
# 기존 칼럼이 없으면 추가
# 기존 칼럼이 있으면 수정
# df['A'] = np.nan : NaN 값으로 새로운 열 채우기

df['E'] = ['가', '나', '다', '라', '마']
df

Unnamed: 0,A,B,C,E
2022-01-01,0.0,0.071429,0.142857,가
2022-01-02,0.214286,0.285714,0.357143,나
2022-01-03,0.428571,0.5,0.571429,다
2022-01-04,0.642857,0.714286,0.785714,라
2022-01-05,0.857143,0.928571,1.0,마


In [622]:
del df['E']

#### 3.7.2 `apply`

> sum / np.sum   
> mean / np.mean   
> std / np.std    
> min / np.min   
> max / np.max   
> 람다함수

In [623]:
# 합계
df['A'].apply('sum')

2.142857142857143

In [624]:
# Series.apply에 함수(np.sum)를 넘기면 원소별로 적용되어 각 값이 그대로 반환된다 (합계가 아님)
df['A'].apply(np.sum)

2022-01-01    0.000000
2022-01-02    0.214286
2022-01-03    0.428571
2022-01-04    0.642857
2022-01-05    0.857143
Freq: D, Name: A, dtype: float64

In [625]:
# 여러 열을 동시에 함수 적용
df[['A','C']].apply('sum')

A    2.142857
C    2.857143
dtype: float64

In [626]:
# 새로운 열 생성시 함께 사용 가능
df['총합'] = df[['A','B','C']].apply('sum', axis=1)
df

Unnamed: 0,A,B,C,총합
2022-01-01,0.0,0.071429,0.142857,0.214286
2022-01-02,0.214286,0.285714,0.357143,0.857143
2022-01-03,0.428571,0.5,0.571429,1.5
2022-01-04,0.642857,0.714286,0.785714,2.142857
2022-01-05,0.857143,0.928571,1.0,2.785714


In [627]:
del df['총합']

---

### 3.8 연산
- dtype이 숫자여야 연산이 가능하다

In [697]:
# 실습 데이터 생성

data = np.linspace(0, 1, 20).reshape(5,4)
index_sample = pd.date_range("20220101", periods=5)
columns_sample = ['A','B','C','D']

df = pd.DataFrame(data, index=index_sample, columns=columns_sample)
df

Unnamed: 0,A,B,C,D
2022-01-01,0.0,0.052632,0.105263,0.157895
2022-01-02,0.210526,0.263158,0.315789,0.368421
2022-01-03,0.421053,0.473684,0.526316,0.578947
2022-01-04,0.631579,0.684211,0.736842,0.789474
2022-01-05,0.842105,0.894737,0.947368,1.0


- df의 열 C를 열 D로 나누는 두 가지 방법

In [698]:
# case 1
df['C/D_1'] = df['C']/df['D']
df

Unnamed: 0,A,B,C,D,C/D_1
2022-01-01,0.0,0.052632,0.105263,0.157895,0.666667
2022-01-02,0.210526,0.263158,0.315789,0.368421,0.857143
2022-01-03,0.421053,0.473684,0.526316,0.578947,0.909091
2022-01-04,0.631579,0.684211,0.736842,0.789474,0.933333
2022-01-05,0.842105,0.894737,0.947368,1.0,0.947368


In [699]:
# case 2
df['C/D_2'] = df['C'].div(df['D'], axis=0)
df

Unnamed: 0,A,B,C,D,C/D_1,C/D_2
2022-01-01,0.0,0.052632,0.105263,0.157895,0.666667,0.666667
2022-01-02,0.210526,0.263158,0.315789,0.368421,0.857143,0.857143
2022-01-03,0.421053,0.473684,0.526316,0.578947,0.909091,0.909091
2022-01-04,0.631579,0.684211,0.736842,0.789474,0.933333,0.933333
2022-01-05,0.842105,0.894737,0.947368,1.0,0.947368,0.947368


- df의 다수의 컬럼을 하나의 컬럼으로 나누는 방법

In [700]:
df[['C/D_3','B/D']] = df[['C','B']].div(df['D'], axis=0)
df

Unnamed: 0,A,B,C,D,C/D_1,C/D_2,C/D_3,B/D
2022-01-01,0.0,0.052632,0.105263,0.157895,0.666667,0.666667,0.666667,0.333333
2022-01-02,0.210526,0.263158,0.315789,0.368421,0.857143,0.857143,0.857143,0.714286
2022-01-03,0.421053,0.473684,0.526316,0.578947,0.909091,0.909091,0.909091,0.818182
2022-01-04,0.631579,0.684211,0.736842,0.789474,0.933333,0.933333,0.933333,0.866667
2022-01-05,0.842105,0.894737,0.947368,1.0,0.947368,0.947368,0.947368,0.894737


---

### 3.9 삭제

- del : 한 번에 하나만 삭제할 수 있음
- drop : 여러개를 동시에 삭제할 수 있음

In [701]:
df

Unnamed: 0,A,B,C,D,C/D_1,C/D_2,C/D_3,B/D
2022-01-01,0.0,0.052632,0.105263,0.157895,0.666667,0.666667,0.666667,0.333333
2022-01-02,0.210526,0.263158,0.315789,0.368421,0.857143,0.857143,0.857143,0.714286
2022-01-03,0.421053,0.473684,0.526316,0.578947,0.909091,0.909091,0.909091,0.818182
2022-01-04,0.631579,0.684211,0.736842,0.789474,0.933333,0.933333,0.933333,0.866667
2022-01-05,0.842105,0.894737,0.947368,1.0,0.947368,0.947368,0.947368,0.894737


In [702]:
del df['C/D_1']
df

Unnamed: 0,A,B,C,D,C/D_2,C/D_3,B/D
2022-01-01,0.0,0.052632,0.105263,0.157895,0.666667,0.666667,0.333333
2022-01-02,0.210526,0.263158,0.315789,0.368421,0.857143,0.857143,0.714286
2022-01-03,0.421053,0.473684,0.526316,0.578947,0.909091,0.909091,0.818182
2022-01-04,0.631579,0.684211,0.736842,0.789474,0.933333,0.933333,0.866667
2022-01-05,0.842105,0.894737,0.947368,1.0,0.947368,0.947368,0.894737


In [703]:
# 컬럼 하나 삭제
df.drop(['C/D_2'], axis=1, inplace=True)
df

Unnamed: 0,A,B,C,D,C/D_3,B/D
2022-01-01,0.0,0.052632,0.105263,0.157895,0.666667,0.333333
2022-01-02,0.210526,0.263158,0.315789,0.368421,0.857143,0.714286
2022-01-03,0.421053,0.473684,0.526316,0.578947,0.909091,0.818182
2022-01-04,0.631579,0.684211,0.736842,0.789474,0.933333,0.866667
2022-01-05,0.842105,0.894737,0.947368,1.0,0.947368,0.894737


In [709]:
# 컬럼 여러개 삭제
df.drop(['C/D_3','B/D'], axis=1, inplace=True)
df

Unnamed: 0,A,B,C,D
2022-01-01,0.0,0.052632,0.105263,0.157895
2022-01-02,0.210526,0.263158,0.315789,0.368421
2022-01-03,0.421053,0.473684,0.526316,0.578947
2022-01-04,0.631579,0.684211,0.736842,0.789474
2022-01-05,0.842105,0.894737,0.947368,1.0


In [713]:
# 로우 하나 삭제
df.drop([df.index[-1]], axis=0, inplace=True)
df

Unnamed: 0,A,B,C,D
2022-01-01,0.0,0.052632,0.105263,0.157895
2022-01-02,0.210526,0.263158,0.315789,0.368421
2022-01-03,0.421053,0.473684,0.526316,0.578947
2022-01-04,0.631579,0.684211,0.736842,0.789474


In [716]:
# 로우 여러개 삭제
df.drop([df.index[0], df.index[1]], axis=0, inplace=False)

Unnamed: 0,A,B,C,D
2022-01-03,0.421053,0.473684,0.526316,0.578947
2022-01-04,0.631579,0.684211,0.736842,0.789474


---

### 3.11 pivot, pivot_table

**pivot**
- 피벗 필요 요소 : index, columns, values
- pivot_table과는 달리 연산 불가능

In [None]:
pivot_df = df.pivot(index='A' , columns='B' , values='C')
pivot_df

**pivot_table**
- 피벗테이블 기본 형태 : index, columns, values, aggfunc
- pivot과는 달리 연산 가능(aggfunc)

In [None]:
# 기준열로 정렬되며, 숫자데이터가 aggfunc으로 처리된다.
pivot_df = pd.pivot_table(df, index='기준 열이름', aggfunc=np.mean)

# Aggregation function == 집계 함수
# np.mean, np.max, np.min, len ... (평균이 기본 설정값)

- index, columns, values, aggfunc를 여러개 넣어 사용 가능하다

In [None]:
df.pivot_table(
    index=["A", "B", "C"],
    columns=["a", "b"],
    values=["1", "2"], 
    aggfunc=[np.sum, np.mean],
    fill_value=0, # NaN 0으로 처리
    margins=True) # 총계(All) 추가

---

### 3.12 데이터프레임 합치기

- pd.merge()
- df.join()
- pd.concat()
- append() (pandas 2.0에서 제거됨 → pd.concat() 사용)

#### 3.12.1 pd.merge()

- 두 데이터프레임에 기준이 되는 키값이 모두 있어야 한다

In [727]:
left = pd.DataFrame({
        'key' : ['K1','K2','K3','K4'],
        'A': ['A1','A2','A3','A4'],
        'B': ['B1','B2','B3','B4']
})

left

Unnamed: 0,key,A,B
0,K1,A1,B1
1,K2,A2,B2
2,K3,A3,B3
3,K4,A4,B4


In [729]:
right = pd.DataFrame({
        'key' : ['K2','K3','K4','K5'],
        'C': ['C1','C2','C3','C4'],
        'D': ['D1','D2','D3','D4']
})

right

Unnamed: 0,key,C,D
0,K2,C1,D1
1,K3,C2,D2
2,K4,C3,D3
3,K5,C4,D4


In [732]:
# how ='inner'가 기본 설정값 (교집합)
pd.merge(left,right, on='key')

Unnamed: 0,key,A,B,C,D
0,K2,A2,B2,C1,D1
1,K3,A3,B3,C2,D2
2,K4,A4,B4,C3,D3


In [733]:
# 왼쪽에 있는 데이터프레임 기준으로 데이터를 병합
pd.merge(left,right, how='left', on='key')

Unnamed: 0,key,A,B,C,D
0,K1,A1,B1,,
1,K2,A2,B2,C1,D1
2,K3,A3,B3,C2,D2
3,K4,A4,B4,C3,D3


In [734]:
# 오른쪽에 있는 데이터프레임 기준으로 데이터를 병합
pd.merge(left,right, how='right', on='key')

Unnamed: 0,key,A,B,C,D
0,K2,A2,B2,C1,D1
1,K3,A3,B3,C2,D2
2,K4,A4,B4,C3,D3
3,K5,,,C4,D4


In [735]:
# 합집합
pd.merge(left,right, how='outer', on='key')

Unnamed: 0,key,A,B,C,D
0,K1,A1,B1,,
1,K2,A2,B2,C1,D1
2,K3,A3,B3,C2,D2
3,K4,A4,B4,C3,D3
4,K5,,,C4,D4


#### 3.12.2 df.join()

- 두 데이터프레임의 기준열이 맞지 않아도 합치기 가능
- 인덱스를 기준으로 결합하며(기본값 how='left'), 열의 이름이 중복되면 lsuffix/rsuffix를 지정해야 함

In [744]:
A = pd.DataFrame({
        'key' : ['K1','K2','K3','K4'],
        'A': ['A1','A2','A3','A4'],
        'B': ['B1','B2','B3','B4']
})

A

Unnamed: 0,key,A,B
0,K1,A1,B1
1,K2,A2,B2
2,K3,A3,B3
3,K4,A4,B4


In [746]:
B = pd.DataFrame({
        'C': ['C1','C2','C3','C4'],
        'D': ['D1','D2','D3','D4']
})

B

Unnamed: 0,C,D
0,C1,D1
1,C2,D2
2,C3,D3
3,C4,D4


In [747]:
A.join(B)

Unnamed: 0,key,A,B,C,D
0,K1,A1,B1,C1,D1
1,K2,A2,B2,C2,D2
2,K3,A3,B3,C3,D3
3,K4,A4,B4,C4,D4


In [748]:
B.join(A)

Unnamed: 0,C,D,key,A,B
0,C1,D1,K1,A1,B1
1,C2,D2,K2,A2,B2
2,C3,D3,K3,A3,B3
3,C4,D4,K4,A4,B4


#### 3.12.3 pd.concat()
- pd.concat([A,B], axis=0)

In [749]:
A = pd.DataFrame({
        'key' : ['K1','K2','K3','K4'],
        'A': ['A1','A2','A3','A4'],
        'B': ['B1','B2','B3','B4']
})

A

Unnamed: 0,key,A,B
0,K1,A1,B1
1,K2,A2,B2
2,K3,A3,B3
3,K4,A4,B4


In [751]:
B = pd.DataFrame({
        'key' : ['K2','K3','K4','K5'],
        'C': ['C1','C2','C3','C4'],
        'D': ['D1','D2','D3','D4']
})

B

Unnamed: 0,key,C,D
0,K2,C1,D1
1,K3,C2,D2
2,K4,C3,D3
3,K5,C4,D4


In [752]:
# 행방향으로 병합
pd.concat([A, B], axis = 0)

Unnamed: 0,key,A,B,C,D
0,K1,A1,B1,,
1,K2,A2,B2,,
2,K3,A3,B3,,
3,K4,A4,B4,,
0,K2,,,C1,D1
1,K3,,,C2,D2
2,K4,,,C3,D3
3,K5,,,C4,D4


In [753]:
# 열방향으로 병합
pd.concat([A, B], axis = 1)

Unnamed: 0,key,A,B,key.1,C,D
0,K1,A1,B1,K2,C1,D1
1,K2,A2,B2,K3,C2,D2
2,K3,A3,B3,K4,C3,D3
3,K4,A4,B4,K5,C4,D4


In [87]:
# concat 옵션
a = pd.DataFrame(['A', 'B', 'C', 'D', 'E','F'], index=range(1, 7))
b = pd.DataFrame(['a', 'b', 'c', 'd', 'e'], index=range(1, 6))
c = pd.DataFrame(['ㄱ', 'ㄴ', 'ㄷ', 'ㄹ', 'ㅁ'], index=range(1, 6))

In [88]:
pd.concat([a,b,c])

Unnamed: 0,0
1,A
2,B
3,C
4,D
5,E
6,F
1,a
2,b
3,c
4,d


In [80]:
pd.concat([a,b,c], axis=1)

Unnamed: 0,0,0.1,0.2
1,A,a,ㄱ
2,B,b,ㄴ
3,C,c,ㄷ
4,D,d,ㄹ
5,E,e,ㅁ


In [93]:
# 교집합
pd.concat([a,b,c], axis=1, join='inner')

Unnamed: 0,0,0.1,0.2
1,A,a,ㄱ
2,B,b,ㄴ
3,C,c,ㄷ
4,D,d,ㄹ
5,E,e,ㅁ


In [91]:
# 합집합
pd.concat([a,b,c], axis=1, join='outer')

Unnamed: 0,0,0.1,0.2
1,A,a,ㄱ
2,B,b,ㄴ
3,C,c,ㄷ
4,D,d,ㄹ
5,E,e,ㅁ
6,F,,


In [94]:
# verify_integrity : 인덱스에 중복되는값이 있을 때, 에러처리
# ignore_index : 인덱스 초기화
# copy : 데이터 복사 여부 (False면 가능한 경우 복사하지 않음)
# axis : 0(행방향), 1(열방향)

pd.concat([a, b, c], verify_integrity=False, ignore_index=True, copy=False, axis=1)

Unnamed: 0,0,1,2
1,A,a,ㄱ
2,B,b,ㄴ
3,C,c,ㄷ
4,D,d,ㄹ
5,E,e,ㅁ
6,F,,


#### 3.12.4 append()

In [84]:
a

Unnamed: 0,0
1,A
2,B
3,C
4,D
5,E


In [85]:
a.append(b)

Unnamed: 0,0
1,A
2,B
3,C
4,D
5,E
1,a
2,b
3,c
4,d
5,e


In [96]:
a.append(b)

Unnamed: 0,0
1,A
2,B
3,C
4,D
5,E
6,F
1,a
2,b
3,c
4,d


---

### 3.13 멀티인덱스, 멀티컬럼
> multi index   
> multi columns   
> index / slicing   

- multi index

In [64]:
# Build sample scores: a two-level row index (2 grades x 5 classes),
# five subject columns, and random integers drawn from [50, 100).
df_1 = pd.DataFrame(
    np.random.randint(50, 100, size=(10, 5)),
    index=[
        ['1학년'] * 5 + ['2학년'] * 5,
        [f'{n}반' for n in range(1, 6)] * 2,
    ],
    columns=['국', '영', '수', '사', '과'],
)
df_1

Unnamed: 0,Unnamed: 1,국,영,수,사,과
1학년,1반,97,65,88,57,53
1학년,2반,60,74,77,60,95
1학년,3반,68,64,65,96,84
1학년,4반,90,69,55,50,82
1학년,5반,83,51,88,53,97
2학년,1반,81,70,53,85,51
2학년,2반,75,84,91,61,85
2학년,3반,75,63,90,57,98
2학년,4반,62,56,92,91,87
2학년,5반,83,93,65,99,91


In [65]:
# How to inspect the row index level by level.
# NOTE: get_level_values is a method; without a call this cell only displays the bound method.
# Pass a level to get its labels, e.g. df_1.index.get_level_values(0).
df_1.index.get_level_values

<bound method MultiIndex.get_level_values of MultiIndex([('1학년', '1반'),
            ('1학년', '2반'),
            ('1학년', '3반'),
            ('1학년', '4반'),
            ('1학년', '5반'),
            ('2학년', '1반'),
            ('2학년', '2반'),
            ('2학년', '3반'),
            ('2학년', '4반'),
            ('2학년', '5반')],
           )>

In [66]:
df_1.loc['1학년']

Unnamed: 0,국,영,수,사,과
1반,97,65,88,57,53
2반,60,74,77,60,95
3반,68,64,65,96,84
4반,90,69,55,50,82
5반,83,51,88,53,97


In [67]:
df_1.loc['1학년','1반']

국    97
영    65
수    88
사    57
과    53
Name: (1학년, 1반), dtype: int32

In [68]:
# Drop one level of the row MultiIndex (axis=0); level=0 removes the outer grade level.
df_1.droplevel(axis=0, level=0)

Unnamed: 0,국,영,수,사,과
1반,97,65,88,57,53
2반,60,74,77,60,95
3반,68,64,65,96,84
4반,90,69,55,50,82
5반,83,51,88,53,97
1반,81,70,53,85,51
2반,75,84,91,61,85
3반,75,63,90,57,98
4반,62,56,92,91,87
5반,83,93,65,99,91


- multi columns

In [69]:
# Build sample scores with subjects as rows and a two-level column index
# (2 grades x 5 classes); values are random integers drawn from [50, 100).
df_2 = pd.DataFrame(
    np.random.randint(50, 100, size=(5, 10)),
    index=['국', '영', '수', '사', '과'],
    columns=[
        ['1학년'] * 5 + ['2학년'] * 5,
        [f'{n}반' for n in range(1, 6)] * 2,
    ],
)
df_2

Unnamed: 0_level_0,1학년,1학년,1학년,1학년,1학년,2학년,2학년,2학년,2학년,2학년
Unnamed: 0_level_1,1반,2반,3반,4반,5반,1반,2반,3반,4반,5반
국,95,89,78,69,95,61,64,76,84,71
영,99,70,81,64,50,88,65,62,52,84
수,75,76,55,80,76,61,64,97,80,77
사,56,67,50,78,76,53,66,54,98,84
과,79,70,58,96,67,64,77,59,68,71


In [70]:
# How to inspect the column index level by level.
# NOTE: get_level_values is a method; without a call this cell only displays the bound method.
# Pass a level to get its labels, e.g. df_2.columns.get_level_values(0).
df_2.columns.get_level_values

<bound method MultiIndex.get_level_values of MultiIndex([('1학년', '1반'),
            ('1학년', '2반'),
            ('1학년', '3반'),
            ('1학년', '4반'),
            ('1학년', '5반'),
            ('2학년', '1반'),
            ('2학년', '2반'),
            ('2학년', '3반'),
            ('2학년', '4반'),
            ('2학년', '5반')],
           )>

In [71]:
df_2.columns.get_level_values(0)[1]

'1학년'

In [72]:
# df['level0 컬럼명']['level1 컬럼명']
df_2['1학년']['1반']

국    95
영    99
수    75
사    56
과    79
Name: 1반, dtype: int32

In [73]:
# df['level0 컬럼','level1 컬럼','level2 컬럼', ...]
df_2['1학년','1반']

국    95
영    99
수    75
사    56
과    79
Name: (1학년, 1반), dtype: int32

In [74]:
# 특정 레벨의 컬럼을 제거할 수 있다
df_2.droplevel(axis=1, level=0)

Unnamed: 0,1반,2반,3반,4반,5반,1반.1,2반.1,3반.1,4반.1,5반.1
국,95,89,78,69,95,61,64,76,84,71
영,99,70,81,64,50,88,65,62,52,84
수,75,76,55,80,76,61,64,97,80,77
사,56,67,50,78,76,53,66,54,98,84
과,79,70,58,96,67,64,77,59,68,71


- index, slicing

In [120]:
# Sample scores for the indexing/slicing examples: a two-level row index
# (2 grades x 5 classes), five subject columns, random integers in [50, 100).
df = pd.DataFrame(
    np.random.randint(50, 100, size=(10, 5)),
    index=[
        ['1학년'] * 5 + ['2학년'] * 5,
        [f'{n}반' for n in range(1, 6)] * 2,
    ],
    columns=['국', '영', '수', '사', '과'],
)
df

Unnamed: 0,Unnamed: 1,국,영,수,사,과
1학년,1반,84,73,61,91,55
1학년,2반,61,62,87,94,60
1학년,3반,77,57,58,91,99
1학년,4반,71,91,96,77,67
1학년,5반,72,67,71,72,64
2학년,1반,51,93,65,82,93
2학년,2반,67,60,97,61,52
2학년,3반,85,76,87,90,63
2학년,4반,71,51,75,90,62
2학년,5반,60,64,78,61,70


In [121]:
# 인덱스
df.loc['1학년','1반']

국    84
영    73
수    61
사    91
과    55
Name: (1학년, 1반), dtype: int32

In [122]:
# 인덱스
df[0:1]

Unnamed: 0,Unnamed: 1,국,영,수,사,과
1학년,1반,84,73,61,91,55


In [123]:
# 컬럼
df['국']

1학년  1반    84
     2반    61
     3반    77
     4반    71
     5반    72
2학년  1반    51
     2반    67
     3반    85
     4반    71
     5반    60
Name: 국, dtype: int32

In [124]:
# 컬럼
df[['국','영','수']]

Unnamed: 0,Unnamed: 1,국,영,수
1학년,1반,84,73,61
1학년,2반,61,62,87
1학년,3반,77,57,58
1학년,4반,71,91,96
1학년,5반,72,67,71
2학년,1반,51,93,65
2학년,2반,67,60,97
2학년,3반,85,76,87
2학년,4반,71,51,75
2학년,5반,60,64,78


In [125]:
# 인덱스 + 컬럼
df.loc['1학년',['국','영','수']]

Unnamed: 0,국,영,수
1반,84,73,61
2반,61,62,87
3반,77,57,58
4반,71,91,96
5반,72,67,71


In [126]:
# index + column selection, then reverse the row order with [::-1]
df.loc['1학년',['국']][::-1]

Unnamed: 0,국
5반,72
4반,71
3반,77
2반,61
1반,84


In [127]:
# positional selection: every second row ([::2]) and the columns at positions 0 and 3
df.iloc[::2, [0, 3]]

Unnamed: 0,Unnamed: 1,국,사
1학년,1반,84,91
1학년,3반,77,91
1학년,5반,72,72
2학년,2반,67,61
2학년,4반,71,90


In [128]:
# Third-grade scores with the same layout as `df`: a (grade, class) row index,
# five subject columns, random integers drawn from [50, 100).
df_2 = pd.DataFrame(
    np.random.randint(50, 100, size=(5, 5)),
    index=[
        ['3학년'] * 5,
        [f'{n}반' for n in range(1, 6)],
    ],
    columns=['국', '영', '수', '사', '과'],
)
df_2

Unnamed: 0,Unnamed: 1,국,영,수,사,과
3학년,1반,57,56,87,65,91
3학년,2반,69,82,61,66,73
3학년,3반,83,56,70,56,80
3학년,4반,52,79,77,50,68
3학년,5반,73,88,94,85,84


In [129]:
# DataFrame.append() was removed in pandas 2.0 (AttributeError on current versions);
# pd.concat() stacks the rows of df_2 under df and keeps the MultiIndex labels.
df = pd.concat([df, df_2])
df

Unnamed: 0,Unnamed: 1,국,영,수,사,과
1학년,1반,84,73,61,91,55
1학년,2반,61,62,87,94,60
1학년,3반,77,57,58,91,99
1학년,4반,71,91,96,77,67
1학년,5반,72,67,71,72,64
2학년,1반,51,93,65,82,93
2학년,2반,67,60,97,61,52
2학년,3반,85,76,87,90,63
2학년,4반,71,51,75,90,62
2학년,5반,60,64,78,61,70


In [130]:
df['평균'] = (df['국'] + df['영'] + df['수'] + df['사'] + df['과']) / 5
df

Unnamed: 0,Unnamed: 1,국,영,수,사,과,평균
1학년,1반,84,73,61,91,55,72.8
1학년,2반,61,62,87,94,60,72.8
1학년,3반,77,57,58,91,99,76.4
1학년,4반,71,91,96,77,67,80.4
1학년,5반,72,67,71,72,64,69.2
2학년,1반,51,93,65,82,93,76.8
2학년,2반,67,60,97,61,52,67.4
2학년,3반,85,76,87,90,63,80.2
2학년,4반,71,51,75,90,62,69.8
2학년,5반,60,64,78,61,70,66.6


---

### 3.14 통계
> 공분산 : df.cov()   
> 상관계수 (피어슨 상관계수): df.corr()   

#### 3.14.1 공분산
- 양의 상관관계면 값은 양수 (a 증가시, b 증가)
- 음의 상관관계면 값은 음수 (a 증가시, b 감소)

In [132]:
df.cov()

Unnamed: 0,국,영,수,사,과,평균
국,119.980952,-25.166667,-19.842857,59.2,-32.919048,20.250476
영,-25.166667,192.380952,15.785714,12.857143,18.404762,42.852381
수,-19.842857,15.785714,175.114286,-16.328571,-57.9,19.365714
사,59.2,12.857143,-16.328571,217.257143,-1.1,54.377143
과,-32.919048,18.404762,-57.9,-1.1,204.495238,26.19619
평균,20.250476,42.852381,19.365714,54.377143,26.19619,32.608381


#### 3.14.2 상관계수 (피어슨 상관계수)
- 두 변수 간 선형 상관관계의 방향과 강도를 -1 ~ 1 사이의 값으로 나타낸다 (상관관계가 곧 인과관계를 의미하지는 않는다)

In [None]:
# r이 -1.0과 -0.7 사이이면, 강한 음적 선형관계,
# r이 -0.7과 -0.3 사이이면, 뚜렷한 음적 선형관계,
# r이 -0.3과 -0.1 사이이면, 약한 음적 선형관계,
# r이 -0.1과 +0.1 사이이면, 거의 무시될 수 있는 선형관계,
# r이 +0.1과 +0.3 사이이면, 약한 양적 선형관계,
# r이 +0.3과 +0.7 사이이면, 뚜렷한 양적 선형관계,
# r이 +0.7과 +1.0 사이이면, 강한 양적 선형관계

In [133]:
# correlation의 약자
df.corr()

Unnamed: 0,국,영,수,사,과,평균
국,1.0,-0.165649,-0.136895,0.366673,-0.21016,0.323753
영,-0.165649,1.0,0.086005,0.062889,0.092791,0.54104
수,-0.136895,0.086005,1.0,-0.083714,-0.305968,0.256276
사,0.366673,0.062889,-0.083714,1.0,-0.005219,0.646048
과,-0.21016,0.092791,-0.305968,-0.005219,1.0,0.320798
평균,0.323753,0.54104,0.256276,0.646048,0.320798,1.0


In [135]:
df['국'].corr(df['영'])

-0.16564888206423067

---

### 3.15 iterrows

- 반복문에서 사용하며, 인덱스와 내용으로 나누어 받을 수 있다.   
- 내용에 있는 값에 접근하는 방법 : rows["열이름"] or rows.열이름   

In [None]:
# iterrows() yields one (index label, row contents) pair per DataFrame row.
for idx, rows in df.iterrows():
    print(idx)   # index label of the current row
    print(rows)  # contents of the current row