# Pandas

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Build a NumPy array and a pandas Series from the same list of ints.
li1 = [1, 2, 3]

ar1 = np.array(li1)
ar1 
# array([1, 2, 3])

# The Series repr additionally shows an index (0, 1, 2) and the dtype.
sr = pd.Series(li1)
sr
# 0    1
# 1    2
# 2    3
# dtype: int64

In [None]:
# One float among ints: both containers end up holding floats
# (the Series reports dtype float64).
li1 = [1, 2.0, 3]

ar1 = np.array(li1)
ar1 
# array([1., 2., 3.])

sr = pd.Series(li1)
sr
# 0    1.0
# 1    2.0
# 2    3.0
# dtype: float64

In [None]:
# int + float + str: the NumPy array turns every element into a string
# ('<U32'), whereas the Series keeps the values under dtype object.
li1 = [1, 2.0, '3']

ar1 = np.array(li1)
ar1 
# array(['1', '2.0', '3'], dtype='<U32')

sr = pd.Series(li1)
sr
# 0      1
# 1    2.0
# 2      3
# dtype: object

In [None]:
# Basic attributes of the Series; each trailing comment is the recorded result.
type(sr) # pandas.core.series.Series
sr.ndim # 1
sr.shape # (3,)
sr.size # 3
sr.dtype # dtype('O')

### 시리즈 원소별 자료형 확인

In [None]:
# Show the Python type of every element held by the Series, one per line.
for element_type in map(type, sr):
    print(element_type)
# <class 'int'>
# <class 'float'>
# <class 'str'>

### 시리즈 원소의 자료형 변환

In [None]:
# Cast every element to str; '+' then appends '1' to each string.
sr.astype(str) + '1'
# 0      11
# 1    2.01
# 2      31
# dtype: object

In [None]:
# Cast every element (including the str '3') to int, then add 1 element-wise.
sr.astype(int) + 1
# 0    2
# 1    3
# 2    4
# dtype: int64

In [None]:
# Cast every element to float, then add 1 element-wise.
sr.astype(float) + 1
# 0    2.0
# 1    3.0
# 2    4.0
# dtype: float64

In [None]:
# Cast to bool: every element of this Series comes out True.
sr.astype(bool)
# 0    True
# 1    True
# 2    True
# dtype: bool

### 시리즈 인덱스

In [None]:
# sr was created without an explicit index, so its index is a RangeIndex.
sr.index # RangeIndex(start=0, stop=3, step=1)

In [None]:
# Supply integer labels at construction time via the index argument.
pd.Series(li1, index=[1, 2, 3])
# 1      1
# 2    2.0
# 3      3
# dtype: object

In [None]:
# Float values are accepted as index labels too.
pd.Series(li1, index=[1.0, 2.0, 3.0])
# 1.0      1
# 2.0    2.0
# 3.0      3
# dtype: object

In [None]:
# String labels as the index.
pd.Series(li1, index=['a', 'b', 'c'])
# a      1
# b    2.0
# c      3
# dtype: object

In [None]:
# Duplicate labels are allowed: all three elements share the label 1.
pd.Series(li1, index=[1, 1, 1])
# 1      1
# 1    2.0
# 1      3
# dtype: object

### 시리즈 인덱스 변경

In [None]:
# Replace the index of an existing Series by assigning to .index.
sr.index = [1, 2, 3]
sr
# 1      1
# 2    2.0
# 3      3
# dtype: object

In [None]:
# Reassign the index with float labels.
sr.index = [1.0, 2.0, 3.0]
sr
# 1.0      1
# 2.0    2.0
# 3.0      3
# dtype: object

In [None]:
# Reassign the index with string labels.
sr.index = ['a', 'b', 'c']
sr
# a      1
# b    2.0
# c      3
# dtype: object

In [None]:
# Reassign the index with duplicate labels; this is accepted as well.
sr.index = [1, 1, 1]
sr
# 1      1
# 1    2.0
# 1      3
# dtype: object

### 딕셔너리로 시리즈 생성

In [None]:
# Build a Series from a dict: the keys become the index labels and the
# values become the elements. This rebinds sr for the following sections.
di = {'a': 1, 'b': 2, 'c': 3}

sr = pd.Series(di)
sr
# a    1
# b    2
# c    3
# dtype: int64

### 시리즈 기본 인덱싱 및 슬라이싱

In [None]:
# Display the Series used in the indexing examples below.
sr
# a    1
# b    2
# c    3
# dtype: int64

In [None]:
# Three ways to read the first element.
# Plain [] with an integer on this string-labelled Series still returns the
# value but emits a FutureWarning (recorded below).
# NOTE(review): the warning presumably signals that this positional fallback
# will be removed in a future pandas release - prefer .iloc for positions.
sr[0]
# FutureWarning
# np.int64(1)

# Explicit position-based access.
sr.iloc[0]
# np.int64(1)

# Label-based access.
sr['a']
# np.int64(1)

In [None]:
# Integer slice with plain []: the stop value 3 is excluded, so elements at
# positions 0-2 are returned.
sr[0:3]
# a    1
# b    2
# c    3
# dtype: int64

In [None]:
# Label slice: unlike an integer slice, the end label 'c' is included.
sr['a':'c']
# a    1
# b    2
# c    3
# dtype: int64

In [None]:
# A Series whose labels are not in alphabetical order, used below to show
# how label slicing behaves.
sr0 = pd.Series([1, 2, 3], index=['c', 'a', 'b'])
sr0
# c    1
# a    2
# b    3
# dtype: int64

In [None]:
# (Integer indexes are left out of this discussion.)
# Label slices follow the order in which the labels appear in the index,
# not alphabetical order: in sr0 'a' comes after 'c', so 'a':'c' is empty
# while 'c':'a' returns both elements.

sr0['a':'c']
# Series([], dtype: int64)

sr0['c':'a']
# c    1
# a    2
# dtype: int64

### 시리즈 고급 인덱싱

In [None]:
# Fancy indexing: pass a list to [] to pick several elements at once.
# A list of labels is used here. A list of integers (sr[[0, 2]]) on this
# string-labelled Series relies on the same deprecated positional fallback
# that makes sr[0] emit a FutureWarning; positions belong with .iloc.
sr[['a', 'c']]
# a    1
# c    3
# dtype: int64

In [None]:
# Boolean indexing: a list of booleans keeps the elements marked True.
sr[[True, False, False]]
# a    1
# dtype: int64

# A comparison yields a boolean Series that can be used as the mask directly.
sr[sr < 3]
# a    1
# b    2
# dtype: int64

### iloc 인덱서

In [None]:
# iloc: position-based indexer.
# A single integer returns the scalar element.
sr.iloc[0] 
# np.int64(1)

# Integer slice (stop position excluded).
sr.iloc[0:3]
# a    1
# b    2
# c    3
# dtype: int64

# A one-element list returns a Series rather than a scalar.
sr.iloc[[0]]
# a    1
# dtype: int64

# List of positions.
sr.iloc[[0, 2]]
# a    1
# c    3
# dtype: int64

# Boolean indexing with a boolean Series is not supported by iloc: the call
# below raises ValueError (intentional demonstration).
# NOTE(review): the message only rules out an "indexable" mask; a plain
# boolean list/array is presumably accepted - confirm against the pandas docs.
sr.iloc[sr < 3]
# ValueError: iLocation based boolean indexing cannot use an indexable as a mask

### loc 인덱서

In [None]:
# loc: label-based indexer.
# A single label returns the scalar element.
sr.loc['a']
# np.int64(1)

# Label slice (end label included).
sr.loc['a':'c']
# a    1
# b    2
# c    3
# dtype: int64

# A one-element list returns a Series rather than a scalar.
sr.loc[['a']]
# a    1
# dtype: int64

# List of labels.
sr.loc[['a', 'c']]
# a    1
# c    3
# dtype: int64

# Unlike the iloc example, loc accepts a boolean Series as a mask.
sr.loc[sr < 3]
# a    1
# b    2
# dtype: int64

### 인덱서 사용 이유

In [None]:
# A Series whose integer labels (1, 2, 3) do not match the positions (0, 1, 2).
sr1 = pd.Series(li1, index=range(1, 4))
sr1
# 1      1
# 2    2.0
# 3      3
# dtype: object

In [None]:
# When indexing a Series with plain [], it is best to think of it as if the
# loc indexer had been omitted: 0 is looked up as a label, and since sr1 has
# no label 0 this raises KeyError (intentional demonstration).
sr1[0]
# KeyError: 0

In [None]:
# Explicit indexers remove the ambiguity: position 0 and label 1 both refer
# to the first element.
sr1.iloc[0] # 1
sr1.loc[1] # 1

In [None]:
# A Series in which every element carries the same label, 1.
sr2 = pd.Series(li1, index=[1] * 3)
sr2
# 1      1
# 1    2.0
# 1      3
# dtype: object

In [None]:
# With duplicate labels, loc returns every element that matches the label
# (a Series here, not a scalar).
sr2.loc[1]
# 1      1
# 1    2.0
# 1      3
# dtype: object

In [None]:
# iloc still addresses exactly one element by position.
sr2.iloc[0] # 1

### 시리즈 원소 추가, 변경 및 삭제

In [None]:
# Starting state of sr before elements are added, changed and removed.
sr
# a    1
# b    2
# c    3
# dtype: int64

In [None]:
# Assigning through loc to a label that does not exist yet appends a new
# element.
sr.loc['d'] = 4
sr
# a    1
# b    2
# c    3
# d    4
# dtype: int64

In [None]:
# Assigning to an existing label overwrites that element.
sr.loc['d'] = 5
sr
# a    1
# b    2
# c    3
# d    5
# dtype: int64

In [None]:
# Remove the element labelled 'd'; the result of drop is assigned back to sr.
sr = sr.drop(index='d')
sr
# a    1
# b    2
# c    3
# dtype: int64

### 시리즈의 비교 연산

In [None]:
# A comparison against a scalar is applied element-wise and yields a boolean
# Series.
sr >= 1
# a    True
# b    True
# c    True
# dtype: bool

In [None]:
# Python's `and` works on the individual scalar elements...
for i in sr:
    print(i >= 1 and i < 3)
# True
# True
# False

# ...but not on whole Series: combining two boolean Series with `and` raises
# ValueError (intentional demonstration). The operators in the next section
# are the element-wise alternative.
sr >= 1 and sr < 3
# ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

### 비트 연산자

In [None]:
# Element-wise AND of two boolean Series; each comparison is parenthesised.
(sr >= 1) & (sr < 3)
# a     True
# b     True
# c    False
# dtype: bool

In [None]:
# Element-wise OR.
(sr >= 1) | (sr < 3)
# a    True
# b    True
# c    True
# dtype: bool

In [None]:
# Element-wise XOR: True only where exactly one of the two conditions holds.
(sr >= 1) ^ (sr < 3)
# a    False
# b    False
# c     True
# dtype: bool

In [None]:
# Element-wise NOT of the AND result.
~((sr >= 1) & (sr < 3))
# a    False
# b    False
# c     True
# dtype: bool

## 데이터프레임

### 데이터프레임 생성

In [None]:
# Nested list with mixed element types (int, float, str).
li2 = [[1, 2, 3], [4, 5.0, '6']]

# NumPy array: every element is converted to a string.
np.array(li2)
# array([['1', '2', '3'],
#        ['4', '5.0', '6']], dtype='<U32')

# DataFrame: each inner list becomes one row.
df = pd.DataFrame(li2)
df
#   0	1	2
# 0	1	2.0	3
# 1	4	5.0	6

In [None]:
# Basic attributes of the DataFrame; each trailing comment is the recorded
# result.
df.ndim # 2
df.shape # (2, 3)
df.index # RangeIndex(start=0, stop=2, step=1)
df.columns # RangeIndex(start=0, stop=3, step=1)

# Each column has its own dtype.
df.dtypes
# 0      int64
# 1    float64
# 2     object
# dtype: object

# The underlying values as a single NumPy array (dtype object here).
df.values
# array([[1, 2.0, 3],
#        [4, 5.0, '6']], dtype=object)

# Summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 2 entries, 0 to 1
# Data columns (total 3 columns):
#  #   Column  Non-Null Count  Dtype  
# ---  ------  --------------  -----  
#  0   0       2 non-null      int64  
#  1   1       2 non-null      float64
#  2   2       2 non-null      object 
# dtypes: float64(1), int64(1), object(1)
# memory usage: 180.0+ bytes

### 행이름과 열이름 변경

In [None]:
# Rename the rows by assigning new labels (1, 2) to .index.
df.index = range(1, 3)
df
#   0	1	2
# 1	1	2.0	3
# 2	4	5.0	6

In [None]:
# Rename the columns by assigning new labels to .columns.
df.columns = ['A', 'B', 'C']
df
#   A	B	C
# 1	1	2.0	3
# 2	4	5.0	6

In [None]:
# The same row and column labels can be given directly to the constructor.
pd.DataFrame(li2, index=range(1,3), columns=['A', 'B', 'C'])
#   A	B	C
# 1	1	2.0	3
# 2	4	5.0	6

### 데이터프레임 인덱싱

- iloc

In [None]:
# Display the DataFrame used in the indexing examples below.
df
#   A	B	C
# 1	1	2.0	3
# 2	4	5.0	6

In [None]:
# First row by position; the result is a Series named after the row label.
df.iloc[0, :]
# A      1
# B    2.0
# C      3
# Name: 1, dtype: object

In [None]:
# Row slice by position (stop position excluded).
df.iloc[0:2, :]
#   A	B	C
# 1	1	2.0	3
# 2	4	5.0	6

In [None]:
# A list of positions selects rows in the order given (here reversed).
df.iloc[[1, 0], :]
#   A	B	C
# 2	4	5.0	6
# 1	1	2.0	3

In [None]:
# First column by position; the result is a Series named after the column.
df.iloc[:, 0]
# 1    1
# 2    4
# Name: A, dtype: int64

In [None]:
# Column slice by position (stop position excluded).
df.iloc[:, 0:2]
#   A	B
# 1	1	2.0
# 2	4	5.0

In [None]:
# A list of positions selects columns in the order given.
df.iloc[:, [1, 0]]
#   B	A
# 1	2.0	1
# 2	5.0	4

- loc

In [None]:
# Row with label 1 (the first row, since the index was set to 1, 2).
df.loc[1, :]
# A      1
# B    2.0
# C      3
# Name: 1, dtype: object

In [None]:
# Row slice by label: the end label 2 is included.
df.loc[1:2, :]
#   A	B	C
# 1	1	2.0	3
# 2	4	5.0	6

In [None]:
# A list of labels selects rows in the order given.
df.loc[[2, 1], :]
#   A	B	C
# 2	4	5.0	6
# 1	1	2.0	3

In [None]:
# Single column by label; the result is a Series.
df.loc[:, 'A']
# 1    1
# 2    4
# Name: A, dtype: int64

In [None]:
# Column slice by label: the end label 'B' is included.
df.loc[:, 'A':'B']
#   A	B
# 1	1	2.0
# 2	4	5.0

In [None]:
# A list of labels selects columns in the order given.
df.loc[:, ['B', 'A']]
#   B	A
# 1	2.0	1
# 2	5.0	4

### 행 / 열 추가, 변경 및 삭제

In [None]:
# Assigning to a row label that does not exist (4) appends a new row.
# In the recorded output, column A is shown as float afterwards.
df.loc[4, :] = [7, 8, 9.0]
df
#   A	B	C
# 1	1.0	2.0	3
# 2	4.0	5.0	6
# 4	7.0	8.0	9.0

In [None]:
# Assigning to a column label that does not exist ('D') appends a new column.
df.loc[:, 'D'] = ['a', 'b', 'c']
df
#   A	B	C	D
# 1	1.0	2.0	3	a
# 2	4.0	5.0	6	b
# 4	7.0	8.0	9.0	c

In [None]:
# Overwrite a single cell addressed by row label and column label.
df.loc[4, 'D'] = 'd'
df
#   A	B	C	D
# 1	1.0	2.0	3	a
# 2	4.0	5.0	6	b
# 4	7.0	8.0	9.0	d

In [None]:
# Remove row 4 and column 'D' in one call; the result is assigned back to df.
df = df.drop(index=4, columns='D')
df
#   A	B	C
# 1	1.0	2.0	3
# 2	4.0	5.0	6