## <strong> 8. Pandas 객체 생성 및 조작 </strong>

`pandas` 라이브러리 설치

In [2]:
!pip install pandas



In [3]:
import pandas as pd
import numpy as np

In [27]:
# Show the help documentation for the pandas module (IPython `?` syntax)
pd?

Type:        module
String form: <module 'pandas' from 'C:\\Users\\neh17\\anaconda3\\Lib\\site-packages\\pandas\\__init__.py'>
File:        c:\users\neh17\anaconda3\lib\site-packages\pandas\__init__.py
Docstring:  
pandas - a powerful data analysis and manipulation library for Python

**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.

Main Features
-------------
Here are just a few of the things that pandas does well:

  - Easy handling of missing data in floating point as well as non-floating
    point data.

In [28]:
# Check the installed library version
pd.__version__

'2.2.2'

### Pandas 객체: <strong> Series </strong>

In [29]:
# [+] Build a Series from a plain Python list; a default 0..n-1 index is assigned
ser = pd.Series(data=[0.3, 0.5, 0.8, 1.0])
ser

0    0.3
1    0.5
2    0.8
3    1.0
dtype: float64

In [30]:
# Pandas object attribute: .values (the underlying data as a NumPy array)
vals = ser.values
vals

array([0.3, 0.5, 0.8, 1. ])

In [31]:
# Pandas object attribute: .index
ind = ser.index
# Show the index object itself, then its labels expanded into a plain list
print(ind, list(ind), sep="\n")

RangeIndex(start=0, stop=4, step=1)
[0, 1, 2, 3]


In [32]:
# [+] Label-based indexing: give every value an explicit string label
ser = pd.Series(data=[0.3, 0.5, 0.8, 1.0], index=["a", "b", "c", "d"])
# Print the whole Series, then the single value stored under label 'c'
print(ser, ser["c"], sep="\n")

a    0.3
b    0.5
c    0.8
d    1.0
dtype: float64
0.8


### <strong> Dictionary와 Series 객체 </strong>

In [33]:
# Build a Series from a dictionary: the keys become the index labels
population_dict = {
    # Population of each U.S. state
    "California": 38_332_521,
    "Texas": 26_448_193,
    "New York": 19_651_127,
    "Florida": 19_552_860,
    "Illinois": 12_882_135,
}

population = pd.Series(data=population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [34]:
# [+] Label-based slicing from 'California' through 'New York' (end label included)
population.loc["California":"New York"]

California    38332521
Texas         26448193
New York      19651127
dtype: int64

### Pandas 객체: <strong> DataFrame </strong>

In [35]:
# Area of each U.S. state, keyed by state name
area_dict = {
    "California": 423_967,
    "Texas": 695_662,
    "New York": 141_297,
    "Florida": 170_312,
    "Illinois": 149_995,
}

# The dictionary keys become the index labels of the Series
area = pd.Series(data=area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [36]:
# [+] Combine the 'population' and 'area' Series into a single DataFrame;
# each dictionary key becomes a column name
states = pd.DataFrame(
    data={
        "population": population,
        "area": area,
    }
)
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [37]:
# Row labels (index) followed by column labels of the DataFrame
print(states.index, states.columns, sep="\n")

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Index(['population', 'area'], dtype='object')


In [38]:
# [+] Pull a single column out of the DataFrame as a Series
states["area"]

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

#### NumPy 배열로부터 DataFrame 객체 생성
+ 예제, 오늘의 운세: 금전운(```money_fortune```), 연애운(```love_fortune```)

In [39]:
# [+] Create a 12x2 NumPy array of random floats drawn uniformly from [0, 1)
arr = np.random.random_sample((12, 2))
arr

array([[8.14557217e-02, 7.83135231e-01],
       [1.62356672e-04, 3.52190816e-01],
       [2.58112147e-01, 6.14238799e-01],
       [8.34748525e-01, 8.20791067e-01],
       [4.74032714e-01, 5.52704561e-01],
       [1.01602369e-02, 1.33003753e-01],
       [4.10693949e-01, 2.54098895e-01],
       [7.73403434e-03, 4.81010792e-01],
       [1.73152217e-01, 7.99790094e-01],
       [4.71617479e-02, 8.13260418e-01],
       [7.92519311e-01, 3.56938906e-01],
       [5.27264139e-01, 7.07685459e-02]])

In [40]:
# Wrap the NumPy array in a DataFrame: one row per month, one column per fortune
df = pd.DataFrame(
    data=arr,
    index=[
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    ],
    columns=["money_fortune", "love_fortune"],
)

df

Unnamed: 0,money_fortune,love_fortune
Jan,0.081456,0.783135
Feb,0.000162,0.352191
Mar,0.258112,0.614239
Apr,0.834749,0.820791
May,0.474033,0.552705
Jun,0.01016,0.133004
Jul,0.410694,0.254099
Aug,0.007734,0.481011
Sep,0.173152,0.79979
Oct,0.047162,0.81326
Nov,0.792519,0.356939
Dec,0.527264,0.070769


### <strong> Series 객체 조작 </strong>

#### Dictionary 스타일 조작

In [41]:
# Create a Series with string labels
ser = pd.Series(data=[0.25, 0.5, 0.75, 1.0], index=["a", "b", "c", "d"])

print(ser)
# [+] `in` tests membership among the index labels (keys) ...
print("a" in ser)
# [+] ... so a stored value such as 0.25 is not found this way
print(0.25 in ser)
# [+] The index labels
print(ser.index)
# [+] keys() gives the same labels as .index
print(ser.keys())
# [+] Assigning to a new label appends an element
ser["e"] = 1.25
# [+] Assigning to an existing label overwrites its value
ser["a"] = 0.3
print(ser)

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
True
False
Index(['a', 'b', 'c', 'd'], dtype='object')
Index(['a', 'b', 'c', 'd'], dtype='object')
a    0.30
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64


#### 배열 스타일 조작

In [42]:
# Slicing by label (the end label is included)
print(ser["a":"c"])
# Boolean mask: keep only the values strictly between 0.3 and 0.8
print(ser[(0.3 < ser) & (ser < 0.8)])
# Fancy indexing: select several labels at once with a list
ind = ["a", "e"]
print(ser[ind])

a    0.30
b    0.50
c    0.75
dtype: float64
b    0.50
c    0.75
dtype: float64
a    0.30
e    1.25
dtype: float64


#### Pandas 객체 인덱싱
+ 정수 기반 인덱싱(암묵적, implicit)
+ 레이블 기반 인덱싱(명시적, explicit)

In [43]:
# Select 'a', 'b', 'c' with slicing, in two ways
print(ser[0:3])   # integer-based indexing: positions 0-2, stop position excluded
print(ser['a':'c'])     # label-based indexing: end label 'c' included

a    0.30
b    0.50
c    0.75
dtype: float64
a    0.30
b    0.50
c    0.75
dtype: float64


### <strong> DataFrame 객체 조작 </strong>

In [44]:
# Access a specific column as a Series: dictionary-style, then attribute-style
# (only the value of the cell's last expression is displayed)
states['area']
states.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [45]:
# [+] Add a derived column: density = population / area, computed row by row
states["density"] = states.population / states.area
states

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


#### 인덱서: ```loc```, ```iloc```

In [46]:
# Create a Series whose index labels are the integers 1, 3, 5
# (deliberately different from the positions 0, 1, 2)
ser = pd.Series(data=["a", "b", "c"], index=[1, 3, 5])
ser

1    a
3    b
5    c
dtype: object

In [47]:
# loc indexer: keys are interpreted as index labels
# -> the value at label 1, then the label slice 1..3 (end label included)
print(ser.loc[1], ser.loc[1:3], sep="\n")

a
1    a
3    b
dtype: object


In [48]:
# iloc indexer: keys are interpreted as integer positions
# -> the value at position 1, then positions 1 and 2 (stop position excluded)
print(ser.iloc[1], ser.iloc[1:3], sep="\n")

b
3    b
5    c
dtype: object


In [49]:
# A DataFrame can be handled like a 2-D array
print(states.values, '\n')        # underlying values as an array
print(states.T, '\n')             # transpose (rows and columns swapped)
print(states.iloc[:3, :2], '\n')  # integer-position slicing: first 3 rows, first 2 columns
print(states.loc[: 'Illinois', : 'population'])  # label-based slicing: end labels included

[[3.83325210e+07 4.23967000e+05 9.04139261e+01]
 [2.64481930e+07 6.95662000e+05 3.80187404e+01]
 [1.96511270e+07 1.41297000e+05 1.39076746e+02]
 [1.95528600e+07 1.70312000e+05 1.14806121e+02]
 [1.28821350e+07 1.49995000e+05 8.58837628e+01]] 

              California         Texas      New York       Florida  \
population  3.833252e+07  2.644819e+07  1.965113e+07  1.955286e+07   
area        4.239670e+05  6.956620e+05  1.412970e+05  1.703120e+05   
density     9.041393e+01  3.801874e+01  1.390767e+02  1.148061e+02   

                Illinois  
population  1.288214e+07  
area        1.499950e+05  
density     8.588376e+01   

            population    area
California    38332521  423967
Texas         26448193  695662
New York      19651127  141297 

            population
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135


In [50]:
# Masking + fancy indexing: rows whose density exceeds 100, limited to two columns
states.loc[states["density"] > 100, ["population", "density"]]

Unnamed: 0,population,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [56]:
# Modify a value: overwrite the cell at row position 0, column position 0
states.iloc[0, 0] = 2_656_589
states

Unnamed: 0,population,area,density
California,2656589,4239967,90.6
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763
