## <strong> 8. Pandas 객체 생성 및 조작 </strong>

```Pandas``` 라이브러리 설치

In [1]:
!pip install pandas



In [4]:
import pandas as pd
import numpy as np

In [5]:
# Show the help documentation for the pandas module (IPython `?` syntax)
pd?

Type:        module
String form: <module 'pandas' from 'C:\\Users\\USER\\anaconda3\\Lib\\site-packages\\pandas\\__init__.py'>
File:        c:\users\user\anaconda3\lib\site-packages\pandas\__init__.py
Docstring:
pandas - a powerful data analysis and manipulation library for Python

**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.

Main Features
-------------
Here are just a few of the things that pandas does well:

  - Easy handling of missing data in floating point as well as non-floating
    point data.

In [9]:
# Check the installed pandas version
pd.__version__

'2.2.2'

### Pandas 객체: <strong> Series </strong>

In [11]:
# [+] Build a Series from a plain Python list.
# No index is passed, so the Series gets the default integer index.
ser = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
ser

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [19]:
# Pandas object attribute: .values exposes the Series data as a NumPy ndarray
vals = ser.values
vals

array([0.25, 0.5 , 0.75, 1.  ])

In [21]:
# Pandas object attribute: .index holds the labels of the Series.
# Printed twice: first as the index object itself, then as a plain Python list.
ind = ser.index
print(ind)
print(list(ind))

RangeIndex(start=0, stop=4, step=1)
[0, 1, 2, 3]


In [27]:
# [+] Label-based indexing: give every value an explicit string label,
# then look a single value up by its label.
ser = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=['a', 'b', 'c', 'd'],
)
print(ser, ser['c'], sep='\n')

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.75


### <strong> Dictionary와 Series 객체 </strong>

In [14]:
# Build a Series from a dictionary: the keys become the index labels.
# Population of each U.S. state.
population_dict = {
    'California': 38_332_521,
    'Texas': 26_448_193,
    'New York': 19_651_127,
    'Florida': 19_552_860,
    'Illinois': 12_882_135,
}

population = pd.Series(data=population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [16]:
# [+] Label-based slicing: from 'California' through 'New York'
# (the end label is included in the result)
population['California':'New York']

California    38332521
Texas         26448193
New York      19651127
dtype: int64

### Pandas 객체: <strong> DataFrame </strong>

In [19]:
# Area of each U.S. state, keyed by state name,
# then wrapped in a Series whose index is those names.
area_dict = {
    'California': 423_967,
    'Texas': 695_662,
    'New York': 141_297,
    'Florida': 170_312,
    'Illinois': 149_995,
}

area = pd.Series(data=area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [21]:
# [+] Combine the 'population' and 'area' Series into a single DataFrame.
# Each dictionary key becomes a column name.
states = pd.DataFrame(
    data={
        'population': population,
        'area': area,
    },
)
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [23]:
# Row labels (index) and column labels of the DataFrame, printed one per line
print(states.index, states.columns, sep='\n')

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Index(['population', 'area'], dtype='object')


In [50]:
# The DataFrame's data as a 2-D NumPy array (one row per state, one column per field)
states.values

array([[38332521,   423967],
       [26448193,   695662],
       [19651127,   141297],
       [19552860,   170312],
       [12882135,   149995]], dtype=int64)

In [27]:
# [+] Accessing a Series (one column) of the DataFrame.
# Both forms select the same column; only the last expression is displayed by the cell.
states['population'] # bracket access is the usual choice
states.population # dot access can break when the name contains a space, e.g. "state capital"

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: population, dtype: int64

#### NumPy 배열로부터 DataFrame 객체 생성
+ 예제, 오늘의 운세: 금전운(```money_fortune```), 연애운(```love_fortune```)

In [30]:
# [+] Create a 12x2 NumPy array of random floats
# (no seed is set in this cell, so the values change from run to run)
arr = np.random.rand(12, 2)
arr

array([[0.57899026, 0.84419026],
       [0.10791013, 0.36926494],
       [0.99364578, 0.43403608],
       [0.94108605, 0.7943297 ],
       [0.56780557, 0.1108114 ],
       [0.26299455, 0.22794823],
       [0.54694217, 0.64832938],
       [0.29813028, 0.94326486],
       [0.91804926, 0.09841683],
       [0.32900912, 0.50915737],
       [0.7170307 , 0.90405557],
       [0.08424724, 0.03014122]])

In [32]:
# Wrap the NumPy array in a DataFrame:
# one row per month label, one column per fortune type.
df = pd.DataFrame(
    data=arr,
    index=[
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
    ],
    columns=['money_fortune', 'love_fortune'],
)

df

Unnamed: 0,money_fortune,love_fortune
Jan,0.57899,0.84419
Feb,0.10791,0.369265
Mar,0.993646,0.434036
Apr,0.941086,0.79433
May,0.567806,0.110811
Jun,0.262995,0.227948
Jul,0.546942,0.648329
Aug,0.29813,0.943265
Sep,0.918049,0.098417
Oct,0.329009,0.509157
Nov,0.717031,0.904056
Dec,0.084247,0.030141


### <strong> Series 객체 조작 </strong>

#### Dictionary 스타일 조작

In [36]:
# Create a Series with string labels
ser = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=['a', 'b', 'c', 'd'],
)

print(ser)
# [+] `in` tests membership among the index labels (keys), not the stored values
print('a' in ser, 0.25 in ser, sep='\n')
# [+] .index and .keys() both give the index labels
print(ser.index, ser.keys(), sep='\n')
ser['a'] = 0.125    # [+] overwrite the value stored under an existing label
ser['e'] = 1.25     # [+] assigning to a new label adds an entry
print(ser)

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
True
False
Index(['a', 'b', 'c', 'd'], dtype='object')
Index(['a', 'b', 'c', 'd'], dtype='object')
a    0.125
b    0.500
c    0.750
d    1.000
e    1.250
dtype: float64


#### 배열 스타일 조작

In [39]:
print(ser['a':'c'])   # slicing by label
print(ser[(ser > 0.3) & (ser < 0.8)])   # boolean mask built from logical operations
# Fancy indexing: select several entries at once with a list of labels
ind = ['a', 'e']
print(ser[ind])

a    0.125
b    0.500
c    0.750
dtype: float64
b    0.50
c    0.75
dtype: float64
a    0.125
e    1.250
dtype: float64


#### Pandas 객체 인덱싱
+ 정수 기반 인덱싱(암묵적, implicit)
+ 레이블 기반 인덱싱(명시적, explicit)

In [46]:
# Select 'a', 'b', 'c' by slicing, in two ways (both print the same three rows)
print(ser[0:3])   # integer-based (implicit) indexing: stop position 3 is excluded
print(ser['a':'c'])     # label-based (explicit) indexing: stop label 'c' is included

a    0.125
b    0.500
c    0.750
dtype: float64
a    0.125
b    0.500
c    0.750
dtype: float64


### <strong> DataFrame 객체 조작 </strong>

In [44]:
# Access one specific Series (column) of the DataFrame.
# Bracket and attribute access select the same column; the cell displays the last expression.
states['area']
states.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [52]:
# [+] Add a new Series as a column (density = population / area)
states['density'] = states['population'] / states['area']
states

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


#### 인덱서: ```loc```, ```iloc```

In [6]:
# Create a Series whose labels are integers that differ from the positions
# (labels 1, 3, 5 sit at positions 0, 1, 2).
ser = pd.Series(
    data=['a', 'b', 'c'],
    index=[1, 3, 5],
)
ser

1    a
3    b
5    c
dtype: object

In [8]:
# loc indexer: label-based.
# loc[1] is the value stored under label 1; loc[1:3] slices by label and includes label 3.
print(ser.loc[1])
print(ser.loc[1:3])

a
1    a
3    b
dtype: object


In [10]:
# iloc indexer: integer-position-based.
# iloc[1] is the second element; iloc[1:3] takes positions 1 and 2 (stop position excluded).
print(ser.iloc[1])
print(ser.iloc[1:3])

b
3    b
5    c
dtype: object


In [42]:
# A DataFrame behaves like a 2-D array
print(states.values, '\n')        # get the underlying values
print(states.T, '\n')             # transpose (rows <-> columns)
print(states.iloc[:3, :2], '\n')  # integer-position slicing
print(states.loc[: 'Illinois', : 'population'])  # label-based slicing (end labels included)

[[38332521   423967]
 [26448193   695662]
 [19651127   141297]
 [19552860   170312]
 [12882135   149995]] 

            California     Texas  New York   Florida  Illinois
population    38332521  26448193  19651127  19552860  12882135
area            423967    695662    141297    170312    149995 

            population    area
California    38332521  423967
Texas         26448193  695662
New York      19651127  141297 

            population
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135


In [54]:
# Masking + fancy indexing:
# keep the rows whose density exceeds 100, and only the two listed columns.
states.loc[states['density'] > 100, ['population', 'density']]

Unnamed: 0,population,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [56]:
# Modify a value: iloc addresses the cell by integer position (row 0, column 2)
states.iloc[0, 2] = 90
states

Unnamed: 0,population,area,density
California,38332521,423967,90.0
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763
