<a href="https://colab.research.google.com/github/weepingwillow2001/data_analysis_practice/blob/main/5%EC%9E%A5/5_1_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
from pandas import Series, DataFrame

# Column-oriented source data: each dict key becomes a column name and each
# equal-length list supplies that column's values.
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

# Build a DataFrame from the dict. No index is passed, so the rows are
# labelled 0..5 (see the expected output below).
frame = pd.DataFrame(data)
print(frame)
#     state  year  pop
# 0    Ohio  2000  1.5
# 1    Ohio  2001  1.7
# 2    Ohio  2002  3.6
# 3  Nevada  2001  2.4
# 4  Nevada  2002  2.9
# 5  Nevada  2003  3.2


# head() / tail(): print only the first / last 5 rows of the frame.
print(frame.head())
#     state  year  pop
# 0    Ohio  2000  1.5
# 1    Ohio  2001  1.7
# 2    Ohio  2002  3.6
# 3  Nevada  2001  2.4
# 4  Nevada  2002  2.9
print(frame.tail())
#     state  year  pop
# 1    Ohio  2001  1.7
# 2    Ohio  2002  3.6
# 3  Nevada  2001  2.4
# 4  Nevada  2002  2.9
# 5  Nevada  2003  3.2


# Build the DataFrame with an explicit column order.
frame = pd.DataFrame(data, columns=["year", "state", "pop"])
print(frame)
#    year   state  pop
# 0  2000    Ohio  1.5
# 1  2001    Ohio  1.7
# 2  2002    Ohio  3.6
# 3  2001  Nevada  2.4
# 4  2002  Nevada  2.9
# 5  2003  Nevada  3.2

# Build a DataFrame whose column list includes a column ("debt") that does
# not exist in the source data.
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
print(frame2)  # "debt" is absent from the source data, so it is filled with NaN
#    year   state  pop debt
# 0  2000    Ohio  1.5  NaN
# 1  2001    Ohio  1.7  NaN
# 2  2002    Ohio  3.6  NaN
# 3  2001  Nevada  2.4  NaN
# 4  2002  Nevada  2.9  NaN
# 5  2003  Nevada  3.2  NaN


# Accessing the columns of a DataFrame

print(frame2.columns)  # inspect the column names
# Index(['year', 'state', 'pop', 'debt'], dtype='object')
print(frame2["state"])  # select the "state" column by key
# 0      Ohio
# 1      Ohio
# 2      Ohio
# 3    Nevada
# 4    Nevada
# 5    Nevada
# Name: state, dtype: object
print(frame2.year)     # a column can also be read as an attribute
# 0    2000
# 1    2001
# 2    2002
# 3    2001
# 4    2002
# 5    2003
# Name: year, dtype: int64


# Accessing a single row with loc / iloc
print(frame2.loc[1])  # loc: select by index label (the row labelled 1)
# year     2001
# state    Ohio
# pop       1.7
# debt      NaN
# Name: 1, dtype: object
print(frame2.iloc[0]) # iloc: select by integer position (row 0)
# year     2000
# state    Ohio
# pop       1.5
# debt      NaN
# Name: 0, dtype: object


# Assign a scalar: the debt value of every row becomes 16.5.
frame2["debt"] = 16.5
print(frame2)
#    year   state  pop  debt
# 0  2000    Ohio  1.5  16.5
# 1  2001    Ohio  1.7  16.5
# 2  2002    Ohio  3.6  16.5
# 3  2001  Nevada  2.4  16.5
# 4  2002  Nevada  2.9  16.5
# 5  2003  Nevada  3.2  16.5
# Assign an array: debt becomes the integers 0 through 5, one per row.
import numpy as np
frame2["debt"] = np.arange(6)
print(frame2)
#    year   state  pop  debt
# 0  2000    Ohio  1.5     0
# 1  2001    Ohio  1.7     1
# 2  2002    Ohio  3.6     2
# 3  2001  Nevada  2.4     3
# 4  2002  Nevada  2.9     4
# 5  2003  Nevada  3.2     5



# Assign a Series with its own index: values are matched to rows by index
# label, so the rows whose labels are not in the Series (3, 4, 5) become NaN.
val = pd.Series([-1.2, -1.5, -1.7], index=[0, 1, 2])
frame2["debt"] = val
print(frame2)
#    year   state  pop  debt
# 0  2000    Ohio  1.5  -1.2
# 1  2001    Ohio  1.7  -1.5
# 2  2002    Ohio  3.6  -1.7
# 3  2001  Nevada  2.4   NaN
# 4  2002  Nevada  2.9   NaN
# 5  2003  Nevada  3.2   NaN


# Boolean comparison: create a new "eastern" column that is True on rows
# whose state is "Ohio" and False on all other rows.
frame2["eastern"] = frame2["state"] == "Ohio"
# Resulting frame (not printed by this script). debt still holds the values
# taken from val above, so only rows 3-5 are NaN:
#    year   state  pop  debt  eastern
# 0  2000    Ohio  1.5  -1.2     True
# 1  2001    Ohio  1.7  -1.5     True
# 2  2002    Ohio  3.6  -1.7     True
# 3  2001  Nevada  2.4   NaN    False
# 4  2002  Nevada  2.9   NaN    False
# 5  2003  Nevada  3.2   NaN    False


# Remove the column again with the del keyword.
del frame2["eastern"]


# Nested dict: the outer keys (states) become columns and the inner keys
# (years) become the row index.
populations = {"Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
               "Nevada": {2001: 2.4, 2002: 2.9}}
# Convert to a DataFrame; a year missing for a state (Nevada, 2000) is NaN.
frame3 = pd.DataFrame(populations)
print(frame3)
#       Ohio  Nevada
# 2000   1.5     NaN
# 2001   1.7     2.4
# 2002   3.6     2.9

# Swap rows and columns (transpose).
print(frame3.T)
#         2000  2001  2002
# Ohio     1.5   1.7   3.6
# Nevada   NaN   2.4   2.9

# Build the DataFrame with an explicit index: a label not present in the data
# (2003) yields a row of NaN, and a year not listed (2000) is left out.
frame4 = pd.DataFrame(populations, index=[2001, 2002, 2003])
print(frame4)
#       Ohio  Nevada
# 2001   1.7     2.4
# 2002   3.6     2.9
# 2003   NaN     NaN

# Give names to the DataFrame's index and columns axes.
frame3.index.name = "year"
frame3.columns.name = "state"
print(frame3)
# state  Ohio  Nevada
# year
# 2000    1.5     NaN
# 2001    1.7     2.4
# 2002    3.6     2.9

# Convert the DataFrame to a NumPy array.
# NOTE: this is a bare expression; a notebook shows its value as the cell
# result, but when run as a plain script the result is discarded.
frame3.to_numpy()
# array([[1.5, nan],
#        [1.7, 2.4],
#        [3.6, 2.9]])

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2
    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
    state  year  pop
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2
   year   state  pop
0  2000    Ohio  1.5
1  2001    Ohio  1.7
2  2002    Ohio  3.6
3  2001  Nevada  2.4
4  2002  Nevada  2.9
5  2003  Nevada  3.2
   year   state  pop debt
0  2000    Ohio  1.5  NaN
1  2001    Ohio  1.7  NaN
2  2002    Ohio  3.6  NaN
3  2001  Nevada  2.4  NaN
4  2002  Nevada  2.9  NaN
5  2003  Nevada  3.2  NaN
Index(['year', 'state', 'pop', 'debt'], dtype='object')
0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object
0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64
year     2001
state    Ohio
pop 

array([[1.5, nan],
       [1.7, 2.4],
       [3.6, 2.9]])