# pandas 데이터 구조

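- Series : 동일한 데이터 타입을 갖는 1차원(1D) 배열(array)
- DataFrame : 2차원(2D) 테이블 구조. 각 컬럼은 서로 다른 데이터 타입을 가질 수 있음.
- Panel : 3차원(3D) 테이블 구조. (pandas 0.20부터 deprecated, 0.25에서 제거됨 — 3차원 데이터는 MultiIndex를 가진 DataFrame 또는 xarray 사용 권장)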

In [1]:
import pandas as pd
import numpy as np

## 객체 생성

### 1. Series

In [2]:
# Build a Series from a plain list; the NaN entry makes the values float.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
s.index

RangeIndex(start=0, stop=6, step=1)

In [4]:
# Series with explicit index labels instead of the default 0..n-1 range.
s2 = pd.Series(
    data=[4, 7, -5, 3],
    index=['d', 'b', 'a', 'c'],
)
s2

d    4
b    7
a   -5
c    3
dtype: int64

In [5]:
s2['a']   # 인덱스로 값 선택

-5

In [6]:
# Dictionary to Series

In [7]:
# A dict converts to a Series with its keys as the index labels.
europe = {
    'spain': 46.77,
    'france': 66.03,
    'germany': 80.62,
    'norway': 5.084,
}
s3 = pd.Series(data=europe)
s3

spain      46.770
france     66.030
germany    80.620
norway      5.084
dtype: float64

### 2. DataFrame

In [8]:
# row, column 데이터 지정하여 생성

In [9]:
# Seven consecutive calendar days starting 2016-10-01 (daily frequency).
dates = pd.date_range(start='20161001', periods=7, freq='D')
dates

DatetimeIndex(['2016-10-01', '2016-10-02', '2016-10-03', '2016-10-04',
               '2016-10-05', '2016-10-06', '2016-10-07'],
              dtype='datetime64[ns]', freq='D')

In [10]:
# Fill a 7x4 frame with random values rounded to two decimals,
# indexed by the dates above and with columns A-D.
random_values = np.random.rand(7, 4).round(2)
df = pd.DataFrame(data=random_values, index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2016-10-01,0.34,0.81,0.12,0.7
2016-10-02,0.7,0.72,0.99,0.48
2016-10-03,0.46,0.32,0.31,0.53
2016-10-04,0.35,0.17,0.63,0.09
2016-10-05,0.69,0.1,0.84,0.11
2016-10-06,0.76,0.52,0.01,0.49
2016-10-07,0.87,0.14,0.42,0.67


In [11]:
# dictionary 를 dataframe으로 변환

In [12]:
# Column data: one entry per country, kept in the same order across all lists.
names = [
    'United States', 'Australia', 'Japan', 'India', 'Russia',
    'Morocco', 'Egypt', 'Korea', 'China', 'England',
]
dr = [
    True, False, False, False, True,
    True, True, True, True, True,
]
cpc = [
    809, 731, 588, 18, 200,
    70, 45, 122, 397, 255,
]

# Map each column name to its list of values.
my_dict = {'country': names, 'drives_right': dr, 'cars_per_cap': cpc}

In [13]:
# Each dict key becomes a column; rows get the default integer index.
cars = pd.DataFrame(data=my_dict)
cars

Unnamed: 0,country,drives_right,cars_per_cap
0,United States,True,809
1,Australia,False,731
2,Japan,False,588
3,India,False,18
4,Russia,True,200
5,Morocco,True,70
6,Egypt,True,45
7,Korea,True,122
8,China,True,397
9,England,True,255


In [14]:
# dataframe 구조 보기

In [15]:
cars.dtypes

country         object
drives_right      bool
cars_per_cap     int64
dtype: object

In [16]:
cars.shape

(10, 3)

### 데이터 조회

In [17]:
cars.head()

Unnamed: 0,country,drives_right,cars_per_cap
0,United States,True,809
1,Australia,False,731
2,Japan,False,588
3,India,False,18
4,Russia,True,200


In [18]:
cars.tail(3)

Unnamed: 0,country,drives_right,cars_per_cap
7,Korea,True,122
8,China,True,397
9,England,True,255


In [19]:
cars.index   # 각 행의 인덱스

RangeIndex(start=0, stop=10, step=1)

In [20]:
cars.columns   # 컬럼명

Index(['country', 'drives_right', 'cars_per_cap'], dtype='object')

In [21]:
cars.values   # 전체 데이터 조회

array([['United States', True, 809],
       ['Australia', False, 731],
       ['Japan', False, 588],
       ['India', False, 18],
       ['Russia', True, 200],
       ['Morocco', True, 70],
       ['Egypt', True, 45],
       ['Korea', True, 122],
       ['China', True, 397],
       ['England', True, 255]], dtype=object)

In [22]:
cars.T   # transposing data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
country,United States,Australia,Japan,India,Russia,Morocco,Egypt,Korea,China,England
drives_right,True,False,False,False,True,True,True,True,True,True
cars_per_cap,809,731,588,18,200,70,45,122,397,255


In [23]:
cars.describe()   # 요약된 통계 정보

Unnamed: 0,cars_per_cap
count,10.0
mean,323.5
std,293.035929
min,18.0
25%,83.0
50%,227.5
75%,540.25
max,809.0


In [24]:
df.describe()

Unnamed: 0,A,B,C,D
count,7.0,7.0,7.0,7.0
mean,0.595714,0.397143,0.474286,0.438571
std,0.210623,0.289178,0.363999,0.246335
min,0.34,0.1,0.01,0.09
25%,0.405,0.155,0.215,0.295
50%,0.69,0.32,0.42,0.49
75%,0.73,0.62,0.735,0.6
max,0.87,0.81,0.99,0.7


In [25]:
df.mean(axis='columns')   # mean across the columns, i.e. one value per row

2016-10-01    0.4925
2016-10-02    0.7225
2016-10-03    0.4050
2016-10-04    0.3100
2016-10-05    0.4350
2016-10-06    0.4450
2016-10-07    0.5250
Freq: D, dtype: float64

In [26]:
df.mean(axis='rows')  # mean down the rows, i.e. one value per column

A    0.595714
B    0.397143
C    0.474286
D    0.438571
dtype: float64

### Reindex

In [27]:
# Float Series with string labels in non-sorted order, used for reindexing below.
obj = pd.Series(
    data=[4.5, 7.2, -5.3, 3.6],
    index=['d', 'b', 'a', 'c'],
)
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [28]:
obj.reindex(['a', 'b', 'c', 'd', 'e'])

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [29]:
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

### 함수 적용

In [30]:
# Seven consecutive days starting 2016-10-11, used as the row index.
dates = pd.date_range('20161011', periods=7)
# Pass the DatetimeIndex itself as the index: wrapping it in a list
# (index=[dates]) makes pandas build a one-level MultiIndex instead of a
# DatetimeIndex.
# Columns are deliberately out of alphabetical order (D, A, B, C) so that the
# sort_index examples later on have something to reorder.
df = pd.DataFrame(np.random.rand(7, 4).round(4), index=dates, columns=list('DABC'))
df

Unnamed: 0,D,A,B,C
2016-10-11,0.4856,0.6504,0.6515,0.756
2016-10-12,0.0464,0.1939,0.0933,0.5243
2016-10-13,0.9671,0.6539,0.0632,0.7962
2016-10-14,0.9769,0.0816,0.0321,0.4844
2016-10-15,0.9636,0.2375,0.2832,0.7644
2016-10-16,0.3874,0.6429,0.6971,0.0344
2016-10-17,0.2235,0.2739,0.9958,0.4816


In [31]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()

In [32]:
df.apply(f)

D    0.9305
A    0.5723
B    0.9637
C    0.7618
dtype: float64

In [33]:
df.apply(f, axis=1)

2016-10-11    0.2704
2016-10-12    0.4779
2016-10-13    0.9039
2016-10-14    0.9448
2016-10-15    0.7261
2016-10-16    0.6627
2016-10-17    0.7723
dtype: float64

In [34]:
def f2(x):
    """Return the minimum and maximum of ``x`` as a Series labelled 'min' and 'max'."""
    lowest = x.min()
    highest = x.max()
    return pd.Series(data=[lowest, highest], index=['min', 'max'])

In [35]:
df.apply(f2)

Unnamed: 0,D,A,B,C
min,0.0464,0.0816,0.0321,0.0344
max,0.9769,0.6539,0.9958,0.7962


In [36]:
# dataframe의 실수값을 문자열로 일괄 변환

In [37]:
def f_form(x):
    """Format ``x`` as a string with two decimal places."""
    return '%.2f' % x

# Apply the formatter to every element of the frame.
# NOTE: DataFrame.applymap is deprecated in newer pandas releases (2.1+) in
# favour of DataFrame.map; it is kept here to match the pandas version this
# notebook was written for.
df.applymap(f_form)

Unnamed: 0,D,A,B,C
2016-10-11,0.49,0.65,0.65,0.76
2016-10-12,0.05,0.19,0.09,0.52
2016-10-13,0.97,0.65,0.06,0.8
2016-10-14,0.98,0.08,0.03,0.48
2016-10-15,0.96,0.24,0.28,0.76
2016-10-16,0.39,0.64,0.7,0.03
2016-10-17,0.22,0.27,1.0,0.48


### 정렬

In [38]:
# sort_index

In [39]:
df.sort_index(axis=1)

Unnamed: 0,A,B,C,D
2016-10-11,0.6504,0.6515,0.756,0.4856
2016-10-12,0.1939,0.0933,0.5243,0.0464
2016-10-13,0.6539,0.0632,0.7962,0.9671
2016-10-14,0.0816,0.0321,0.4844,0.9769
2016-10-15,0.2375,0.2832,0.7644,0.9636
2016-10-16,0.6429,0.6971,0.0344,0.3874
2016-10-17,0.2739,0.9958,0.4816,0.2235


In [40]:
df.sort_index(axis=1, ascending=False)   # 컬럼 순서 뒤집기

Unnamed: 0,D,C,B,A
2016-10-11,0.4856,0.756,0.6515,0.6504
2016-10-12,0.0464,0.5243,0.0933,0.1939
2016-10-13,0.9671,0.7962,0.0632,0.6539
2016-10-14,0.9769,0.4844,0.0321,0.0816
2016-10-15,0.9636,0.7644,0.2832,0.2375
2016-10-16,0.3874,0.0344,0.6971,0.6429
2016-10-17,0.2235,0.4816,0.9958,0.2739


In [41]:
# sort_values : 특정 컬럼의 값을 기준으로 정렬

In [42]:
df.sort_values(by='C')

Unnamed: 0,D,A,B,C
2016-10-16,0.3874,0.6429,0.6971,0.0344
2016-10-17,0.2235,0.2739,0.9958,0.4816
2016-10-14,0.9769,0.0816,0.0321,0.4844
2016-10-12,0.0464,0.1939,0.0933,0.5243
2016-10-11,0.4856,0.6504,0.6515,0.756
2016-10-15,0.9636,0.2375,0.2832,0.7644
2016-10-13,0.9671,0.6539,0.0632,0.7962


In [43]:
cars.sort_values(by='country')

Unnamed: 0,country,drives_right,cars_per_cap
1,Australia,False,731
8,China,True,397
6,Egypt,True,45
9,England,True,255
3,India,False,18
2,Japan,False,588
7,Korea,True,122
5,Morocco,True,70
4,Russia,True,200
0,United States,True,809


In [44]:
# rank

In [45]:
df.rank(method='first')   # ties are ranked in order of appearance (other methods include average, min, max)

Unnamed: 0,D,A,B,C
2016-10-11,4.0,6.0,5.0,5.0
2016-10-12,1.0,2.0,3.0,4.0
2016-10-13,6.0,7.0,2.0,7.0
2016-10-14,7.0,1.0,1.0,3.0
2016-10-15,5.0,3.0,4.0,6.0
2016-10-16,3.0,5.0,6.0,1.0
2016-10-17,2.0,4.0,7.0,2.0


In [46]:
df.rank(axis=1)

Unnamed: 0,D,A,B,C
2016-10-11,1.0,2.0,3.0,4.0
2016-10-12,1.0,3.0,2.0,4.0
2016-10-13,4.0,2.0,1.0,3.0
2016-10-14,4.0,2.0,1.0,3.0
2016-10-15,4.0,1.0,2.0,3.0
2016-10-16,2.0,3.0,4.0,1.0
2016-10-17,1.0,2.0,4.0,3.0
