# 데이터 구조

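- Series :         동일한 데이터 타입의 값을 갖는 1차원(1D) 배열(array)
- DataFrame :    2차원(2D) 테이블 구조. 각 컬럼은 서로 다른 데이터 타입을 가질 수 있음.
- Panel :         3차원(3D) 테이블 구조. (pandas 0.20에서 deprecated, 0.25에서 제거됨 — 현재는 MultiIndex를 가진 DataFrame 또는 xarray 사용을 권장)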

In [1]:
import pandas as pd
import numpy as np

## 객체 생성

### 1. Series

In [5]:
# Build a Series from a plain list; the np.nan entry makes the dtype float64.
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [7]:
s.index

RangeIndex(start=0, stop=6, step=1)

In [10]:
# Supply explicit string labels instead of the default integer index.
s2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
s2

d    4
b    7
a   -5
c    3
dtype: int64

In [11]:
s2['a']

-5

In [None]:
# Dictionary to Series

In [12]:
# Dict keys become the index labels and dict values become the data.
# NOTE(review): the recorded output lists the keys alphabetically; newer
# pandas versions are expected to keep the dict's insertion order — confirm.
europe = {'spain': 46.77, 'france': 66.03, 'germany': 80.62, 'norway': 5.084}
s3 = pd.Series(europe)
s3

france     66.030
germany    80.620
norway      5.084
spain      46.770
dtype: float64

### 2. DataFrame

In [3]:
# row, column 데이터 지정하여 생성

In [3]:
# Seven consecutive daily timestamps starting at 2016-10-01.
dates = pd.date_range('20161001', periods=7)
dates

DatetimeIndex(['2016-10-01', '2016-10-02', '2016-10-03', '2016-10-04',
               '2016-10-05', '2016-10-06', '2016-10-07'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# 7x4 frame of random values rounded to 2 decimals, with `dates` as the row
# index and columns labelled A-D.
df = pd.DataFrame(np.random.rand(7,4).round(2), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2016-10-01,0.49,0.38,0.58,0.33
2016-10-02,0.68,0.84,0.56,0.44
2016-10-03,0.78,0.08,0.31,0.83
2016-10-04,0.89,0.57,0.87,0.65
2016-10-05,0.37,0.59,0.73,0.11
2016-10-06,0.19,0.03,0.8,0.06
2016-10-07,0.41,0.99,0.76,0.76


In [6]:
# dictionary 를 dataframe으로 변환

In [13]:
# Raw per-country columns; the three lists line up by position.
names = [
    'United States', 'Australia', 'Japan', 'India', 'Russia',
    'Morocco', 'Egypt', 'Korea', 'China', 'England',
]
dr = [True, False, False, False, True, True, True, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45, 122, 397, 255]

# Column name -> column values, ready to be handed to pd.DataFrame.
my_dict = dict(country=names, drives_right=dr, cars_per_cap=cpc)

In [14]:
# Each dict key becomes a column; the rows get the default integer index.
cars = pd.DataFrame(my_dict)
cars

Unnamed: 0,cars_per_cap,country,drives_right
0,809,United States,True
1,731,Australia,False
2,588,Japan,False
3,18,India,False
4,200,Russia,True
5,70,Morocco,True
6,45,Egypt,True
7,122,Korea,True
8,397,China,True
9,255,England,True


In [15]:
# dataframe 구조 보기

In [16]:
cars.dtypes

cars_per_cap     int64
country         object
drives_right      bool
dtype: object

In [17]:
cars.shape

(10, 3)

### 데이터 조회

In [11]:
cars.head()

Unnamed: 0,cars_per_cap,country,drives_right
0,809,United States,True
1,731,Australia,False
2,588,Japan,False
3,18,India,False
4,200,Russia,True


In [12]:
cars.tail(3)

Unnamed: 0,cars_per_cap,country,drives_right
7,122,Korea,True
8,397,China,True
9,255,England,True


In [13]:
cars.index   # index labels of the rows

RangeIndex(start=0, stop=10, step=1)

In [14]:
cars.columns   # column names

Index(['cars_per_cap', 'country', 'drives_right'], dtype='object')

In [15]:
cars.values   # all of the data as an array

array([[809, 'United States', True],
       [731, 'Australia', False],
       [588, 'Japan', False],
       [18, 'India', False],
       [200, 'Russia', True],
       [70, 'Morocco', True],
       [45, 'Egypt', True],
       [122, 'Korea', True],
       [397, 'China', True],
       [255, 'England', True]], dtype=object)

In [18]:
cars.T   # transposing data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
cars_per_cap,809,731,588,18,200,70,45,122,397,255
country,United States,Australia,Japan,India,Russia,Morocco,Egypt,Korea,China,England
drives_right,True,False,False,False,True,True,True,True,True,True


In [16]:
cars.describe()   # summary statistics

Unnamed: 0,cars_per_cap
count,10.0
mean,323.5
std,293.035929
min,18.0
25%,83.0
50%,227.5
75%,540.25
max,809.0


In [17]:
df.describe()

Unnamed: 0,A,B,C,D
count,7.0,7.0,7.0,7.0
mean,0.677143,0.395714,0.61,0.427143
std,0.233789,0.344522,0.395474,0.378581
min,0.25,0.05,0.03,0.02
25%,0.575,0.11,0.35,0.165
50%,0.75,0.34,0.73,0.26
75%,0.83,0.605,0.905,0.69
max,0.93,0.95,1.0,1.0


### Reindex

In [10]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [12]:
# Conform to the new labels; 'e' has no value in obj, so it comes back as NaN.
obj.reindex(['a', 'b', 'c', 'd', 'e'])

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [13]:
# Labels absent from obj ('e') are filled with 0 rather than NaN.
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

### 함수 적용

In [31]:
# Rebuild `df`: 7 daily rows starting 2016-10-11, random values rounded to
# 4 decimals, and columns in D, A, B, C order.
dates = pd.date_range('20161011', periods=7)
# Pass the DatetimeIndex itself. Wrapping it in a list (index=[dates]) is read
# as a list of index levels and produces a one-level MultiIndex on current
# pandas instead of a DatetimeIndex.
df = pd.DataFrame(np.random.rand(7,4).round(4), index=dates, columns=list('DABC'))
df

Unnamed: 0,D,A,B,C
2016-10-11,0.3747,0.1836,0.5974,0.9073
2016-10-12,0.0151,0.3498,0.259,0.6186
2016-10-13,0.1778,0.368,0.542,0.2396
2016-10-14,0.1791,0.8782,0.07,0.5307
2016-10-15,0.6715,0.6681,0.767,0.4739
2016-10-16,0.2078,0.8626,0.1624,0.2181
2016-10-17,0.1669,0.0549,0.715,0.3056


In [32]:
def f(x):
    """Return the spread (maximum minus minimum) of the values in *x*.

    Written for ``DataFrame.apply``, which calls it with one column at a
    time, or one row at a time when ``axis=1``.
    """
    return x.max() - x.min()

In [33]:
df.apply(f)

D    0.6564
A    0.8233
B    0.6970
C    0.6892
dtype: float64

In [34]:
df.apply(f, axis=1)

2016-10-11    0.7237
2016-10-12    0.6035
2016-10-13    0.3642
2016-10-14    0.8082
2016-10-15    0.2931
2016-10-16    0.7002
2016-10-17    0.6601
Freq: D, dtype: float64

In [35]:
def f2(x):
    """Return the smallest and largest value of *x* as a Series.

    The result has two entries, labelled 'min' and 'max' in that order.
    """
    labels = ['min', 'max']
    extremes = [x.min(), x.max()]
    return pd.Series(extremes, index=labels)

In [36]:
df.apply(f2)

Unnamed: 0,D,A,B,C
min,0.0151,0.0549,0.07,0.2181
max,0.6715,0.8782,0.767,0.9073


In [37]:
# dataframe의 실수값을 문자열로 일괄 변환

In [38]:
def f_form(x):
    """Format the number *x* as a string with exactly two decimal places."""
    return '%.2f' % x
df.applymap(f_form)

Unnamed: 0,D,A,B,C
2016-10-11,0.37,0.18,0.6,0.91
2016-10-12,0.02,0.35,0.26,0.62
2016-10-13,0.18,0.37,0.54,0.24
2016-10-14,0.18,0.88,0.07,0.53
2016-10-15,0.67,0.67,0.77,0.47
2016-10-16,0.21,0.86,0.16,0.22
2016-10-17,0.17,0.05,0.71,0.31


### 정렬

In [None]:
# sort_index

In [42]:
df.sort_index(axis=1)

Unnamed: 0,A,B,C,D
2016-10-11,0.1836,0.5974,0.9073,0.3747
2016-10-12,0.3498,0.259,0.6186,0.0151
2016-10-13,0.368,0.542,0.2396,0.1778
2016-10-14,0.8782,0.07,0.5307,0.1791
2016-10-15,0.6681,0.767,0.4739,0.6715
2016-10-16,0.8626,0.1624,0.2181,0.2078
2016-10-17,0.0549,0.715,0.3056,0.1669


In [41]:
df.sort_index(axis=1, ascending=False)   # reverse the column order

Unnamed: 0,D,C,B,A
2016-10-11,0.3747,0.9073,0.5974,0.1836
2016-10-12,0.0151,0.6186,0.259,0.3498
2016-10-13,0.1778,0.2396,0.542,0.368
2016-10-14,0.1791,0.5307,0.07,0.8782
2016-10-15,0.6715,0.4739,0.767,0.6681
2016-10-16,0.2078,0.2181,0.1624,0.8626
2016-10-17,0.1669,0.3056,0.715,0.0549


In [None]:
# sort_values

In [43]:
df.sort_values(by='C')

Unnamed: 0,D,A,B,C
2016-10-16,0.2078,0.8626,0.1624,0.2181
2016-10-13,0.1778,0.368,0.542,0.2396
2016-10-17,0.1669,0.0549,0.715,0.3056
2016-10-15,0.6715,0.6681,0.767,0.4739
2016-10-14,0.1791,0.8782,0.07,0.5307
2016-10-12,0.0151,0.3498,0.259,0.6186
2016-10-11,0.3747,0.1836,0.5974,0.9073


In [20]:
cars.sort_values(by='country')   # sort the rows by the values of one column

Unnamed: 0,cars_per_cap,country,drives_right
1,731,Australia,False
8,397,China,True
6,45,Egypt,True
9,255,England,True
3,18,India,False
2,588,Japan,False
7,122,Korea,True
5,70,Morocco,True
4,200,Russia,True
0,809,United States,True


In [None]:
# rank

In [47]:
df.rank(method='first')   # ties are ranked by order of appearance (other methods: average, min, max)

Unnamed: 0,D,A,B,C
2016-10-11,6.0,2.0,5.0,7.0
2016-10-12,1.0,3.0,3.0,6.0
2016-10-13,3.0,4.0,4.0,2.0
2016-10-14,4.0,7.0,1.0,5.0
2016-10-15,7.0,5.0,7.0,4.0
2016-10-16,5.0,6.0,2.0,1.0
2016-10-17,2.0,1.0,6.0,3.0


In [48]:
df.rank(axis=1)

Unnamed: 0,D,A,B,C
2016-10-11,2.0,1.0,3.0,4.0
2016-10-12,1.0,3.0,2.0,4.0
2016-10-13,1.0,3.0,4.0,2.0
2016-10-14,2.0,4.0,1.0,3.0
2016-10-15,3.0,2.0,4.0,1.0
2016-10-16,2.0,4.0,1.0,3.0
2016-10-17,2.0,1.0,4.0,3.0


### Correlation and covariance