# 10 Minutes to pandas

In [3]:
import pandas as pd
import numpy as np
# NumPy (short for "Numerical Python") is a Python package built for high-performance numerical computing

import matplotlib.pyplot as plt

## ㅇ Object Creation

### - creating series

In [35]:
# Build a Series from a plain list; the np.nan entry marks a missing value.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

In [5]:
# Display the Series; the notebook renders the value of a cell's last expression.
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### - creating data frame

In [6]:
# Six consecutive calendar days starting on 2019-01-13, used below as a row index.
dates = pd.date_range(start='20190113', periods=6)

In [7]:
# Show the generated DatetimeIndex (daily frequency, six entries).
dates

DatetimeIndex(['2019-01-13', '2019-01-14', '2019-01-15', '2019-01-16',
               '2019-01-17', '2019-01-18'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# Seed NumPy's global random generator so that re-running the notebook
# reproduces the same frame; without a seed every run draws different values.
np.random.seed(42)
# np.random.randn(rows, columns) generates an array of random numbers;
# here a 6x4 block, indexed by `dates` and labelled with columns A-D.
df = pd.DataFrame(np.random.randn(6, 4), index = dates, columns = list('ABCD'))

In [10]:
# Display the whole 6x4 frame.
# NOTE(review): this cell's execution count (10) is lower than that of the cell
# that builds df (11), and the values in the saved output differ from those shown
# by later cells -- the saved output appears to come from an earlier random draw.
df

Unnamed: 0,A,B,C,D
2019-01-13,-0.381957,0.149185,0.620113,0.44758
2019-01-14,-1.280569,0.370352,-0.204282,-1.498363
2019-01-15,0.864669,0.16296,0.559462,-1.423723
2019-01-16,0.055934,-0.275281,0.942675,-0.838611
2019-01-17,0.885674,-1.24826,-0.430887,1.169803
2019-01-18,0.002373,0.942446,-1.570134,-1.810926


In [26]:
# Build a frame from a dict of column-like objects; scalar entries are repeated
# for every row, and each column keeps its own dtype.
df2 = pd.DataFrame(
    data={
        'A': 1,                                    # scalar
        'B': pd.Timestamp('20190113'),             # single timestamp
        'C': pd.Series(data=1, index=list(range(6)), dtype='float32'),
        'D': np.array([3] * 6, dtype='int32'),
        'E': pd.Categorical(
            ["test", "train", "test", "train", "test2", "test3"]
        ),
        'F': 'foo',                                # scalar string
        'TEST': pd.date_range(start='20190114', periods=6),
    }
)
df2

Unnamed: 0,A,B,C,D,E,F,TEST
0,1,2019-01-13,1.0,3,test,foo,2019-01-14
1,1,2019-01-13,1.0,3,train,foo,2019-01-15
2,1,2019-01-13,1.0,3,test,foo,2019-01-16
3,1,2019-01-13,1.0,3,train,foo,2019-01-17
4,1,2019-01-13,1.0,3,test2,foo,2019-01-18
5,1,2019-01-13,1.0,3,test3,foo,2019-01-19


In [27]:
# Inspect the dtype of each column of df2.
df2.dtypes

A                int64
B       datetime64[ns]
C              float32
D                int32
E             category
F               object
TEST    datetime64[ns]
dtype: object

## ㅇ Viewing Data

In [37]:
# Show the first two rows of the frame.
df.head(2)

Unnamed: 0,A,B,C,D
2019-01-13,-0.956758,-1.123661,-0.865189,0.171262
2019-01-14,-0.161622,-0.164722,-0.801905,0.41537


In [38]:
# Show the last two rows of the frame.
df.tail(2)

Unnamed: 0,A,B,C,D
2019-01-17,0.251674,-1.453502,0.261507,1.089001
2019-01-18,-0.788986,-0.117928,1.553361,-0.492547


In [39]:
# Row labels of the frame (a DatetimeIndex).
df.index

DatetimeIndex(['2019-01-13', '2019-01-14', '2019-01-15', '2019-01-16',
               '2019-01-17', '2019-01-18'],
              dtype='datetime64[ns]', freq='D')

In [40]:
# Column labels of the frame.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [41]:
# The frame's data as a plain NumPy array, without the row/column labels.
# NOTE(review): recent pandas documentation recommends DataFrame.to_numpy()
# over .values -- confirm it is available in the installed pandas version.
df.values

array([[-0.95675829, -1.1236606 , -0.86518884,  0.17126151],
       [-0.16162232, -0.16472154, -0.80190528,  0.41536968],
       [ 1.00133893, -0.13429129,  0.48790031, -0.02267105],
       [ 0.55516963, -0.41799529,  1.9970863 ,  0.51253209],
       [ 0.25167384, -1.45350241,  0.26150723,  1.08900131],
       [-0.78898634, -0.11792819,  1.55336137, -0.49254707]])

In [44]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df.describe()
# mean: the arithmetic average
  # "average" can refer to several different measures;
  # the mean is the most basic one.
  # median: the value in the middle
  # mode: the value with the highest frequency
# std: standard deviation

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.016531,-0.568683,0.438794,0.278824
std,0.766368,0.577698,1.178481,0.534253
min,-0.956758,-1.453502,-0.865189,-0.492547
25%,-0.632145,-0.947244,-0.536052,0.025812
50%,0.045026,-0.291358,0.374704,0.293316
75%,0.479296,-0.141899,1.286996,0.488241
max,1.001339,-0.117928,1.997086,1.089001


In [46]:
df.T
# Transpose: swap rows and columns.

Unnamed: 0,2019-01-13 00:00:00,2019-01-14 00:00:00,2019-01-15 00:00:00,2019-01-16 00:00:00,2019-01-17 00:00:00,2019-01-18 00:00:00
A,-0.956758,-0.161622,1.001339,0.55517,0.251674,-0.788986
B,-1.123661,-0.164722,-0.134291,-0.417995,-1.453502,-0.117928
C,-0.865189,-0.801905,0.4879,1.997086,0.261507,1.553361
D,0.171262,0.41537,-0.022671,0.512532,1.089001,-0.492547


In [59]:
df.sort_index(axis=1, ascending=False)
# axis=0 refers to the rows, axis=1 to the columns.
# Open question: what if I want to reorder both axes?
# NOTE(review): presumably by chaining two calls, e.g.
# df.sort_index(axis=0, ascending=False).sort_index(axis=1, ascending=False) -- verify.

Unnamed: 0,D,C,B,A
2019-01-13,0.171262,-0.865189,-1.123661,-0.956758
2019-01-14,0.41537,-0.801905,-0.164722,-0.161622
2019-01-15,-0.022671,0.4879,-0.134291,1.001339
2019-01-16,0.512532,1.997086,-0.417995,0.55517
2019-01-17,1.089001,0.261507,-1.453502,0.251674
2019-01-18,-0.492547,1.553361,-0.117928,-0.788986


In [69]:
df.sort_values(by='B').head(3)
# The default is ascending=True, so this shows the three smallest values of B.

Unnamed: 0,A,B,C,D
2019-01-17,0.251674,-1.453502,0.261507,1.089001
2019-01-13,-0.956758,-1.123661,-0.865189,0.171262
2019-01-16,0.55517,-0.417995,1.997086,0.512532


In [73]:
df.sort_values(by='B', ascending = False).head(3)
# Open question: how do I sort by the values of a particular row instead?
# NOTE(review): presumably sort_values(by=<row label>, axis=1) -- verify
# against the pandas documentation.

Unnamed: 0,A,B,C,D
2019-01-18,-0.788986,-0.117928,1.553361,-0.492547
2019-01-15,1.001339,-0.134291,0.4879,-0.022671
2019-01-14,-0.161622,-0.164722,-0.801905,0.41537


## ㅇ Selection

### - Getting

In [119]:
df['A']
# Fetch only column A.
# How do I fetch several columns at once?
  # df['A', 'D'] and df['A':'D'] do not work, even though the slice form is the
  # same syntax used for fetching rows -- why?
  # NOTE(review): presumably [] treats a slice as a row selection and 'A', 'D'
  # as one tuple key -- confirm against the pandas indexing documentation.
  # Passing a list of labels, as in the .loc call below, selects several columns.
# NOTE: only the last expression of a cell is rendered, so the saved output
# shows just column D from the .iloc call; the two results above are discarded.
df.loc[:, ['A','C']]
df.iloc[:, [3]]

Unnamed: 0,D
2019-01-13,0.171262
2019-01-14,0.41537
2019-01-15,-0.022671
2019-01-16,0.512532
2019-01-17,1.089001
2019-01-18,-0.492547


In [75]:
# Slicing with [] selects rows by position: here the first three rows.
df[0:3]

Unnamed: 0,A,B,C,D
2019-01-13,-0.956758,-1.123661,-0.865189,0.171262
2019-01-14,-0.161622,-0.164722,-0.801905,0.41537
2019-01-15,1.001339,-0.134291,0.4879,-0.022671


In [82]:
# Slicing with date strings selects rows by label; as the output shows,
# both endpoints (01-14 and 01-15) are included.
df['2019-01-14':'2019-01-15']

Unnamed: 0,A,B,C,D
2019-01-14,-0.161622,-0.164722,-0.801905,0.41537
2019-01-15,1.001339,-0.134291,0.4879,-0.022671


In [90]:
df[0:1] # it seems that fetching rows with [] always requires a slice (range)

Unnamed: 0,A,B,C,D
2019-01-13,-0.956758,-1.123661,-0.865189,0.171262


### - Selection By Label

In [92]:
# Show the full frame again as a reference for the label-based selections below.
df

Unnamed: 0,A,B,C,D
2019-01-13,-0.956758,-1.123661,-0.865189,0.171262
2019-01-14,-0.161622,-0.164722,-0.801905,0.41537
2019-01-15,1.001339,-0.134291,0.4879,-0.022671
2019-01-16,0.55517,-0.417995,1.997086,0.512532
2019-01-17,0.251674,-1.453502,0.261507,1.089001
2019-01-18,-0.788986,-0.117928,1.553361,-0.492547


In [93]:
# Get a cross-section using a label: a single row label returns that row
# as a Series whose index is the column names.
df.loc[dates[0]]

A   -0.956758
B   -1.123661
C   -0.865189
D    0.171262
Name: 2019-01-13 00:00:00, dtype: float64

In [94]:
# Wrapping the label in a list keeps the result a one-row DataFrame
# instead of reducing it to a Series.
df.loc[[dates[0]]]

Unnamed: 0,A,B,C,D
2019-01-13,-0.956758,-1.123661,-0.865189,0.171262


In [103]:
# Select on both axes by label: all rows (:) and only columns A and B.
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2019-01-13,-0.956758,-1.123661
2019-01-14,-0.161622,-0.164722
2019-01-15,1.001339,-0.134291
2019-01-16,0.55517,-0.417995
2019-01-17,0.251674,-1.453502
2019-01-18,-0.788986,-0.117928


In [110]:
# Label slicing with .loc includes both endpoints: rows 2019-01-13 through
# 2019-01-15, restricted to columns A and D.
df.loc['20190113':'20190115',['A','D']]

Unnamed: 0,A,D
2019-01-13,-0.956758,0.171262
2019-01-14,-0.161622,0.41537
2019-01-15,1.001339,-0.022671


In [107]:
# A single row label reduces the dimensions of the returned object:
# the result is a Series holding columns A and D of that row.
df.loc['20190113',['A','D']]

A   -0.956758
D    0.171262
Name: 2019-01-13 00:00:00, dtype: float64

In [113]:
# A single row label plus a single column label returns a scalar value.
df.loc[dates[0],'C']

-0.8651888367153121

### - Selection by Position

In [120]:
# Select by integer position (0-based): rows 3 and 4, columns 2 and 3,
# i.e. columns C and D for 2019-01-16 and 2019-01-17.
df.iloc[[3, 4], [2, 3]]

Unnamed: 0,C,D
2019-01-16,1.997086,0.512532
2019-01-17,0.261507,1.089001


### - Boolean Indexing

In [124]:
# Keep the rows where column A or column C is positive; the two boolean
# masks are combined element-wise with |, each wrapped in parentheses.
df[(df.A > 0) | (df.C > 0)]

Unnamed: 0,A,B,C,D
2019-01-15,1.001339,-0.134291,0.4879,-0.022671
2019-01-16,0.55517,-0.417995,1.997086,0.512532
2019-01-17,0.251674,-1.453502,0.261507,1.089001
2019-01-18,-0.788986,-0.117928,1.553361,-0.492547


In [125]:
# Mask the whole frame with a boolean frame: the shape is kept and every
# value that is not greater than 0 shows up as NaN.
df[df>0]

Unnamed: 0,A,B,C,D
2019-01-13,,,,0.171262
2019-01-14,,,,0.41537
2019-01-15,1.001339,,0.4879,
2019-01-16,0.55517,,1.997086,0.512532
2019-01-17,0.251674,,0.261507,1.089001
2019-01-18,,,1.553361,


In [126]:
# Build df3 as a separate frame with an extra string column E.
# assign() returns a new DataFrame, so df itself is left unchanged.
df3 = df.assign(E=['one', 'two', 'three', 'four', 'five', 'six'])
df3

Unnamed: 0,A,B,C,D,E
2019-01-13,-0.956758,-1.123661,-0.865189,0.171262,one
2019-01-14,-0.161622,-0.164722,-0.801905,0.41537,two
2019-01-15,1.001339,-0.134291,0.4879,-0.022671,three
2019-01-16,0.55517,-0.417995,1.997086,0.512532,four
2019-01-17,0.251674,-1.453502,0.261507,1.089001,five
2019-01-18,-0.788986,-0.117928,1.553361,-0.492547,six


In [128]:
# Filter with isin(): keep only the rows whose E value is one of the
# listed labels ('two' or 'four').
df3[df3['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2019-01-14,-0.161622,-0.164722,-0.801905,0.41537,two
2019-01-16,0.55517,-0.417995,1.997086,0.512532,four


### - Setting