## Pandas

## 1. 데이터프레임 샘플 생성

#### 1.1 라이브러리 임포트

In [62]:
import pandas as pd
import numpy as np

#### 1.2 난수생성

In [63]:
# np.random.randint : 균일 분포의 정수 난수 1개 생성
# np.random.rand : 0부터 1사이의 균일 분포에서 난수 matrix array 생성
# np.random.randn : 가우시안 표준 정규 분포에서 난수 matrix array 생성

data = np.random.randn(6, 4)
data

array([[-1.20509962,  3.08285705, -1.65038886,  0.25568368],
       [-0.97799439,  0.77000224,  0.49909525, -0.6580701 ],
       [-0.09835748, -1.87752422, -0.56370239, -1.58255819],
       [-2.06883094, -0.54982246, -1.16854501,  0.5014808 ],
       [-1.03085806, -0.6971445 ,  0.33693029,  1.55306431],
       [-0.40787239, -0.18177718,  0.83802887, -1.80403408]])

#### 1.3 날짜 생성

In [64]:
dates = pd.date_range("20220101", periods=6)
dates

DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06'],
              dtype='datetime64[ns]', freq='D')

In [65]:
type(dates)

pandas.core.indexes.datetimes.DatetimeIndex

#### 1.4 데이터 프레임 생성
- 데이터 : 난수  행렬(6x4)
- 인덱스 : 날짜 (6X1)
- 칼럼 : A,B,C,D (1X4)

In [67]:
df = pd.DataFrame(data, index=dates, columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
2022-01-01,-1.2051,3.082857,-1.650389,0.255684
2022-01-02,-0.977994,0.770002,0.499095,-0.65807
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481
2022-01-05,-1.030858,-0.697145,0.33693,1.553064
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034


---

## 2.2 데이터프레임 정보 탐색
- df.head()
- df.tail()
- df.info()
- df.describe()

In [68]:
df.head()

Unnamed: 0,A,B,C,D
2022-01-01,-1.2051,3.082857,-1.650389,0.255684
2022-01-02,-0.977994,0.770002,0.499095,-0.65807
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481
2022-01-05,-1.030858,-0.697145,0.33693,1.553064


In [69]:
df.tail()

Unnamed: 0,A,B,C,D
2022-01-02,-0.977994,0.770002,0.499095,-0.65807
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481
2022-01-05,-1.030858,-0.697145,0.33693,1.553064
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034


In [70]:
# 데이터프레임의 기본 정보
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2022-01-01 to 2022-01-06
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 240.0 bytes


In [71]:
# 데이터프레임의 기술통계 정보 확인
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.964835,0.091098,-0.284764,-0.289072
std,0.684406,1.696678,0.99854,1.297657
min,-2.068831,-1.877524,-1.650389,-1.804034
25%,-1.161539,-0.660314,-1.017334,-1.351436
50%,-1.004426,-0.3658,-0.113386,-0.201193
75%,-0.550403,0.532057,0.458554,0.440032
max,-0.098357,3.082857,0.838029,1.553064


In [72]:
df.index

DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06'],
              dtype='datetime64[ns]', freq='D')

In [73]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

---

## 2.3 데이터 선택하기

In [74]:
df = pd.DataFrame(data, index=dates, columns=["A", "B", "C", "D"]) 
df

Unnamed: 0,A,B,C,D
2022-01-01,-1.2051,3.082857,-1.650389,0.255684
2022-01-02,-0.977994,0.770002,0.499095,-0.65807
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481
2022-01-05,-1.030858,-0.697145,0.33693,1.553064
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034


### 2.3.1 열 선택하기

In [75]:
# 한 개 컬럼 선택
df['A']

2022-01-01   -1.205100
2022-01-02   -0.977994
2022-01-03   -0.098357
2022-01-04   -2.068831
2022-01-05   -1.030858
2022-01-06   -0.407872
Freq: D, Name: A, dtype: float64

In [76]:
df.A

2022-01-01   -1.205100
2022-01-02   -0.977994
2022-01-03   -0.098357
2022-01-04   -2.068831
2022-01-05   -1.030858
2022-01-06   -0.407872
Freq: D, Name: A, dtype: float64

In [77]:
type(df['A'])

pandas.core.series.Series

In [78]:
df[['A']]

Unnamed: 0,A
2022-01-01,-1.2051
2022-01-02,-0.977994
2022-01-03,-0.098357
2022-01-04,-2.068831
2022-01-05,-1.030858
2022-01-06,-0.407872


In [79]:
type(df[['A']])

pandas.core.frame.DataFrame

In [80]:
df['A','B']

KeyError: ('A', 'B')

In [81]:
df[['A','B']]

Unnamed: 0,A,B
2022-01-01,-1.2051,3.082857
2022-01-02,-0.977994,0.770002
2022-01-03,-0.098357,-1.877524
2022-01-04,-2.068831,-0.549822
2022-01-05,-1.030858,-0.697145
2022-01-06,-0.407872,-0.181777


### 2.3.2 행 선택하기

In [82]:
df.loc['20220101']

A   -1.205100
B    3.082857
C   -1.650389
D    0.255684
Name: 2022-01-01 00:00:00, dtype: float64

### 2.3.3 offset index
- [n:m] : n부터 m-1 까지 
- 인덱스나 컬럼의 이름으로 slice 하는 경우는 끝을 포함한다

#### 2.3.3.1 case1 `df[ : ]`

In [84]:
df[:]

Unnamed: 0,A,B,C,D
2022-01-01,-1.2051,3.082857,-1.650389,0.255684
2022-01-02,-0.977994,0.770002,0.499095,-0.65807
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481
2022-01-05,-1.030858,-0.697145,0.33693,1.553064
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034


In [86]:
df[0:3]

Unnamed: 0,A,B,C,D
2022-01-01,-1.2051,3.082857,-1.650389,0.255684
2022-01-02,-0.977994,0.770002,0.499095,-0.65807
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558


In [88]:
df["20220101":"20220104"]

Unnamed: 0,A,B,C,D
2022-01-01,-1.2051,3.082857,-1.650389,0.255684
2022-01-02,-0.977994,0.770002,0.499095,-0.65807
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481


#### 2.3.3.2 case2 `df.loc[]`

In [90]:
df.loc[:,:]

Unnamed: 0,A,B,C,D
2022-01-01,-1.2051,3.082857,-1.650389,0.255684
2022-01-02,-0.977994,0.770002,0.499095,-0.65807
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481
2022-01-05,-1.030858,-0.697145,0.33693,1.553064
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034


In [92]:
df.loc['20220103':'20220106', 'B':'D']

Unnamed: 0,B,C,D
2022-01-03,-1.877524,-0.563702,-1.582558
2022-01-04,-0.549822,-1.168545,0.501481
2022-01-05,-0.697145,0.33693,1.553064
2022-01-06,-0.181777,0.838029,-1.804034


In [93]:
df.loc['20220103':'20220106', ['B','D']]

Unnamed: 0,B,D
2022-01-03,-1.877524,-1.582558
2022-01-04,-0.549822,0.501481
2022-01-05,-0.697145,1.553064
2022-01-06,-0.181777,-1.804034


In [94]:
df.loc[['20220103','20220106'], 'B':'D']

Unnamed: 0,B,C,D
2022-01-03,-1.877524,-0.563702,-1.582558
2022-01-06,-0.181777,0.838029,-1.804034


In [95]:
df.loc[['20220103','20220106'], ['B','D']]

Unnamed: 0,B,D
2022-01-03,-1.877524,-1.582558
2022-01-06,-0.181777,-1.804034


In [73]:
df.loc["20220527":"20220530", ['A','C']]

Unnamed: 0,A,C
2022-05-27,1.46144,-1.052474
2022-05-28,0.798987,1.234549
2022-05-29,-0.889393,-1.307586
2022-05-30,0.821736,0.137565


In [96]:
df.loc['20220103', 'C']

-0.5637023882883242

### 2.3.3 특정 위치의 값 선택하기 `df.at[]`

In [97]:
df.at['20220103', 'C']

-0.5637023882883242

### 2.3.4 인덱스 번호로 선택하기 `df.iloc[]`

- iloc : integer location 
    - 컴퓨터가 인식하는 인덱스 값으로 선택

In [98]:
df

Unnamed: 0,A,B,C,D
2022-01-01,-1.2051,3.082857,-1.650389,0.255684
2022-01-02,-0.977994,0.770002,0.499095,-0.65807
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481
2022-01-05,-1.030858,-0.697145,0.33693,1.553064
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034


In [99]:
df.iloc[3]

A   -2.068831
B   -0.549822
C   -1.168545
D    0.501481
Name: 2022-01-04 00:00:00, dtype: float64

In [103]:
df.iloc[3,1]

-0.5498224568693055

In [104]:
df.iloc[0:3, 0:3]

Unnamed: 0,A,B,C
2022-01-01,-1.2051,3.082857,-1.650389
2022-01-02,-0.977994,0.770002,0.499095
2022-01-03,-0.098357,-1.877524,-0.563702


In [105]:
df.iloc[2,[0,2]]

A   -0.098357
C   -0.563702
Name: 2022-01-03 00:00:00, dtype: float64

In [106]:
df.iloc[:,[0,2]]

Unnamed: 0,A,C
2022-01-01,-1.2051,-1.650389
2022-01-02,-0.977994,0.499095
2022-01-03,-0.098357,-0.563702
2022-01-04,-2.068831,-1.168545
2022-01-05,-1.030858,0.33693
2022-01-06,-0.407872,0.838029


In [107]:
df.iloc[[0,2], 2]

2022-01-01   -1.650389
2022-01-03   -0.563702
Freq: 2D, Name: C, dtype: float64

In [108]:
df.iloc[[0,2], :]

Unnamed: 0,A,B,C,D
2022-01-01,-1.2051,3.082857,-1.650389,0.255684
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558


---

## 2.4 데이터 정렬

In [111]:
# 오름차순

df.sort_values(by='A', inplace=True)
df

Unnamed: 0,A,B,C,D
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481
2022-01-01,-1.2051,3.082857,-1.650389,0.255684
2022-01-05,-1.030858,-0.697145,0.33693,1.553064
2022-01-02,-0.977994,0.770002,0.499095,-0.65807
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558


In [112]:
# 내림차순

df.sort_values(by='A', ascending=False, inplace=True)
df

Unnamed: 0,A,B,C,D
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034
2022-01-02,-0.977994,0.770002,0.499095,-0.65807
2022-01-05,-1.030858,-0.697145,0.33693,1.553064
2022-01-01,-1.2051,3.082857,-1.650389,0.255684
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481


In [113]:
# 원상복구

df = pd.DataFrame(data, index=dates, columns=["A", "B", "C", "D"]) 
df

Unnamed: 0,A,B,C,D
2022-01-01,-1.2051,3.082857,-1.650389,0.255684
2022-01-02,-0.977994,0.770002,0.499095,-0.65807
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481
2022-01-05,-1.030858,-0.697145,0.33693,1.553064
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034


---

## 2.5 condition

### 2.5.1 마스킹

In [127]:
df

Unnamed: 0,A,B,C,D
2022-01-01,-1.2051,3.082857,-1.650389,0.255684
2022-01-02,-0.977994,0.770002,0.499095,-0.65807
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481
2022-01-05,-1.030858,-0.697145,0.33693,1.553064
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034


In [122]:
# 양수만 선택

df['A'] > 0

2022-01-01    False
2022-01-02    False
2022-01-03    False
2022-01-04    False
2022-01-05    False
2022-01-06    False
Freq: D, Name: A, dtype: bool

In [123]:
df[['A','B']] > 0

Unnamed: 0,A,B
2022-01-01,False,True
2022-01-02,False,True
2022-01-03,False,False
2022-01-04,False,False
2022-01-05,False,False
2022-01-06,False,False


In [126]:
# 마스킹
df[df[['A','B']] > 0]

Unnamed: 0,A,B,C,D
2022-01-01,,3.082857,,
2022-01-02,,0.770002,,
2022-01-03,,,,
2022-01-04,,,,
2022-01-05,,,,
2022-01-06,,,,


In [125]:
# NaN : Not a Number

df[df > 0]

Unnamed: 0,A,B,C,D
2022-01-01,,3.082857,,0.255684
2022-01-02,,0.770002,0.499095,
2022-01-03,,,,
2022-01-04,,,,0.501481
2022-01-05,,,0.33693,1.553064
2022-01-06,,,0.838029,


### 2.5.2 isin()

- isin() : 특정 요소가 있는지 확인

In [184]:
df['E'] = ['one', 'one', 'two', 'three', 'four', 'seven']
df

Unnamed: 0,A,B,C,D,E
2022-01-01,-1.2051,3.082857,-1.650389,0.255684,one
2022-01-02,-0.977994,0.770002,0.499095,-0.65807,one
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558,two
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481,three
2022-01-05,-1.030858,-0.697145,0.33693,1.553064,four
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034,seven


In [185]:
df['E'].isin(["two"])

2022-01-01    False
2022-01-02    False
2022-01-03     True
2022-01-04    False
2022-01-05    False
2022-01-06    False
Freq: D, Name: E, dtype: bool

In [186]:
df['E'].isin(["two", "seven"])

2022-01-01    False
2022-01-02    False
2022-01-03     True
2022-01-04    False
2022-01-05    False
2022-01-06     True
Freq: D, Name: E, dtype: bool

In [187]:
# 마스킹
df[df['E'].isin(["two", "seven"])]

Unnamed: 0,A,B,C,D,E
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558,two
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034,seven


### 2.5.3 str.contains()

In [188]:
df

Unnamed: 0,A,B,C,D,E
2022-01-01,-1.2051,3.082857,-1.650389,0.255684,one
2022-01-02,-0.977994,0.770002,0.499095,-0.65807,one
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558,two
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481,three
2022-01-05,-1.030858,-0.697145,0.33693,1.553064,four
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034,seven


In [189]:
df['E'].str.contains('one')

2022-01-01     True
2022-01-02     True
2022-01-03    False
2022-01-04    False
2022-01-05    False
2022-01-06    False
Freq: D, Name: E, dtype: bool

In [191]:
sum(df['E'].str.contains('one'))

2

In [193]:
df['E'].value_counts()

one      2
two      1
four     1
seven    1
three    1
Name: E, dtype: int64

In [151]:
df.loc[df['E'].str.contains('one'), :]

Unnamed: 0,A,B,C,D,E
2022-01-01,-1.2051,3.082857,-1.650389,0.255684,one
2022-01-02,-0.977994,0.770002,0.499095,-0.65807,one


In [194]:
# 원상복구

del df['E']
df

Unnamed: 0,A,B,C,D
2022-01-01,-1.2051,3.082857,-1.650389,0.255684
2022-01-02,-0.977994,0.770002,0.499095,-0.65807
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481
2022-01-05,-1.030858,-0.697145,0.33693,1.553064
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034


---

## 2.6 만들기

#### 2.6.1 컬럼 만들기

In [156]:
df

Unnamed: 0,A,B,C,D,E
2022-01-01,-1.2051,3.082857,-1.650389,0.255684,one
2022-01-02,-0.977994,0.770002,0.499095,-0.65807,one
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558,two
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481,three
2022-01-05,-1.030858,-0.697145,0.33693,1.553064,four
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034,seven


In [212]:
# 기존 칼럼이 없으면 추가
# 기존 칼럼이 있으면 수정

df['E'] = ['one', 'one', 'two', 'three', 'four', 'seven']
df

Unnamed: 0,A,B,C,D,E
2022-01-01,-1.2051,3.082857,-1.650389,0.255684,one
2022-01-02,-0.977994,0.770002,0.499095,-0.65807,one
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558,two
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481,three
2022-01-05,-1.030858,-0.697145,0.33693,1.553064,four
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034,seven


#### 2.6.2 `apply`

In [166]:
# 덧셈
df['A'].apply('sum')

-5.789012865809826

In [162]:
# 평균
df['A'].apply('mean')

-0.964835477634971

In [167]:
# 표준편차
df['A'].apply('std')

0.6844058702105106

In [169]:
# 최솟값
df['A'].apply('min')

-2.0688309368114406

In [170]:
# 최댓값
df['A'].apply('max')

-0.0983574768301934

In [116]:
df['A'].apply(np.sum)

2022-05-26   -0.178990
2022-05-27    1.461440
2022-05-28    0.798987
2022-05-29   -0.889393
2022-05-30    0.821736
2022-05-31   -0.300977
Freq: D, Name: A, dtype: float64

In [117]:
df['A'].apply(np.mean)

2022-05-26   -0.178990
2022-05-27    1.461440
2022-05-28    0.798987
2022-05-29   -0.889393
2022-05-30    0.821736
2022-05-31   -0.300977
Freq: D, Name: A, dtype: float64

In [118]:
df['A'].apply(np.std)

2022-05-26    0.0
2022-05-27    0.0
2022-05-28    0.0
2022-05-29    0.0
2022-05-30    0.0
2022-05-31    0.0
Freq: D, Name: A, dtype: float64

In [171]:
df.apply(np.min)

A   -2.068831
B   -1.877524
C   -1.650389
D   -1.804034
dtype: float64

In [172]:
df.apply(np.max)

A   -0.098357
B    3.082857
C    0.838029
D    1.553064
dtype: float64

In [175]:
# 함수를 만들어서 적용시킬 수 있음
def plusminus(num):
    """Label a number by its sign.

    Returns "plus" when ``num`` is strictly greater than 0; every other
    value (zero, negatives, or anything that does not compare greater
    than 0) yields "minus".
    """
    # Guard clause for the positive case; everything else falls through.
    if num > 0:
        return "plus"
    return "minus"

df['A'].apply(plusminus)

2022-01-01    minus
2022-01-02    minus
2022-01-03    minus
2022-01-04    minus
2022-01-05    minus
2022-01-06    minus
Freq: D, Name: A, dtype: object

In [176]:
# 람다로 표현을 더 간결하게 할 수 있음
df["A"].apply(lambda num: "plus" if num > 0 else "minus")

2022-01-01    minus
2022-01-02    minus
2022-01-03    minus
2022-01-04    minus
2022-01-05    minus
2022-01-06    minus
Freq: D, Name: A, dtype: object

In [177]:
# 여러 열을 동시에 함수 적용
df[['A','D']].apply('sum')

A   -5.789013
D   -1.734434
dtype: float64

---

## 2.7 칼럼 제거
- del
- drop

In [213]:
df

Unnamed: 0,A,B,C,D,E
2022-01-01,-1.2051,3.082857,-1.650389,0.255684,one
2022-01-02,-0.977994,0.770002,0.499095,-0.65807,one
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558,two
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481,three
2022-01-05,-1.030858,-0.697145,0.33693,1.553064,four
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034,seven


In [215]:
del df['E']
df

Unnamed: 0,A,B,C,D
2022-01-01,-1.2051,3.082857,-1.650389,0.255684
2022-01-02,-0.977994,0.770002,0.499095,-0.65807
2022-01-03,-0.098357,-1.877524,-0.563702,-1.582558
2022-01-04,-2.068831,-0.549822,-1.168545,0.501481
2022-01-05,-1.030858,-0.697145,0.33693,1.553064
2022-01-06,-0.407872,-0.181777,0.838029,-1.804034


In [216]:
# axis = 0 : 행(index) 기준 — 행을 삭제
# axis = 1 : 열(columns) 기준 — 열을 삭제

df.drop(['D'], axis=1)

Unnamed: 0,A,B,C
2022-01-01,-1.2051,3.082857,-1.650389
2022-01-02,-0.977994,0.770002,0.499095
2022-01-03,-0.098357,-1.877524,-0.563702
2022-01-04,-2.068831,-0.549822,-1.168545
2022-01-05,-1.030858,-0.697145,0.33693
2022-01-06,-0.407872,-0.181777,0.838029


In [226]:
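# axis = 0이 기본 (행 삭제)
# drop은 인덱스 라벨과 정확히 일치하는 값을 요구한다. DatetimeIndex에서는 문자열 '20220101'이
# 라벨로 인식되지 않아 아래처럼 KeyError가 발생하므로, pd.Timestamp('20220101')을 넘겨야 한다.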
df.drop(['20220101'])

KeyError: "['20220101'] not found in axis"