## numpy
- 배열 생성, 연산

## pandas
- 파이썬에서 1, 2차원 데이터를 구조화해서 생성/조작
- 실제 파일시스템에서 엑셀/csv/json... 파일을 불러올 수 있다.

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

## Series
- 1차원 데이터 구조

In [2]:
# Series: a one-dimensional array of values paired with index labels.
ser = Series(data=[4, 5, 3, 3], index=list('abcd'))
ser

a    4
b    5
c    3
d    3
dtype: int64

In [3]:
ser[1:3], ser['b':'d'] # integer slice is positional (end excluded); label-based slice includes the end label.

(b    5
 c    3
 dtype: int64,
 b    5
 c    3
 d    3
 dtype: int64)

In [4]:
# Boolean-mask selection: keep only the elements whose value is greater than 3.
ser[ser>3]

a    4
b    5
dtype: int64

In [5]:
# Distinct values of the Series, returned as a NumPy array.
ser.unique()

array([4, 5, 3], dtype=int64)

In [6]:
# Number of occurrences of each distinct value, most frequent first.
ser.value_counts()

3    2
5    1
4    1
dtype: int64

# DataFrame
## 2차원 데이터의 구조
### 생성법
1. 딕셔너리 데이터로 생성

In [7]:
# DataFrame built from a dict of equal-length lists:
# - each dict key becomes a column label
# - each dict value (the list) becomes that column's data
# General constructor form: DataFrame(2-D data, index, columns)
data = dict(
    name=['김철수', '김영희', '김민재'],
    height=[180, 170, 160],
    age=[20, 25, 30],
)
DataFrame(data=data)

Unnamed: 0,name,height,age
0,김철수,180,20
1,김영희,170,25
2,김민재,160,30


2. 2차원 데이터로 생성

In [8]:
# index labels the rows, columns labels the columns.
df = DataFrame(
    np.arange(1, 10).reshape(3, -1),
    index=list('abc'),
    columns=['one', 'two', 'three'],
)

df

Unnamed: 0,one,two,three
a,1,2,3
b,4,5,6
c,7,8,9


3. 데이터 파일을 읽어와서 생성
 - csv파일 : pd.read_csv('파일이름')
 - 엑셀파일 : pd.read_excel('파일이름')

In [9]:
# Load sample.csv (path relative to the working directory) into a DataFrame and display it.
df2 = pd.read_csv('sample.csv')
df2

Unnamed: 0,A,B,C,D
0,3,7,5,8
1,5,9,1,4
2,5,4,3,7
3,4,4,3,9
4,6,9,5,4
5,8,3,5,7
6,4,1,4,2
7,1,9,9,7
8,8,7,7,8
9,5,5,1,4


데이터를 파일로 저장하는 것도 가능하다.

In [10]:
# Write df2 to sample2.csv; index=False leaves the row index out of the file.
df2.to_csv('sample2.csv', index=False)

In [11]:
# (number of rows, number of columns)
df2.shape

(10, 4)

In [12]:
df.index # show the row index labels

Index(['a', 'b', 'c'], dtype='object')

In [13]:
# Replace the row labels with letters: one label per row (10 labels for the 10 rows).
df2.index = ['a','b','c','d','e','f','g','h','i','j']
df2

Unnamed: 0,A,B,C,D
a,3,7,5,8
b,5,9,1,4
c,5,4,3,7
d,4,4,3,9
e,6,9,5,4
f,8,3,5,7
g,4,1,4,2
h,1,9,9,7
i,8,7,7,8
j,5,5,1,4


In [14]:
# Build a sequence of consecutive dates.
# pd.date_range('start date', periods=number of dates)
pd.date_range('20200101', periods=3)

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03'], dtype='datetime64[ns]', freq='D')

In [15]:
# Use consecutive dates as the row index; df2.shape[0] is the row count, so there is one date per row.
df2.index = pd.date_range('20200101', periods=df2.shape[0])
df2

Unnamed: 0,A,B,C,D
2020-01-01,3,7,5,8
2020-01-02,5,9,1,4
2020-01-03,5,4,3,7
2020-01-04,4,4,3,9
2020-01-05,6,9,5,4
2020-01-06,8,3,5,7
2020-01-07,4,1,4,2
2020-01-08,1,9,9,7
2020-01-09,8,7,7,8
2020-01-10,5,5,1,4


In [16]:
# Delete data: df.drop(row or column label, axis=0 or 1)
# axis=0 --> drop rows, axis=1 --> drop columns
# inplace=True applies the change to the original frame
# Here the first row (label df2.index[0]) is removed from df2 itself.
df2.drop(df2.index[0], axis=0, inplace=True)
df2

Unnamed: 0,A,B,C,D
2020-01-02,5,9,1,4
2020-01-03,5,4,3,7
2020-01-04,4,4,3,9
2020-01-05,6,9,5,4
2020-01-06,8,3,5,7
2020-01-07,4,1,4,2
2020-01-08,1,9,9,7
2020-01-09,8,7,7,8
2020-01-10,5,5,1,4


In [17]:
df2.columns # show the column labels

Index(['A', 'B', 'C', 'D'], dtype='object')

In [18]:
# df.rename(columns={'original name': 'new name', ...})
# Pass inplace=True to apply the change to the original frame.
# NOTE: this call neither passes inplace=True nor keeps the returned frame,
# so df2 displayed below still has its original column names.
df2.rename(columns={'A':'First', 'B':'Second'})
df2

Unnamed: 0,A,B,C,D
2020-01-02,5,9,1,4
2020-01-03,5,4,3,7
2020-01-04,4,4,3,9
2020-01-05,6,9,5,4
2020-01-06,8,3,5,7
2020-01-07,4,1,4,2
2020-01-08,1,9,9,7
2020-01-09,8,7,7,8
2020-01-10,5,5,1,4


In [19]:
# Rename every column at once by assigning a full list of new labels.
df2.columns = ['one','two','three','four']
df2

Unnamed: 0,one,two,three,four
2020-01-02,5,9,1,4
2020-01-03,5,4,3,7
2020-01-04,4,4,3,9
2020-01-05,6,9,5,4
2020-01-06,8,3,5,7
2020-01-07,4,1,4,2
2020-01-08,1,9,9,7
2020-01-09,8,7,7,8
2020-01-10,5,5,1,4


In [20]:
df2.values, type(df2.values) # the underlying data only, as a NumPy ndarray

(array([[5, 9, 1, 4],
        [5, 4, 3, 7],
        [4, 4, 3, 9],
        [6, 9, 5, 4],
        [8, 3, 5, 7],
        [4, 1, 4, 2],
        [1, 9, 9, 7],
        [8, 7, 7, 8],
        [5, 5, 1, 4]], dtype=int64),
 numpy.ndarray)

In [21]:
df2.describe() # per-column summary statistics

Unnamed: 0,one,two,three,four
count,9.0,9.0,9.0,9.0
mean,5.111111,5.666667,4.222222,5.777778
std,2.14735,2.95804,2.635231,2.333333
min,1.0,1.0,1.0,2.0
25%,4.0,4.0,3.0,4.0
50%,5.0,5.0,4.0,7.0
75%,6.0,9.0,5.0,7.0
max,8.0,9.0,9.0,9.0


In [22]:
df2.info() # index range, non-null counts and dtype of each column

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9 entries, 2020-01-02 to 2020-01-10
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   one     9 non-null      int64
 1   two     9 non-null      int64
 2   three   9 non-null      int64
 3   four    9 non-null      int64
dtypes: int64(4)
memory usage: 360.0 bytes


In [23]:
df2.T # transposed view: rows and columns swapped

Unnamed: 0,2020-01-02,2020-01-03,2020-01-04,2020-01-05,2020-01-06,2020-01-07,2020-01-08,2020-01-09,2020-01-10
one,5,5,4,6,8,4,1,8,5
two,9,4,4,9,3,1,9,7,5
three,1,3,3,5,5,4,9,7,1
four,4,7,9,4,7,2,7,8,4


In [24]:
# Return a new frame sorted in ascending order of column 'two'; df2 itself is not modified.
df2.sort_values(by='two')

Unnamed: 0,one,two,three,four
2020-01-07,4,1,4,2
2020-01-06,8,3,5,7
2020-01-03,5,4,3,7
2020-01-04,4,4,3,9
2020-01-10,5,5,1,4
2020-01-09,8,7,7,8
2020-01-02,5,9,1,4
2020-01-05,6,9,5,4
2020-01-08,1,9,9,7


In [25]:
# df2 keeps its original row order: sort_values without inplace=True does not touch the original.
df2

Unnamed: 0,one,two,three,four
2020-01-02,5,9,1,4
2020-01-03,5,4,3,7
2020-01-04,4,4,3,9
2020-01-05,6,9,5,4
2020-01-06,8,3,5,7
2020-01-07,4,1,4,2
2020-01-08,1,9,9,7
2020-01-09,8,7,7,8
2020-01-10,5,5,1,4


In [26]:
# inplace=True 로 원본에 결과를 적용한다.
# Sort df2 itself by column 'two' (inplace=True modifies the original frame).
df2.sort_values(by='two', inplace=True)
df2

Unnamed: 0,one,two,three,four
2020-01-07,4,1,4,2
2020-01-06,8,3,5,7
2020-01-03,5,4,3,7
2020-01-04,4,4,3,9
2020-01-10,5,5,1,4
2020-01-09,8,7,7,8
2020-01-02,5,9,1,4
2020-01-05,6,9,5,4
2020-01-08,1,9,9,7


## 집계함수
- 행별, 열별로 데이터의 집계를 낼 수 있다.
 - sum(합계), mean(평균), max(최대값), min(최소값), argmax(최대값 index), argmin(최소값 index), cumsum(누적합)
 - axis=0 (한 열의 **모든 행**을 연산), axis=1(한 행의 **모든 열**을 연산)

In [27]:
df2.sum() # axis=0 is the default: one total per column

one      46
two      51
three    38
four     52
dtype: int64

In [28]:
# axis=1: one total per row (summing across the columns).
df2.sum(axis=1)

2020-01-07    11
2020-01-06    23
2020-01-03    19
2020-01-04    20
2020-01-10    15
2020-01-09    30
2020-01-02    19
2020-01-05    24
2020-01-08    26
dtype: int64

In [29]:
# Mean of each column (axis=0 by default).
df2.mean()

one      5.111111
two      5.666667
three    4.222222
four     5.777778
dtype: float64

## 결측값 처리
### 1. 결측값 탐색
- isna(), notna()

In [30]:
# Load the sample file that contains missing values; empty fields are read as NaN.
df3 = pd.read_csv('결측값처리.csv')
df3

Unnamed: 0,A,B,C,D,E
0,0.349,1.075,1.746,7.297,
1,2.2,5.928,6.143,,
2,9.311,5.251,3.962,4.226,9.688
3,,,,,
4,4.605,2.349,2.679,1.955,2.996
5,,8.79,8.159,,0.486
6,6.647,2.491,,0.492,8.902
7,,0.997,0.879,,5.125
8,6.256,3.635,0.845,8.065,5.803
9,,5.119,1.601,8.739,


In [31]:
# Boolean frame of the same shape: True where the value is missing.
df3.isna()

Unnamed: 0,A,B,C,D,E
0,False,False,False,False,True
1,False,False,False,True,True
2,False,False,False,False,False
3,True,True,True,True,True
4,False,False,False,False,False
5,True,False,False,True,False
6,False,False,True,False,False
7,True,False,False,True,False
8,False,False,False,False,False
9,True,False,False,False,True


In [32]:
df3.isna().sum() # axis=0 (default): missing-value count per column, summed down the rows

A    4
B    1
C    2
D    4
E    4
dtype: int64

In [33]:
df3.isna().sum(axis=1) # axis=1: missing-value count per row, summed across the columns

0    1
1    2
2    0
3    5
4    0
5    2
6    1
7    2
8    0
9    2
dtype: int64

In [34]:
df3.isna().sum().sum() # total number of missing values in the whole frame

15

### 2-1. 결측값 삭제
- dropna : 결측값 제거
 - axis : 축 ( axis=0 - 행 삭제(기본값), axis=1 - 열 삭제 )
 - how : 제거방법 ('any' - 하나라도 있으면 삭제(기본설정값), 'all' - 전부가 결측값이어야 삭제)
 - thresh : 최소유효값개수

In [35]:
df3.dropna() # defaults how='any', axis=0: drop every row that contains at least one missing value

Unnamed: 0,A,B,C,D,E
2,9.311,5.251,3.962,4.226,9.688
4,4.605,2.349,2.679,1.955,2.996
8,6.256,3.635,0.845,8.065,5.803


In [36]:
# how='all': drop only the rows in which every value is missing.
df3.dropna(how='all')

Unnamed: 0,A,B,C,D,E
0,0.349,1.075,1.746,7.297,
1,2.2,5.928,6.143,,
2,9.311,5.251,3.962,4.226,9.688
4,4.605,2.349,2.679,1.955,2.996
5,,8.79,8.159,,0.486
6,6.647,2.491,,0.492,8.902
7,,0.997,0.879,,5.125
8,6.256,3.635,0.845,8.065,5.803
9,,5.119,1.601,8.739,


In [37]:
# thresh=4: keep only the rows that have at least 4 non-missing values.
df3.dropna(thresh=4)

Unnamed: 0,A,B,C,D,E
0,0.349,1.075,1.746,7.297,
2,9.311,5.251,3.962,4.226,9.688
4,4.605,2.349,2.679,1.955,2.996
6,6.647,2.491,,0.492,8.902
8,6.256,3.635,0.845,8.065,5.803


### 2-2. 결측값 채워넣기
- fillna()

In [38]:
# Return a copy of df3 with every missing value replaced by 0.
df3.fillna(0)

Unnamed: 0,A,B,C,D,E
0,0.349,1.075,1.746,7.297,0.0
1,2.2,5.928,6.143,0.0,0.0
2,9.311,5.251,3.962,4.226,9.688
3,0.0,0.0,0.0,0.0,0.0
4,4.605,2.349,2.679,1.955,2.996
5,0.0,8.79,8.159,0.0,0.486
6,6.647,2.491,0.0,0.492,8.902
7,0.0,0.997,0.879,0.0,5.125
8,6.256,3.635,0.845,8.065,5.803
9,0.0,5.119,1.601,8.739,0.0


In [39]:
# Fill each column's missing values with that column's mean.
df3.fillna(df3.mean())

Unnamed: 0,A,B,C,D,E
0,0.349,1.075,1.746,7.297,5.5
1,2.2,5.928,6.143,5.129,5.5
2,9.311,5.251,3.962,4.226,9.688
3,4.894667,3.959444,3.25175,5.129,5.5
4,4.605,2.349,2.679,1.955,2.996
5,4.894667,8.79,8.159,5.129,0.486
6,6.647,2.491,3.25175,0.492,8.902
7,4.894667,0.997,0.879,5.129,5.125
8,6.256,3.635,0.845,8.065,5.803
9,4.894667,5.119,1.601,8.739,5.5


In [40]:
# Fill each column's missing values with that column's median.
df3.fillna(df3.median())

Unnamed: 0,A,B,C,D,E
0,0.349,1.075,1.746,7.297,5.464
1,2.2,5.928,6.143,5.7615,5.464
2,9.311,5.251,3.962,4.226,9.688
3,5.4305,3.635,2.2125,5.7615,5.464
4,4.605,2.349,2.679,1.955,2.996
5,5.4305,8.79,8.159,5.7615,0.486
6,6.647,2.491,2.2125,0.492,8.902
7,5.4305,0.997,0.879,5.7615,5.125
8,6.256,3.635,0.845,8.065,5.803
9,5.4305,5.119,1.601,8.739,5.464


# 데이터프레임의 데이터 조회
## 1. loc 인덱서
- 사용자가 직접 지정한 인덱스를 사용

## 2. iloc 인덱서
- 자동생성되는 숫자 인덱스를 사용

In [41]:
# 3x4 frame of random integers drawn from 1..8 (the upper bound 9 is exclusive).
df = DataFrame(
    np.random.randint(1, 9, size=12).reshape(3, 4),
    index=list('abc'),
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
a,3,3,6,3
b,2,1,7,8
c,6,3,1,3


In [42]:
# df.loc[row labels (single label or slice), column labels (single label or slice)]
# Label slices include both endpoints.
df.loc['b':'c', 'B':'D']

Unnamed: 0,B,C,D
b,1,7,8
c,3,1,3


In [43]:
# Fancy indexing: pass a list of row labels to pick specific rows; ':' keeps all columns.
df.loc[['a','c'], :]

Unnamed: 0,A,B,C,D
a,3,3,6,3
c,6,3,1,3


In [44]:
# df.iloc[row positions, column positions]
# Positional slices exclude the end: 0:2 selects the rows at positions 0 and 1.
df.iloc[0:2,:]

Unnamed: 0,A,B,C,D
a,3,3,6,3
b,2,1,7,8


In [45]:
# Fancy indexing: pass a list of row positions to pick specific rows; ':' keeps all columns.
df.iloc[[0,2], :]

Unnamed: 0,A,B,C,D
a,3,3,6,3
c,6,3,1,3


# 데이터 병합
## 1. merge (특정 컬럼을 기준으로 데이터를 병합)
### 기준 컬럼의 이름이 양쪽 데이터프레임 모두 같을 때
- pd.merge(첫번째df,두번째df, on='기준 컬럼 이름', how='병합방식')
 - on - 데이터프레임을 합칠 기준 컬럼의 이름
 - how - 두 데이터프레임을 합칠 방법
   - 1) inner - 겹치는 데이터만 남겨놓는다.
   - 2) outer - 겹치지 않아도 무조건 합친다.
   - 3) left - 첫번째 데이터프레임을 우선으로 합친다.
   - 4) right - 두번째 데이터프레임을 우선으로 합친다.
- db의 join과 동일한 기능이다.

In [46]:
# Two frames that share the key column 'ID': names in df1, deposits in df2.
df1 = DataFrame({
    'ID': [101,102,103,104,105],
    '이름': ['철수','영희','민재','길동','민철']
})
df2 = DataFrame({
    'ID': [105,102,104],
    '예금': [100,200,500]
})

# display() is provided by the IPython/Jupyter environment; it renders both frames.
display(df1, df2)

Unnamed: 0,ID,이름
0,101,철수
1,102,영희
2,103,민재
3,104,길동
4,105,민철


Unnamed: 0,ID,예금
0,105,100
1,102,200
2,104,500


In [47]:
# Inner join on 'ID': only the IDs present in both frames are kept.
df_merge = pd.merge(df1,df2,on='ID', how='inner')
df_merge

Unnamed: 0,ID,이름,예금
0,102,영희,200
1,104,길동,500
2,105,민철,100


In [48]:
# Outer join on 'ID': every ID from either frame is kept; missing deposits become NaN.
df_merge = pd.merge(df1,df2,on='ID', how='outer')
df_merge

Unnamed: 0,ID,이름,예금
0,101,철수,
1,102,영희,200.0
2,103,민재,
3,104,길동,500.0
4,105,민철,100.0


In [49]:
# Replace the missing deposits (IDs that have no matching deposit row) with 0.
# The filled column is assigned back rather than calling fillna(inplace=True)
# on df_merge['예금']: the in-place form acts on an intermediate object, is
# deprecated as chained assignment in pandas 2.x, and leaves df_merge
# unchanged when Copy-on-Write is enabled.
df_merge['예금'] = df_merge['예금'].fillna(0)
df_merge

Unnamed: 0,ID,이름,예금
0,101,철수,0.0
1,102,영희,200.0
2,103,민재,0.0
3,104,길동,500.0
4,105,민철,100.0


In [50]:
# Same data as before, but the key column is named differently in each frame:
# 'ID' in df1 and '고객ID' in df2.
df1 = DataFrame({
    'ID': [101,102,103,104,105],
    '이름': ['철수','영희','민재','길동','민철']
})
df2 = DataFrame({
    '고객ID': [105,102,104],
    '예금': [100,200,500]
})

# display() is provided by the IPython/Jupyter environment; it renders both frames.
display(df1, df2)

Unnamed: 0,ID,이름
0,101,철수
1,102,영희
2,103,민재
3,104,길동
4,105,민철


Unnamed: 0,고객ID,예금
0,105,100
1,102,200
2,104,500


In [51]:
# When the key columns have different names, name them with left_on / right_on.
# how defaults to 'inner'; both key columns are kept in the result.
df_merge = pd.merge(df1,df2,left_on='ID',right_on='고객ID')
df_merge

Unnamed: 0,ID,이름,고객ID,예금
0,102,영희,102,200
1,104,길동,104,500
2,105,민철,105,100


In [52]:
# Outer join with differently named keys: unmatched rows get NaN in the right-hand columns.
df_merge = pd.merge(df1,df2,left_on='ID',right_on='고객ID', how='outer')
df_merge

Unnamed: 0,ID,이름,고객ID,예금
0,101,철수,,
1,102,영희,102.0,200.0
2,103,민재,,
3,104,길동,104.0,500.0
4,105,민철,105.0,100.0


In [53]:
# Drop the redundant right-hand key column (axis=1 means column) from df_merge itself.
df_merge.drop('고객ID',axis=1, inplace=True)
df_merge

Unnamed: 0,ID,이름,예금
0,101,철수,
1,102,영희,200.0
2,103,민재,
3,104,길동,500.0
4,105,민철,100.0


## pd.concat([df1, df2, ...]) - 데이터들을 그냥 차례대로 붙여준다.
- axis=0 - 행으로 붙인다.
- axis=1 - 열로 붙인다.

In [54]:
# axis=1: place the frames side by side, aligned on the row index (not on any key column);
# rows missing from the shorter frame are filled with NaN.
pd.concat([df1,df2], axis=1)

Unnamed: 0,ID,이름,고객ID,예금
0,101,철수,105.0,100.0
1,102,영희,102.0,200.0
2,103,민재,104.0,500.0
3,104,길동,,
4,105,민철,,


# 그룹핑

In [55]:
# Sample table for grouping: gender (성별), hometown (출신), height (키), assets (자산).
df = DataFrame(
    [
        ('남', '대구', 180, 1000),
        ('여', '인천', 160, 2000),
        ('남', '서울', 170, 1500),
        ('여', '대구', 165, 3500),
    ],
    columns=['성별', '출신', '키', '자산'],
)

df

Unnamed: 0,성별,출신,키,자산
0,남,대구,180,1000
1,여,인천,160,2000
2,남,서울,170,1500
3,여,대구,165,3500


## 1. groupby
- df.groupby([그룹기준컬럼1, 기준컬럼2, ...]).통계함수()
- df.groupby(...).agg(['집계함수1','집계함수2',...])
 - 집계함수 : sum(합계), mean(평균), max(최대값), min(최소값), ...

In [56]:
# Per-gender totals of the numeric columns.
# The numeric columns are selected explicitly: since pandas 2.0 groupby
# reductions no longer drop non-numeric columns silently, so without the
# selection the text column '출신' would be string-concatenated into the result.
df.groupby('성별')[['키', '자산']].sum()

Unnamed: 0_level_0,키,자산
성별,Unnamed: 1_level_1,Unnamed: 2_level_1
남,350,2500
여,325,5500


In [57]:
# Per-gender sum and mean of the numeric columns.
# The numeric columns are selected explicitly: on pandas >= 2.0, applying
# 'mean' to the text column '출신' raises a TypeError instead of being skipped.
df.groupby('성별')[['키', '자산']].agg(['sum', 'mean'])

Unnamed: 0_level_0,키,키,자산,자산
Unnamed: 0_level_1,sum,mean,sum,mean
성별,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
남,350,175.0,2500,1250
여,325,162.5,5500,2750


In [58]:
# Group by two key columns; the result is indexed by each (성별, 출신) pair present in the data.
df.groupby(['성별','출신']).agg(['sum','mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,키,키,자산,자산
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean
성별,출신,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
남,대구,180,180,1000,1000
남,서울,170,170,1500,1500
여,대구,165,165,3500,3500
여,인천,160,160,2000,2000


### df.pivot_table(values='컬럼이름', index='컬럼이름', columns='컬럼이름', aggfunc='집계함수')
- values - 출력할 데이터의 원본컬럼이름
- index - 출력할 결과의 행 그룹으로 지정할 원본컬럼의 이름
- columns - 출력결과의 열 그룹으로 지정할 원본컬럼의 이름
- aggfunc - 각 그룹에 적용할 통계함수 ex) ['sum','mean',..]

In [59]:
# Pivot: rows grouped by '성별', columns grouped by '출신', with sum and mean applied
# to '키' and '자산'. Combinations that do not occur in the data show up as NaN.
df.pivot_table(values=['키','자산'], index='성별',
              columns='출신', aggfunc=['sum','mean'])

Unnamed: 0_level_0,sum,sum,sum,sum,sum,sum,mean,mean,mean,mean,mean,mean
Unnamed: 0_level_1,자산,자산,자산,키,키,키,자산,자산,자산,키,키,키
출신,대구,서울,인천,대구,서울,인천,대구,서울,인천,대구,서울,인천
성별,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
남,1000.0,1500.0,,180.0,170.0,,1000.0,1500.0,,180.0,170.0,
여,3500.0,,2000.0,165.0,,160.0,3500.0,,2000.0,165.0,,160.0
