# Pandas

In [1]:
import numpy as np
import pandas as pd

In [3]:
# A NaN among the values makes pandas store the whole Series as float64.
s = pd.Series(data=np.array([1, 3, 5, np.nan, 6, 8]))
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

# Series

In [4]:
# Six values with one missing entry (NaN); stored as float64 under the default 0..5 labels.
s1 = pd.Series([1.0, 3.0, 5.0, np.nan, 6.0, 8.0], dtype='float64')
s1

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
# Index labels of s1; with no explicit index this is a RangeIndex starting at 0.
s1.index

RangeIndex(start=0, stop=6, step=1)

In [5]:
# The underlying data of s1 as a NumPy array (labels are not included).
s1.values

array([ 1.,  3.,  5., nan,  6.,  8.])

#### Series는 내부적으로 numpy 배열을 사용하지만, dtype이 object이면 원소마다 데이터 타입이 달라도 된다.

In [6]:
# Floats (NaN, inf), integers and strings in one Series: pandas falls back to dtype=object.
s1 = pd.Series([np.nan, np.inf, *range(4), *'abc'])
s1

0    NaN
1    inf
2      0
3      1
4      2
5      3
6      a
7      b
8      c
dtype: object

In [8]:
# For a mixed-type Series the backing array has dtype=object.
s1.values

array([nan, inf, 0, 1, 2, 3, 'a', 'b', 'c'], dtype=object)

## index 설정하기

In [18]:
# Pair the values 0..9 with the explicit index labels 10..19.
seq_data, index_seq = range(10), range(10, 20)
s2 = pd.Series(seq_data, index_seq)
s2

10    0
11    1
12    2
13    3
14    4
15    5
16    6
17    7
18    8
19    9
dtype: int64

In [19]:
# Date strings used as index labels (plain strings here, not datetime objects).
index_date = [
    '2018-10-17',
    '2018-10-18',
    '2018-10-19',
    '2018-10-20',
]
# The missing value for 2018-10-19 turns the data into float64.
s3 = pd.Series(data=[200, 195, np.nan, 205], index=pd.Index(index_date))
s3

2018-10-17    200.0
2018-10-18    195.0
2018-10-19      NaN
2018-10-20    205.0
dtype: float64

In [20]:
# Overwrite the index labels of s3 in place; the values themselves are untouched.
s3.index  = [1,2,3,4]

In [21]:
s3

1    200.0
2    195.0
3      NaN
4    205.0
dtype: float64

## Dictionary로 시리즈 입력

In [29]:
# Dictionary keys become the index labels; dictionary values become the data.
dict_data = dict([('국어', 100), ('영어', 95), ('수학', 80)])
s5 = pd.Series(data=dict_data)
print(s5)

국어    100
영어     95
수학     80
dtype: int64


## 데이터 연산

In [27]:
# Operands for the arithmetic examples: s1 has six elements, s2 only five,
# so label 5 exists in s1 alone.
s1 = pd.Series(range(10, 70, 10))
s2 = pd.Series(range(1, 6))

# Print both reprs, one after the other.
print(repr(s1), repr(s2), sep='\n')

s2

0    10
1    20
2    30
3    40
4    50
5    60
dtype: int64
0    1
1    2
2    3
3    4
4    5
dtype: int64


0    1
1    2
2    3
3    4
4    5
dtype: int64

In [28]:
# Element-wise addition aligned on index labels. Label 5 exists only in s1, so the
# result there is NaN and the result dtype becomes float64.
s1+s2

0    11.0
1    22.0
2    33.0
3    44.0
4    55.0
5     NaN
dtype: float64

In [17]:
# Element-wise subtraction with label alignment (NaN where s2 has no label).
s1-s2

0     9.0
1    18.0
2    27.0
3    36.0
4    45.0
5     NaN
dtype: float64

In [18]:
# Element-wise multiplication with label alignment.
s1*s2

0     10.0
1     40.0
2     90.0
3    160.0
4    250.0
5      NaN
dtype: float64

In [19]:
# Element-wise true division with label alignment.
s1/s2

0    10.0
1    10.0
2    10.0
3    10.0
4    10.0
5     NaN
dtype: float64

In [20]:
# Element-wise power: each s1 value raised to the s2 value at the same label.
s1**s2

0           10.0
1          400.0
2        27000.0
3      2560000.0
4    312500000.0
5            NaN
dtype: float64

In [21]:
# Element-wise floor division with label alignment.
s1//s2

0    10.0
1    10.0
2    10.0
3    10.0
4    10.0
5     NaN
dtype: float64

In [22]:
# Element-wise remainder (modulo) with label alignment.
s1%s2

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    NaN
dtype: float64

## 날짜 자동 생성 - date_range 

In [30]:
# Daily timestamps from the first date through the second; both endpoints are included.
pd.date_range('2019-05-25', '2019-05-28')

DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28'], dtype='datetime64[ns]', freq='D')

In [31]:
# Endpoints passed by keyword (start=, end=); both are included in the result.
pd.date_range(start='2019-05-25', end='2019-05-28')

DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28'], dtype='datetime64[ns]', freq='D')

In [32]:
# A start date plus a count: periods=4 yields four consecutive days.
pd.date_range(start='2019-05-25', periods=4)

DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28'], dtype='datetime64[ns]', freq='D')

In [33]:
# A time of day in the start value is carried through to every generated timestamp.
pd.date_range(start='2019-05-25 10:00:00', periods=6)

DatetimeIndex(['2019-05-25 10:00:00', '2019-05-26 10:00:00',
               '2019-05-27 10:00:00', '2019-05-28 10:00:00',
               '2019-05-29 10:00:00', '2019-05-30 10:00:00'],
              dtype='datetime64[ns]', freq='D')

### 날짜 데이터 포맷 (yyyy-mm-dd)

In [34]:
# Year-first layouts: the separator may be '-', '.' or '/'.
date_index1, date_index2, date_index3 = (
    pd.date_range(start=text, periods=4)
    for text in ('2019-05-25', '2019.05.25', '2019/05/25')
)
print('date_index1: ', repr(date_index1))
print('date_index2: ', repr(date_index2))
print('date_index3: ', repr(date_index3))

# Month-first layouts (mm/dd/yyyy) with the same three separators.
date_index4, date_index5, date_index6 = (
    pd.date_range(start=text, periods=4)
    for text in ('05/25/2019', '05-25-2019', '05.25.2019')
)

print('date_index4: ', repr(date_index4))
print('date_index5: ', repr(date_index5))
print('date_index6: ', repr(date_index6))


date_index1:  DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28'], dtype='datetime64[ns]', freq='D')
date_index2:  DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28'], dtype='datetime64[ns]', freq='D')
date_index3:  DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28'], dtype='datetime64[ns]', freq='D')
date_index4:  DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28'], dtype='datetime64[ns]', freq='D')
date_index5:  DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28'], dtype='datetime64[ns]', freq='D')
date_index6:  DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28'], dtype='datetime64[ns]', freq='D')


### date_range's time series frequencies

In [35]:
# '2D': one timestamp every two days.
pd.date_range(start="2019-05-25", periods=8, freq='2D')

DatetimeIndex(['2019-05-25', '2019-05-27', '2019-05-29', '2019-05-31',
               '2019-06-02', '2019-06-04', '2019-06-06', '2019-06-08'],
              dtype='datetime64[ns]', freq='2D')

In [36]:
# 'B': business days only. Weekend dates are skipped, so the first value is 2019-05-27
# rather than the start date itself.
pd.date_range(start="2019-05-25", periods=8, freq='B')

DatetimeIndex(['2019-05-27', '2019-05-28', '2019-05-29', '2019-05-30',
               '2019-05-31', '2019-06-03', '2019-06-04', '2019-06-05'],
              dtype='datetime64[ns]', freq='B')

In [37]:
# '3W': every three weeks, anchored on Sundays (the result reports freq='3W-SUN').
pd.date_range(start="2019-05-25", periods=8, freq='3W')

DatetimeIndex(['2019-05-26', '2019-06-16', '2019-07-07', '2019-07-28',
               '2019-08-18', '2019-09-08', '2019-09-29', '2019-10-20'],
              dtype='datetime64[ns]', freq='3W-SUN')

In [38]:
# Month end: one timestamp on the last calendar day of each month.
# The string alias 'M' is deprecated in recent pandas releases (renamed 'ME'); the
# MonthEnd offset object yields the same dates on both old and new versions.
pd.date_range(start="2019-05-25", periods=8, freq=pd.offsets.MonthEnd())

DatetimeIndex(['2019-05-31', '2019-06-30', '2019-07-31', '2019-08-31',
               '2019-09-30', '2019-10-31', '2019-11-30', '2019-12-31'],
              dtype='datetime64[ns]', freq='M')

In [39]:
# Business month end: the last business day of each month.
# The string alias 'BM' is deprecated in recent pandas releases (renamed 'BME'); the
# BMonthEnd offset object yields the same dates on both old and new versions.
pd.date_range(start="2019-05-25", periods=8, freq=pd.offsets.BMonthEnd())

DatetimeIndex(['2019-05-31', '2019-06-28', '2019-07-31', '2019-08-30',
               '2019-09-30', '2019-10-31', '2019-11-29', '2019-12-31'],
              dtype='datetime64[ns]', freq='BM')

In [40]:
# 'BMS': business month start — the first business day of each month.
pd.date_range(start="2019-05-25", periods=8, freq='BMS')

DatetimeIndex(['2019-06-03', '2019-07-01', '2019-08-01', '2019-09-02',
               '2019-10-01', '2019-11-01', '2019-12-02', '2020-01-01'],
              dtype='datetime64[ns]', freq='BMS')

In [41]:
# Quarter end, with quarters closing in March/June/September/December (Q-DEC).
# The string alias 'Q' is deprecated in recent pandas releases (renamed 'QE'); the
# QuarterEnd offset object yields the same dates on both old and new versions.
pd.date_range(start="2019-05-25", periods=8, freq=pd.offsets.QuarterEnd(startingMonth=12))

DatetimeIndex(['2019-06-30', '2019-09-30', '2019-12-31', '2020-03-31',
               '2020-06-30', '2020-09-30', '2020-12-31', '2021-03-31'],
              dtype='datetime64[ns]', freq='Q-DEC')

In [42]:
# Year end (31 December); the 10:10 time of day from the start value is kept.
# The string alias 'A' is deprecated in recent pandas releases (renamed 'YE'); the
# YearEnd offset object yields the same timestamps on both old and new versions.
pd.date_range(start="2019-05-25 10:10", periods=8, freq=pd.offsets.YearEnd())

DatetimeIndex(['2019-12-31 10:10:00', '2020-12-31 10:10:00',
               '2021-12-31 10:10:00', '2022-12-31 10:10:00',
               '2023-12-31 10:10:00', '2024-12-31 10:10:00',
               '2025-12-31 10:10:00', '2026-12-31 10:10:00'],
              dtype='datetime64[ns]', freq='A-DEC')

In [43]:
# Hourly steps starting at 10:10.
# The upper-case alias 'H' is deprecated in recent pandas releases (renamed 'h'); the
# Hour offset object yields the same timestamps on both old and new versions.
pd.date_range(start="2019-05-25 10:10", periods=8, freq=pd.offsets.Hour())

DatetimeIndex(['2019-05-25 10:10:00', '2019-05-25 11:10:00',
               '2019-05-25 12:10:00', '2019-05-25 13:10:00',
               '2019-05-25 14:10:00', '2019-05-25 15:10:00',
               '2019-05-25 16:10:00', '2019-05-25 17:10:00'],
              dtype='datetime64[ns]', freq='H')

In [44]:
# Minutes: one timestamp per minute.

pd.date_range(start="2019-05-25 10:10", periods=8, freq='min')
# Alternative spelling with the short alias 'T' (the recorded output reports freq='T').
# NOTE(review): newer pandas releases deprecate 'T' in favour of 'min' — confirm before enabling.
# pd.date_range(start="2019-05-25 10:10", periods=8, freq='T')

DatetimeIndex(['2019-05-25 10:10:00', '2019-05-25 10:11:00',
               '2019-05-25 10:12:00', '2019-05-25 10:13:00',
               '2019-05-25 10:14:00', '2019-05-25 10:15:00',
               '2019-05-25 10:16:00', '2019-05-25 10:17:00'],
              dtype='datetime64[ns]', freq='T')

In [47]:
# Seconds: one timestamp every 10 seconds.
# The upper-case alias 'S' is deprecated in recent pandas releases (renamed 's'); the
# Second offset object yields the same timestamps on both old and new versions.

pd.date_range(start="2019-05-25 10:10", periods=8, freq=pd.offsets.Second(10))

DatetimeIndex(['2019-05-25 10:10:00', '2019-05-25 10:10:10',
               '2019-05-25 10:10:20', '2019-05-25 10:10:30',
               '2019-05-25 10:10:40', '2019-05-25 10:10:50',
               '2019-05-25 10:11:00', '2019-05-25 10:11:10'],
              dtype='datetime64[ns]', freq='10S')

# DataFrame

Series는 1차원 데이터를 다룬다.  
Excel 표와 같은 2차원 데이터를 처리하려면 DataFrame을 사용한다.


In [52]:
# Each inner list becomes one row; row and column labels default to 0, 1, 2.
df = pd.DataFrame([[base + step for step in (10, 20, 30)] for base in (0, 30, 60)])
df


Unnamed: 0,0,1,2
0,10,20,30
1,40,50,60
2,70,80,90


In [52]:
# Confirm the object's class: pandas.core.frame.DataFrame.
type(df)

pandas.core.frame.DataFrame

In [55]:
# 3x3 integer array holding 10, 20, ..., 90 in row-major order, wrapped in a DataFrame.
np_array = 10 * np.arange(1, 10).reshape(3, 3)
df1 = pd.DataFrame(data=np_array)
df1

Unnamed: 0,0,1,2
0,10,20,30
1,40,50,60
2,70,80,90


## 딕셔너리로 DataFrame 만들기

In [60]:
# Each dictionary key becomes a column name; all three lists hold ten values.
table_data = {'연도': list(range(2010, 2020)),
              # Squares of 1..10 (no enumerate needed: only the value is used).
              '매출액': [v**2 for v in range(1, 11)],
              '종업원 수': list(range(2, 30, 3))}


df = pd.DataFrame(table_data)
df

Unnamed: 0,연도,매출액,종업원 수
0,2010,1,2
1,2011,4,5
2,2012,9,8
3,2013,16,11
4,2014,25,14
5,2015,36,17
6,2016,49,20
7,2017,64,23
8,2018,81,26
9,2019,100,29


## index와 column 그리고 value

In [59]:
# Rebuild the 3x3 frame (values 10..90) used to demonstrate index, columns and values.
np_array = np.arange(10,100,10).reshape(3,3)
df1 = pd.DataFrame(np_array)
df1

Unnamed: 0,0,1,2
0,10,20,30
1,40,50,60
2,70,80,90


##### index

In [86]:
# Row labels of df1; a frame built without an explicit index gets a RangeIndex.
df1.index

RangeIndex(start=0, stop=3, step=1)

In [56]:
# Set the index: replace the row labels with three consecutive dates starting
# 2019-05-05 (this modifies df1 in place).
df1.index = pd.date_range('2019-05-05',periods=3)
df1

Unnamed: 0,0,1,2
2019-05-05,10,20,30
2019-05-06,40,50,60
2019-05-07,70,80,90


In [90]:
# Re-inspect the row labels after the assignment in the previous cell.
# NOTE(review): the recorded output shows a RangeIndex rather than the assigned dates, and
# the execution counts are out of order — the output looks stale; re-run to confirm.
df1.index

RangeIndex(start=0, stop=3, step=1)

##### columns

In [85]:
# Column labels of df1 (default integer labels until names are assigned).
df1.columns

RangeIndex(start=0, stop=3, step=1)

In [88]:
# Set the column names: replaces the default 0, 1, 2 labels in place.
df1.columns = ['A', 'B', 'C']
df1

Unnamed: 0,A,B,C
0,10,20,30
1,40,50,60
2,70,80,90


In [89]:
# After the assignment the columns are an Index of strings.
df1.columns

Index(['A', 'B', 'C'], dtype='object')

In [62]:
# Pass index and columns directly to the constructor instead of assigning them afterwards.
# The month-end index uses the MonthEnd offset object: the string alias 'M' is deprecated
# in recent pandas releases, while the offset object works on old and new versions alike.
df2 = pd.DataFrame(np.arange(9).reshape(3,3),
                  index=pd.date_range('2019-03-05', periods=3, freq=pd.offsets.MonthEnd()),
                  columns=['A', 'B', 'C'])
df2

Unnamed: 0,A,B,C
2019-03-31,0,1,2
2019-04-30,3,4,5
2019-05-31,6,7,8


In [63]:
# Same constructor form with the values 10..90.
# The month-end index uses the MonthEnd offset object: the string alias 'M' is deprecated
# in recent pandas releases, while the offset object works on old and new versions alike.
df2 = pd.DataFrame(np.arange(10, 100, 10).reshape(3,3),
                  index=pd.date_range('2019-03-05', periods=3, freq=pd.offsets.MonthEnd()),
                  columns=['A', 'B', 'C'])
df2

Unnamed: 0,A,B,C
2019-03-31,10,20,30
2019-04-30,40,50,60
2019-05-31,70,80,90


##### values

In [101]:
# The data behind a DataFrame is exposed as a NumPy ndarray.
type(df2.values)

numpy.ndarray

In [97]:
# The 2-D array of cell values, without row or column labels.
df2.values

array([[10, 20, 30],
       [40, 50, 60],
       [70, 80, 90]])

## 데이터 연산

In [105]:
# Build the frame from a dictionary of arrays: columns A, B, C hold 1..5 scaled by 1, 10 and 100.
table_data1 = {
    label: np.arange(1, 6) * scale
    for label, scale in zip('ABC', (1, 10, 100))
}
df1 = pd.DataFrame(data=table_data1)
df1

Unnamed: 0,A,B,C
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400
4,5,50,500


In [106]:
# A shorter frame (three rows) with the same column names, used as the second operand.
table_data2 = {
    label: [scale * k for k in (6, 7, 8)]
    for label, scale in zip('ABC', (1, 10, 100))
}
df2 = pd.DataFrame(data=table_data2)
df2

Unnamed: 0,A,B,C
0,6,60,600
1,7,70,700
2,8,80,800


In [61]:
# Addition: the frames are aligned on row and column labels. Rows 3 and 4 exist only
# in df1, so every cell in those rows is NaN and the result is float.
df1+df2

Unnamed: 0,A,B,C
0,7.0,70.0,700.0
1,9.0,90.0,900.0
2,11.0,110.0,1100.0
3,,,
4,,,


In [62]:
# Subtraction, with the same label alignment (NaN for rows missing from df2).
df1-df2

Unnamed: 0,A,B,C
0,-5.0,-50.0,-500.0
1,-5.0,-50.0,-500.0
2,-5.0,-50.0,-500.0
3,,,
4,,,


In [63]:
# Element-wise multiplication with label alignment.
df1*df2

Unnamed: 0,A,B,C
0,6.0,600.0,60000.0
1,14.0,1400.0,140000.0
2,24.0,2400.0,240000.0
3,,,
4,,,


In [64]:
# Element-wise true division with label alignment.
df1/df2

Unnamed: 0,A,B,C
0,0.166667,0.166667,0.166667
1,0.285714,0.285714,0.285714
2,0.375,0.375,0.375
3,,,
4,,,


In [65]:
# Element-wise power. Column C (e.g. 100 ** 600) is too large for a float64, so it shows as inf.
df1**df2

Unnamed: 0,A,B,C
0,1.0,1.0000000000000001e+60,inf
1,128.0,1.180592e+91,inf
2,6561.0,1.4780879999999998e+118,inf
3,,,
4,,,


In [66]:
# Element-wise remainder (modulo) with label alignment.
df1%df2

Unnamed: 0,A,B,C
0,1.0,10.0,100.0
1,2.0,20.0,200.0
2,3.0,30.0,300.0
3,,,
4,,,


## Head & tail

In [77]:
# 50 x 3 frame holding the integers 0..149 in row-major order.
df = pd.DataFrame(np.arange(150).reshape(50, 3))
df

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11
4,12,13,14
5,15,16,17
6,18,19,20
7,21,22,23
8,24,25,26
9,27,28,29


In [78]:
# First rows of the frame; with no argument five rows are returned.
df.head()

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11
4,12,13,14


In [79]:
# First three rows only.
df.head(3)

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8


In [None]:
# Last rows of the frame (the counterpart of head()).
df.tail()

In [80]:
# Last eleven rows.
df.tail(11)

Unnamed: 0,0,1,2
39,117,118,119
40,120,121,122
41,123,124,125
42,126,127,128
43,129,130,131
44,132,133,134
45,135,136,137
46,138,139,140
47,141,142,143
48,144,145,146


## 통계분석을 위한 method

In [135]:
# Seasonal values for 2012-2016: one row per year (string labels), one column per season.
df1 = pd.DataFrame(
    [[256.5, 770.6, 363.5, 139.3],
     [264.3, 567.5, 231.2, 59.9],
     [215.9, 599.8, 293.1, 76.9],
     [223.2, 387.1, 247.7, 109.1],
     [312.8, 446.2, 381.6, 108.1]],
    index=[str(year) for year in range(2012, 2017)],
    columns=['봄', '여름', '가을', '겨울'],
)
df1

Unnamed: 0,봄,여름,가을,겨울
2012,256.5,770.6,363.5,139.3
2013,264.3,567.5,231.2,59.9
2014,215.9,599.8,293.1,76.9
2015,223.2,387.1,247.7,109.1
2016,312.8,446.2,381.6,108.1


### sum / mean

In [110]:
# Column-wise totals: one sum per season column.
df1.sum()

봄     1272.7
여름    2771.2
가을    1517.1
겨울     493.3
dtype: float64

In [111]:
# axis=1 sums across the columns instead: one total per row (year).
df1.sum(axis=1)

2012    1529.9
2013    1122.9
2014    1185.7
2015     967.1
2016    1248.7
dtype: float64

In [None]:
# Mean of each column.
df1.mean()

In [None]:
# Mean of each row (across the season columns).
df1.mean(axis=1)

### min / max

In [None]:
# Smallest value in each column.
df1.min()

In [None]:
# Smallest value in each row.
df1.min(axis=1)

In [None]:
# Largest value in each column.
df1.max()

In [None]:
# Largest value in each row.
df1.max(axis=1)

### std / var

In [None]:
# Standard deviation of each column.
df1.std()

In [None]:
# Standard deviation of each row.
df1.std(axis=1)

In [None]:
# Variance of each column.
df1.var()

In [None]:
# Variance of each row.
df1.var(axis=1)

### cumsum / cumprod

In [112]:
# Running total down each column; the last row equals the column sums.
df1.cumsum()

Unnamed: 0,봄,여름,가을,겨울
2012,256.5,770.6,363.5,139.3
2013,520.8,1338.1,594.7,199.2
2014,736.7,1937.9,887.8,276.1
2015,959.9,2325.0,1135.5,385.2
2016,1272.7,2771.2,1517.1,493.3


In [113]:
# Running total across each row, left to right; the last column equals the row sums.
df1.cumsum(axis=1)

Unnamed: 0,봄,여름,가을,겨울
2012,256.5,1027.1,1390.6,1529.9
2013,264.3,831.8,1063.0,1122.9
2014,215.9,815.7,1108.8,1185.7
2015,223.2,610.3,858.0,967.1
2016,312.8,759.0,1140.6,1248.7


In [114]:
# Running product down each column.
df1.cumprod()

Unnamed: 0,봄,여름,가을,겨울
2012,256.5,770.6,363.5,139.3
2013,67792.95,437315.5,84041.2,8344.07
2014,14636500.0,262301800.0,24632480.0,641659.0
2015,3266866000.0,101537000000.0,6101464000.0,70005000.0
2016,1021876000000.0,45305830000000.0,2328319000000.0,7567540000.0


In [115]:
# Running product across each row, left to right.
df1.cumprod(axis=1)

Unnamed: 0,봄,여름,가을,겨울
2012,256.5,197658.9,71849010.0,10008570000.0
2013,264.3,149990.25,34677750.0,2077197000.0
2014,215.9,129496.82,37955520.0,2918779000.0
2015,223.2,86400.72,21401460.0,2334899000.0
2016,312.8,139571.36,53260430.0,5757453000.0


### describe()

In [116]:
# Per-column summary: count, mean, std, min, the 25%/50%/75% quantiles and max.
df1.describe()

Unnamed: 0,봄,여름,가을,겨울
count,5.0,5.0,5.0,5.0
mean,254.54,554.24,303.42,98.66
std,38.628267,148.888895,67.358496,30.925523
min,215.9,387.1,231.2,59.9
25%,223.2,446.2,247.7,76.9
50%,256.5,567.5,293.1,108.1
75%,264.3,599.8,363.5,109.1
max,312.8,770.6,381.6,139.3


In [137]:
# Mark a single cell (row '2012', column '봄') as missing. This changes df1 in place,
# so every later cell sees the NaN.
df1.loc['2012', '봄'] = np.nan

In [138]:
df1

Unnamed: 0,봄,여름,가을,겨울
2012,,770.6,363.5,139.3
2013,264.3,567.5,231.2,59.9
2014,215.9,599.8,293.1,76.9
2015,223.2,387.1,247.7,109.1
2016,312.8,446.2,381.6,108.1


In [139]:
# Structural summary: index range, per-column non-null counts, dtypes and memory use.
# Column '봄' reports one fewer non-null value because of the NaN assigned earlier.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 2012 to 2016
Data columns (total 4 columns):
봄     4 non-null float64
여름    5 non-null float64
가을    5 non-null float64
겨울    5 non-null float64
dtypes: float64(4)
memory usage: 360.0+ bytes


## 인덱싱, 슬라이싱

In [140]:
df1

Unnamed: 0,봄,여름,가을,겨울
2012,,770.6,363.5,139.3
2013,264.3,567.5,231.2,59.9
2014,215.9,599.8,293.1,76.9
2015,223.2,387.1,247.7,109.1
2016,312.8,446.2,381.6,108.1


### Column으로 접근

In [141]:
# A single column label inside [] returns that column as a Series.
df1['봄']

2012      NaN
2013    264.3
2014    215.9
2015    223.2
2016    312.8
Name: 봄, dtype: float64

In [83]:
# Another single-column selection; the Series name is the column label.
df1['여름']

2012    770.6
2013    567.5
2014    599.8
2015    387.1
2016    446.2
Name: 여름, dtype: float64

In [143]:
# A list of labels inside [] returns a DataFrame with just those columns.
df1[['봄','가을']]

Unnamed: 0,봄,가을
2012,,363.5
2013,264.3,231.2
2014,215.9,293.1
2015,223.2,247.7
2016,312.8,381.6


### Row로 접근

In [72]:
# An integer slice inside [] selects rows by position: positions 1 and 2 (end excluded).
df1[1:3]

Unnamed: 0,봄,여름,가을,겨울
2013,264.3,567.5,231.2,59.9
2014,215.9,599.8,293.1,76.9


In [73]:
# The first two rows by position.
# NOTE(review): the recorded output still shows 256.5 for 2012 '봄' although that cell was
# set to NaN earlier; the execution counts suggest this output predates the change.
df1[:2]

Unnamed: 0,봄,여름,가을,겨울
2012,256.5,770.6,363.5,139.3
2013,264.3,567.5,231.2,59.9


In [74]:
# A label slice inside [] selects rows by index label; unlike a positional slice,
# the end label '2013' is included.
df1[:'2013']

Unnamed: 0,봄,여름,가을,겨울
2012,256.5,770.6,363.5,139.3
2013,264.3,567.5,231.2,59.9


##### loc으로 행 접근

In [75]:
# .loc with one row label returns that row as a Series indexed by the column names.
df1.loc['2013']

봄     264.3
여름    567.5
가을    231.2
겨울     59.9
Name: 2013, dtype: float64

In [149]:
# .loc with a label slice selects a range of rows; both end labels are included.
# NOTE(review): the recorded output lists only the '봄' and '가을' columns, which this
# expression alone would not produce — it probably belongs to an earlier version of the cell.
df1.loc['2013':'2016']

Unnamed: 0,봄,가을
2013,264.3,231.2
2014,215.9,293.1
2015,223.2,247.7
2016,312.8,381.6


### 결합

In [None]:
# Pick a column first, then slice the resulting Series by position (first three rows).
df1['봄'][:3]

In [None]:
# Pick a column first, then slice the resulting Series by label, up to '2014'.
df1['여름'][:'2014']

In [151]:
# Rows and columns in a single .loc call: rows from '2014' onward, columns '봄' through '가을'
# (label slices include both ends).
df1.loc['2014':, '봄':"가을"]

Unnamed: 0,봄,여름,가을
2014,215.9,599.8,293.1
2015,223.2,387.1,247.7
2016,312.8,446.2,381.6


# 데이터프레임 조작하기

### 데이터프레임 전치

In [None]:
df1

In [153]:
# Transpose: rows become columns and columns become rows.
df1.T

Unnamed: 0,2012,2013,2014,2015,2016
봄,,264.3,215.9,223.2,312.8
여름,770.6,567.5,599.8,387.1,446.2
가을,363.5,231.2,293.1,247.7,381.6
겨울,139.3,59.9,76.9,109.1,108.1


In [155]:
# Method form of the transpose; the result matches df1.T.
df1.transpose()

Unnamed: 0,2012,2013,2014,2015,2016
봄,,264.3,215.9,223.2,312.8
여름,770.6,567.5,599.8,387.1,446.2
가을,363.5,231.2,293.1,247.7,381.6
겨울,139.3,59.9,76.9,109.1,108.1


### 열 순서 변경

In [None]:
df1

In [159]:
# Reorder columns by listing the labels in the desired order (returns a new frame).
df1[['겨울', '봄', '여름', '가을']]

Unnamed: 0,겨울,봄,여름,가을
2012,139.3,,770.6,363.5
2013,59.9,264.3,567.5,231.2
2014,76.9,215.9,599.8,293.1
2015,109.1,223.2,387.1,247.7
2016,108.1,312.8,446.2,381.6


### 행 순서 변경

In [175]:
# A list of row labels in .loc returns exactly those rows, in the listed order.
df1.loc[['2016', '2015', '2014']]

Unnamed: 0,봄,여름,가을,겨울
2016,312.8,446.2,381.6,108.1
2015,223.2,387.1,247.7,109.1
2014,215.9,599.8,293.1,76.9


In [176]:
# Two earlier years with the same season columns. The year labels are strings so that
# they are the same kind of label as the string years ('2012' .. '2016') used for df1;
# integer labels here would not line up with string labels if the frames were combined.
df2 = pd.DataFrame({
    '봄': [302.9, 256.9],
    '여름': [692.6, 1053.6],
    '가을': [307.6, 225.5],
    '겨울': [98.7, 45.6]},
    index=['2010', '2011']
)
df2

Unnamed: 0,봄,여름,가을,겨울
2010,302.9,692.6,307.6,98.7
2011,256.9,1053.6,225.5,45.6


## 데이터 통합하기

In [178]:
# Scores for two classes, four rows, default integer index.
df1 = pd.DataFrame(
    [[95, 91], [92, 93], [98, 98], [100, 100]],
    columns=['Class1', 'Class2'],
)
df1

Unnamed: 0,Class1,Class2
0,95,91
1,92,93
2,98,98
3,100,100


In [179]:
# Two additional rows with the same columns, to be stacked under df1.
df2 = pd.DataFrame(data=dict(Class1=[76, 88], Class2=[100, 100]))
df2

Unnamed: 0,Class1,Class2
0,76,100
1,88,100


### 세로방향 통합

In [180]:
# Stack df2 under df1. The original row labels are kept, so 0 and 1 appear twice.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
# and also works on older versions.
pd.concat([df1, df2])

Unnamed: 0,Class1,Class2
0,95,91
1,92,93
2,98,98
3,100,100
0,76,100
1,88,100


In [101]:
# Ignore the original row labels and renumber the stacked rows 0..n-1.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,Class1,Class2
0,95,91
1,92,93
2,98,98
3,100,100
4,76,100
5,88,100


#### 컬럼이 서로 다를 때

In [181]:
# A frame that shares 'Class1' with df1 but has 'Class3' instead of 'Class2'.
df3 = pd.DataFrame([(80, 50), (10, 60)], columns=['Class1', 'Class3'])
df3

Unnamed: 0,Class1,Class3
0,80,50
1,10,60


In [183]:
# Stack frames whose columns differ: the result has the union of the columns, and cells
# with no source value are NaN. sort=False keeps the columns in order of appearance.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
pd.concat([df1, df3], ignore_index=True, sort=False)

Unnamed: 0,Class1,Class2,Class3
0,95,91.0,
1,92,93.0,
2,98,98.0,
3,100,100.0,
4,80,,50.0
5,10,,60.0


### 가로방향 통합하기

In [184]:
df1

Unnamed: 0,Class1,Class2
0,95,91
1,92,93
2,98,98
3,100,100


In [187]:
# A single-column frame with five rows — one more row than df1.
df4 = pd.Series([92, 93, 94, 91, 99], name='Class3').to_frame()
df4

Unnamed: 0,Class3
0,92
1,93
2,94
3,91
4,99


In [188]:
# Add df4's column to df1, matching rows by index label. Only df1's rows are kept,
# so df4's extra row (label 4) does not appear in the result.
df1.join(df4)

Unnamed: 0,Class1,Class2,Class3
0,95,91,92
1,92,93,93
2,98,98,94
3,100,100,91


#### index 지정시

In [189]:
df1

Unnamed: 0,Class1,Class2
0,95,91
1,92,93
2,98,98
3,100,100


In [190]:
# Relabel df1's rows with letters (modifies df1 in place).
df1.index = ['a', 'b', 'c', 'd']
df1

Unnamed: 0,Class1,Class2
a,95,91
b,92,93
c,98,98
d,100,100


In [191]:
df4

Unnamed: 0,Class3
0,92
1,93
2,94
3,91
4,99


In [192]:
# Relabel df4's five rows 'a'..'e' so they can be matched against df1's letter labels
# (modifies df4 in place).
df4.index = list('abcde')
df4

Unnamed: 0,Class3
a,92
b,93
c,94
d,91
e,99


In [193]:
# Join on the letter labels, keeping df1's rows 'a'..'d'; df4's row 'e' is dropped.
df1.join(df4)

Unnamed: 0,Class1,Class2,Class3
a,95,91,92
b,92,93,93
c,98,98,94
d,100,100,91


In [194]:
# With df4 on the left all five of its rows are kept; row 'e' has no match in df1,
# so Class1/Class2 are NaN there and those columns become float.
df4.join(df1)

Unnamed: 0,Class3,Class1,Class2
a,92,95.0,91.0
b,93,92.0,93.0
c,94,98.0,98.0
d,91,100.0,100.0
e,99,,


### 특정 열을 기준으로 통합하기

In [203]:
# NOTE(review): `random` is not used in any visible cell; kept in case a later cell needs it.
import random
# Month labels '1월' .. '4월', used as the key column when merging.
month_list = [str(i)+'월' for i in range(1,5)]

# Seed NumPy's global generator so the sample sales figures are identical on every run;
# without a seed each execution produces different tables.
np.random.seed(42)
# Four arrays of four integers each, drawn from [50, 100).
prod_A, prod_B, prod_C, prod_D = [np.random.randint(50, 100, size=4) for _ in range(4)]


df_A_B = pd.DataFrame({'판매월': month_list,
                       '제품A': prod_A,
                       '제품B': prod_B})
df_A_B

Unnamed: 0,판매월,제품A,제품B
0,1월,99,98
1,2월,53,62
2,3월,87,83
3,4월,89,61


In [204]:
# Second table with the same month key column and the figures for products C and D.
df_C_D = pd.DataFrame(dict([
    ('판매월', month_list),
    ('제품C', prod_C),
    ('제품D', prod_D),
]))
df_C_D

Unnamed: 0,판매월,제품C,제품D
0,1월,89,55
1,2월,97,93
2,3월,65,89
3,4월,78,55


### merge

In [205]:
# With no arguments, merge matches rows on the column the two frames share ('판매월').
df_A_B.merge(df_C_D)

Unnamed: 0,판매월,제품A,제품B,제품C,제품D
0,1월,99,98,89,55
1,2월,53,62,97,93
2,3월,87,83,65,89
3,4월,89,61,78,55


#### merge - how

In [206]:
df_A_B

Unnamed: 0,판매월,제품A,제품B
0,1월,99,98
1,2월,53,62
2,3월,87,83
3,4월,89,61


In [207]:
# Seed NumPy's global generator so this table is identical on every run.
np.random.seed(43)
# Two arrays of four integers each, drawn from [50, 100).
prod_E, prod_F = [np.random.randint(50, 100, size=4) for _ in range(2)]
# Months 3-6: only '3월' and '4월' overlap with df_A_B, which is what makes the
# different merge strategies (left/right/inner/outer) give different results.
df_E_F = pd.DataFrame({'판매월': ['3월', '4월', '5월', '6월'],
                       '제품E': prod_E,
                       '제품F': prod_F})
df_E_F

Unnamed: 0,판매월,제품E,제품F
0,3월,88,87
1,4월,74,76
2,5월,89,55
3,6월,57,50


In [202]:
# how='left': keep every row of the left frame (df_A_B).
# NOTE(review): the recorded output is a NameError for df_E_F — the cell was evidently run
# before df_E_F was defined (execution counts are out of order); re-run to refresh it.
df_A_B.merge(df_E_F, how='left')

NameError: name 'df_E_F' is not defined

In [129]:
# how='right': keep every row of the right frame (df_E_F); months missing from df_A_B
# get NaN in the 제품A/제품B columns.
df_A_B.merge(df_E_F, how='right')

Unnamed: 0,판매월,제품A,제품B,제품E,제품F
0,3월,78.0,93.0,70,83
1,4월,61.0,85.0,82,71
2,5월,,,69,52
3,6월,,,73,55


In [130]:
# how='inner': keep only the months present in both frames ('3월' and '4월').
df_A_B.merge(df_E_F, how='inner')

Unnamed: 0,판매월,제품A,제품B,제품E,제품F
0,3월,78,93,70,83
1,4월,61,85,82,71


In [131]:
# how='outer': keep the months from either frame; cells with no source value are NaN.
df_A_B.merge(df_E_F, how='outer')

Unnamed: 0,판매월,제품A,제품B,제품E,제품F
0,1월,73.0,82.0,,
1,2월,66.0,86.0,,
2,3월,78.0,93.0,70.0,83.0
3,4월,61.0,85.0,82.0,71.0
4,5월,,,69.0,52.0
5,6월,,,73.0,55.0


In [211]:
# Same outer merge with the key column named explicitly through on=.
df_A_B.merge(df_E_F, how='outer', on="판매월")

Unnamed: 0,판매월,제품A,제품B,제품E,제품F
0,1월,99.0,98.0,,
1,2월,53.0,62.0,,
2,3월,87.0,83.0,88.0,87.0
3,4월,89.0,61.0,74.0,76.0
4,5월,,,89.0,55.0
5,6월,,,57.0,50.0


# 데이터 파일 읽고 쓰기

## CSV 파일

In [212]:
# Load a CSV file into a DataFrame. The path is relative to the working directory;
# the recorded run failed with FileNotFoundError because 'temp.csv' was not found there.
df1 = pd.read_csv('temp.csv')
# df1 = pd.read_csv('temp.csv', encoding='utf8') # de facto standard encoding; also the default
# df1 = pd.read_csv('temp.csv', encoding='cp949') # Windows (cp949 code page)

FileNotFoundError: File b'temp.csv' does not exist

## Excel 파일

In [219]:
# Load an Excel workbook into a DataFrame. The recorded run failed with FileNotFoundError
# because 'file.xlsx' was not found in the working directory.
df2 = pd.read_excel('file.xlsx')

# Specifying column data types while reading:
# df2 = pd.read_excel('tmp.xlsx', index_col=0, dtype={'Name': str, 'Value': float})  # doctest: +SKIP
#
# Specifying which strings should be treated as missing values (NaN):
# pd.read_excel('tmp.xlsx', index_col=0,
#               na_values=['string1', 'string2'])

FileNotFoundError: [Errno 2] No such file or directory: 'file.xlsx'

In [217]:
# Write df_A_B to 'file_csv.csv' (path relative to the working directory).
df_A_B.to_csv('file_csv.csv')

In [218]:
# Write df_A_B to the Excel workbook 'file_excel.xlsx' (path relative to the working directory).
df_A_B.to_excel('file_excel.xlsx')