In [1]:
import numpy as np
import pandas as pd

## AirPassengers data
- 월별, 년도별 승객은 얼마나 될까?
- 이전달과 이번달의 승객의 차이는? 상승율은?
- 누적 승객수, 최고, 최소 승객수는?

In [7]:
df_time_series = pd.read_csv("AirPassengers.csv")
df_time_series.head()

Unnamed: 0,Month,#Passengers
0,1949-01,112
1,1949-02,118
2,1949-03,132
3,1949-04,129
4,1949-05,121


#### 시계열 데이터 처리할 때 유용한 함수들

In [8]:
df_time_series["step"] = range(len(df_time_series))
df_time_series["cum_sum"] = df_time_series["#Passengers"].cumsum()
df_time_series["cum_max"] = df_time_series["#Passengers"].cummax()
df_time_series["cum_min"] = df_time_series["#Passengers"].cummin()
df_time_series.head()

Unnamed: 0,Month,#Passengers,step,cum_sum,cum_max,cum_min
0,1949-01,112,0,112,112,112
1,1949-02,118,1,230,118,112
2,1949-03,132,2,362,132,112
3,1949-04,129,3,491,132,112
4,1949-05,121,4,612,132,112


#### month열에 있는 데이터를 year와 month로 나눔

In [9]:
temp_date = df_time_series["Month"].map(lambda x : x.split("-"))
temp_date = np.array(temp_date.values.tolist())
temp_date[:5]

array([['1949', '01'],
       ['1949', '02'],
       ['1949', '03'],
       ['1949', '04'],
       ['1949', '05']], dtype='<U4')

In [10]:
df_time_series["year"] = temp_date[:, 0]
df_time_series["month"] = temp_date[:, 1]
df_time_series.head()

Unnamed: 0,Month,#Passengers,step,cum_sum,cum_max,cum_min,year,month
0,1949-01,112,0,112,112,112,1949,1
1,1949-02,118,1,230,118,112,1949,2
2,1949-03,132,2,362,132,112,1949,3
3,1949-04,129,3,491,132,112,1949,4
4,1949-05,121,4,612,132,112,1949,5


In [11]:
df_time_series["diff"] = df_time_series["#Passengers"].diff().fillna(0)
df_time_series[:5]

Unnamed: 0,Month,#Passengers,step,cum_sum,cum_max,cum_min,year,month,diff
0,1949-01,112,0,112,112,112,1949,1,0.0
1,1949-02,118,1,230,118,112,1949,2,6.0
2,1949-03,132,2,362,132,112,1949,3,14.0
3,1949-04,129,3,491,132,112,1949,4,-3.0
4,1949-05,121,4,612,132,112,1949,5,-8.0


#### pct_change() : 이전 값에 비해 몇 % 증가 or 감소했는지 보여줌

In [14]:
df_time_series["#Passengers"].pct_change()

0           NaN
1      0.053571
2      0.118644
3     -0.022727
4     -0.062016
         ...   
139   -0.025723
140   -0.161716
141   -0.092520
142   -0.154013
143    0.107692
Name: #Passengers, Length: 144, dtype: float64

In [12]:
df_time_series["#Passengers"].pct_change().map(lambda x : x*100).map(lambda x : " %.2f" % x)

0          nan
1         5.36
2        11.86
3        -2.27
4        -6.20
        ...   
139      -2.57
140     -16.17
141      -9.25
142     -15.40
143      10.77
Name: #Passengers, Length: 144, dtype: object

In [13]:
df_time_series["pct"]= df_time_series["#Passengers"].pct_change().map(lambda x: "%.2f" % (x * 100))

In [16]:
df_time_series.groupby(['year']).sum()

Unnamed: 0_level_0,#Passengers,step,cum_sum,cum_max,cum_min,diff
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1949,1520,66,9891,1649,1328,6.0
1950,1676,210,28943,1909,1248,22.0
1951,2042,354,51480,2246,1248,26.0
1952,2364,498,77974,2653,1248,28.0
1953,2700,642,108826,3077,1248,7.0
1954,2867,786,141888,3444,1248,28.0
1955,3408,930,179692,4009,1248,49.0
1956,3939,1074,224217,4672,1248,28.0
1957,4421,1218,274497,5287,1248,30.0
1958,4572,1362,328644,5818,1248,1.0


In [15]:
df_time_series

Unnamed: 0,Month,#Passengers,step,cum_sum,cum_max,cum_min,year,month,diff,pct
0,1949-01,112,0,112,112,112,1949,01,0.0,
1,1949-02,118,1,230,118,112,1949,02,6.0,5.36
2,1949-03,132,2,362,132,112,1949,03,14.0,11.86
3,1949-04,129,3,491,132,112,1949,04,-3.0,-2.27
4,1949-05,121,4,612,132,112,1949,05,-8.0,-6.20
...,...,...,...,...,...,...,...,...,...,...
139,1960-08,606,139,38572,622,104,1960,08,-16.0,-2.57
140,1960-09,508,140,39080,622,104,1960,09,-98.0,-16.17
141,1960-10,461,141,39541,622,104,1960,10,-47.0,-9.25
142,1960-11,390,142,39931,622,104,1960,11,-71.0,-15.40


In [17]:
df_time_series.iloc[0]

Month          1949-01
#Passengers        112
step                 0
cum_sum            112
cum_max            112
cum_min            112
year              1949
month               01
diff                 0
pct                nan
Name: 0, dtype: object

In [20]:
df_time_series.loc[0]

Month          1949-01
#Passengers        112
step                 0
cum_sum            112
cum_max            112
cum_min            112
year              1949
month               01
diff                 0
pct                nan
Name: 0, dtype: object

In [21]:
df_time_series.iloc[0:1]

Unnamed: 0,Month,#Passengers,step,cum_sum,cum_max,cum_min,year,month,diff,pct
0,1949-01,112,0,112,112,112,1949,1,0.0,
