# Indexing in Pandas

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [51]:
pd.set_option('display.precision',2)

## Date-time index

In [2]:
str1 = '2021-04-07'
today = pd.to_datetime(str1)

In [3]:
today

Timestamp('2021-04-07 00:00:00')

In [4]:
strs = ['April 7, 2022','Apr. 7, 2022','Thursday, April 7, 2022', '7 April 2022']

In [5]:
for date in strs:
    print(pd.to_datetime(date))

2022-04-07 00:00:00
2022-04-07 00:00:00
2022-04-07 00:00:00
2022-04-07 00:00:00


In [6]:
str2 = '4/7/2022 11:22:35'

In [7]:
pd.to_datetime(str2)

Timestamp('2022-04-07 11:22:35')

In [8]:
str3 = '4/7/2022 X 11:22:35'

In [9]:
# won't work
#pd.to_datetime(str3)

In [10]:
pd.to_datetime(str3, format='%m/%d/%Y X %H:%M:%S')

Timestamp('2022-04-07 11:22:35')

The official [Python documentation](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior) contains a complete list of possible %something patterns that are accepted in the format argument.

In [11]:
url = "https://datascience.quantecon.org/assets/data/state_unemployment.csv"
unemp_raw = pd.read_csv(url) #parse_dates=["Date"]

In [12]:
unemp_all = (
    unemp_raw
    .reset_index()
    .pivot_table(index="Date", columns="state", values="UnemploymentRate")
)
unemp_all.head()

state,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,Florida,Georgia,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-01,4.7,6.3,4.1,4.4,5.0,2.8,2.8,3.5,3.7,3.7,...,2.4,3.7,4.6,3.1,2.7,2.6,4.9,5.8,3.2,4.1
2000-02-01,4.7,6.3,4.1,4.3,5.0,2.8,2.7,3.6,3.7,3.6,...,2.4,3.7,4.6,3.1,2.6,2.5,4.9,5.6,3.2,3.9
2000-03-01,4.6,6.3,4.0,4.3,5.0,2.7,2.6,3.6,3.7,3.6,...,2.4,3.8,4.5,3.1,2.6,2.4,5.0,5.5,3.3,3.9
2000-04-01,4.6,6.3,4.0,4.3,5.1,2.7,2.5,3.7,3.7,3.7,...,2.4,3.8,4.4,3.1,2.7,2.4,5.0,5.4,3.4,3.8
2000-05-01,4.5,6.3,4.0,4.2,5.1,2.7,2.4,3.7,3.7,3.7,...,2.4,3.9,4.3,3.2,2.7,2.3,5.1,5.4,3.5,3.8


In [13]:
states = [
    "Arizona", "California", "Florida", "Illinois",
    "Michigan", "New York", "Texas"
]
unemp = unemp_all[states]
unemp.head()

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,4.1,5.0,3.7,4.2,3.3,4.7,4.6
2000-02-01,4.1,5.0,3.7,4.2,3.2,4.7,4.6
2000-03-01,4.0,5.0,3.7,4.3,3.2,4.6,4.5
2000-04-01,4.0,5.1,3.7,4.3,3.3,4.6,4.4
2000-05-01,4.0,5.1,3.7,4.3,3.5,4.6,4.3


In [14]:
unemp.index

Index(['2000-01-01', '2000-02-01', '2000-03-01', '2000-04-01', '2000-05-01',
       '2000-06-01', '2000-07-01', '2000-08-01', '2000-09-01', '2000-10-01',
       ...
       '2017-03-01', '2017-04-01', '2017-05-01', '2017-06-01', '2017-07-01',
       '2017-08-01', '2017-09-01', '2017-10-01', '2017-11-01', '2017-12-01'],
      dtype='object', name='Date', length=216)

In [15]:
pd.to_datetime(unemp.index)

DatetimeIndex(['2000-01-01', '2000-02-01', '2000-03-01', '2000-04-01',
               '2000-05-01', '2000-06-01', '2000-07-01', '2000-08-01',
               '2000-09-01', '2000-10-01',
               ...
               '2017-03-01', '2017-04-01', '2017-05-01', '2017-06-01',
               '2017-07-01', '2017-08-01', '2017-09-01', '2017-10-01',
               '2017-11-01', '2017-12-01'],
              dtype='datetime64[ns]', name='Date', length=216, freq=None)

In [16]:
unemp.index

Index(['2000-01-01', '2000-02-01', '2000-03-01', '2000-04-01', '2000-05-01',
       '2000-06-01', '2000-07-01', '2000-08-01', '2000-09-01', '2000-10-01',
       ...
       '2017-03-01', '2017-04-01', '2017-05-01', '2017-06-01', '2017-07-01',
       '2017-08-01', '2017-09-01', '2017-10-01', '2017-11-01', '2017-12-01'],
      dtype='object', name='Date', length=216)

In [17]:
new = unemp.set_index(pd.to_datetime(unemp.index))

In [18]:
new.index

DatetimeIndex(['2000-01-01', '2000-02-01', '2000-03-01', '2000-04-01',
               '2000-05-01', '2000-06-01', '2000-07-01', '2000-08-01',
               '2000-09-01', '2000-10-01',
               ...
               '2017-03-01', '2017-04-01', '2017-05-01', '2017-06-01',
               '2017-07-01', '2017-08-01', '2017-09-01', '2017-10-01',
               '2017-11-01', '2017-12-01'],
              dtype='datetime64[ns]', name='Date', length=216, freq=None)

In [19]:
unemp.set_index(pd.to_datetime(unemp.index),inplace=True)

In [20]:
unemp.index

DatetimeIndex(['2000-01-01', '2000-02-01', '2000-03-01', '2000-04-01',
               '2000-05-01', '2000-06-01', '2000-07-01', '2000-08-01',
               '2000-09-01', '2000-10-01',
               ...
               '2017-03-01', '2017-04-01', '2017-05-01', '2017-06-01',
               '2017-07-01', '2017-08-01', '2017-09-01', '2017-10-01',
               '2017-11-01', '2017-12-01'],
              dtype='datetime64[ns]', name='Date', length=216, freq=None)

In [21]:
new.index

DatetimeIndex(['2000-01-01', '2000-02-01', '2000-03-01', '2000-04-01',
               '2000-05-01', '2000-06-01', '2000-07-01', '2000-08-01',
               '2000-09-01', '2000-10-01',
               ...
               '2017-03-01', '2017-04-01', '2017-05-01', '2017-06-01',
               '2017-07-01', '2017-08-01', '2017-09-01', '2017-10-01',
               '2017-11-01', '2017-12-01'],
              dtype='datetime64[ns]', name='Date', length=216, freq=None)

In [22]:
new.head()

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,4.1,5.0,3.7,4.2,3.3,4.7,4.6
2000-02-01,4.1,5.0,3.7,4.2,3.2,4.7,4.6
2000-03-01,4.0,5.0,3.7,4.3,3.2,4.6,4.5
2000-04-01,4.0,5.1,3.7,4.3,3.3,4.6,4.4
2000-05-01,4.0,5.1,3.7,4.3,3.5,4.6,4.3


In [23]:
new.loc['2002']

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2002-01-01,5.9,6.5,6.0,6.3,6.4,6.1,6.1
2002-02-01,6.0,6.6,5.9,6.4,6.4,6.2,6.2
2002-03-01,6.0,6.6,5.9,6.5,6.4,6.3,6.3
2002-04-01,6.1,6.6,5.8,6.6,6.4,6.3,6.3
2002-05-01,6.1,6.6,5.7,6.6,6.3,6.2,6.3
2002-06-01,6.2,6.6,5.6,6.6,6.3,6.1,6.4
2002-07-01,6.2,6.6,5.6,6.6,6.2,6.1,6.4
2002-08-01,6.2,6.7,5.5,6.6,6.1,6.0,6.4
2002-09-01,6.2,6.7,5.5,6.5,6.1,6.0,6.4
2002-10-01,6.1,6.7,5.4,6.5,6.1,6.1,6.5


In [24]:
new.loc['2001':'2002']

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2001-01-01,4.0,4.8,3.8,4.8,4.4,4.2,4.1
2001-02-01,4.1,4.8,3.9,4.9,4.6,4.2,4.2
2001-03-01,4.2,4.9,4.0,5.0,4.7,4.2,4.3
2001-04-01,4.3,5.0,4.1,5.1,4.8,4.2,4.5
2001-05-01,4.4,5.1,4.2,5.1,4.8,4.4,4.6
2001-06-01,4.6,5.2,4.3,5.1,4.9,4.5,4.8
2001-07-01,4.8,5.4,4.4,5.2,5.0,4.7,5.0
2001-08-01,5.0,5.6,4.5,5.3,5.2,5.0,5.2
2001-09-01,5.2,5.8,4.7,5.5,5.5,5.3,5.4
2001-10-01,5.4,6.0,5.8,5.7,5.8,5.5,5.6


In [25]:
# add a new row
new.loc[pd.to_datetime('1995-06-15')] = [6.0, 4.6, 5.3, 6.7, 5.5, 5.5, 4.5 ]

In [26]:
new

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,4.1,5.0,3.7,4.2,3.3,4.7,4.6
2000-02-01,4.1,5.0,3.7,4.2,3.2,4.7,4.6
2000-03-01,4.0,5.0,3.7,4.3,3.2,4.6,4.5
2000-04-01,4.0,5.1,3.7,4.3,3.3,4.6,4.4
2000-05-01,4.0,5.1,3.7,4.3,3.5,4.6,4.3
...,...,...,...,...,...,...,...
2017-09-01,4.7,4.5,3.9,5.0,4.7,4.7,4.0
2017-10-01,4.7,4.5,3.9,4.9,4.7,4.7,3.9
2017-11-01,4.7,4.5,3.9,4.9,4.7,4.7,3.9
2017-12-01,4.7,4.5,3.9,4.9,4.7,4.7,4.0


In [27]:
new.sort_index()

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1995-06-15,6.0,4.6,5.3,6.7,5.5,5.5,4.5
2000-01-01,4.1,5.0,3.7,4.2,3.3,4.7,4.6
2000-02-01,4.1,5.0,3.7,4.2,3.2,4.7,4.6
2000-03-01,4.0,5.0,3.7,4.3,3.2,4.6,4.5
2000-04-01,4.0,5.1,3.7,4.3,3.3,4.6,4.4
...,...,...,...,...,...,...,...
2017-08-01,4.7,4.6,4.0,5.0,4.6,4.7,4.0
2017-09-01,4.7,4.5,3.9,5.0,4.7,4.7,4.0
2017-10-01,4.7,4.5,3.9,4.9,4.7,4.7,3.9
2017-11-01,4.7,4.5,3.9,4.9,4.7,4.7,3.9


In [28]:
new.head()

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,4.1,5.0,3.7,4.2,3.3,4.7,4.6
2000-02-01,4.1,5.0,3.7,4.2,3.2,4.7,4.6
2000-03-01,4.0,5.0,3.7,4.3,3.2,4.6,4.5
2000-04-01,4.0,5.1,3.7,4.3,3.3,4.6,4.4
2000-05-01,4.0,5.1,3.7,4.3,3.5,4.6,4.3


In [29]:
new.sort_index(inplace=True)

In [30]:
new

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1995-06-15,6.0,4.6,5.3,6.7,5.5,5.5,4.5
2000-01-01,4.1,5.0,3.7,4.2,3.3,4.7,4.6
2000-02-01,4.1,5.0,3.7,4.2,3.2,4.7,4.6
2000-03-01,4.0,5.0,3.7,4.3,3.2,4.6,4.5
2000-04-01,4.0,5.1,3.7,4.3,3.3,4.6,4.4
...,...,...,...,...,...,...,...
2017-08-01,4.7,4.6,4.0,5.0,4.6,4.7,4.0
2017-09-01,4.7,4.5,3.9,5.0,4.7,4.7,4.0
2017-10-01,4.7,4.5,3.9,4.9,4.7,4.7,3.9
2017-11-01,4.7,4.5,3.9,4.9,4.7,4.7,3.9


In [31]:
new.loc['1995']

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1995-06-15,6.0,4.6,5.3,6.7,5.5,5.5,4.5


In [32]:
new.loc['Jan. 1995':'February, 2000']

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1995-06-15,6.0,4.6,5.3,6.7,5.5,5.5,4.5
2000-01-01,4.1,5.0,3.7,4.2,3.3,4.7,4.6
2000-02-01,4.1,5.0,3.7,4.2,3.2,4.7,4.6


### Accessing date properties

In [33]:
new.index.year

Int64Index([1995, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000,
            ...
            2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017],
           dtype='int64', name='Date', length=217)

In [34]:
new.index.day

Int64Index([15,  1,  1,  1,  1,  1,  1,  1,  1,  1,
            ...
             1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
           dtype='int64', name='Date', length=217)

In [35]:
new.index.month

Int64Index([ 6,  1,  2,  3,  4,  5,  6,  7,  8,  9,
            ...
             3,  4,  5,  6,  7,  8,  9, 10, 11, 12],
           dtype='int64', name='Date', length=217)

In [36]:
new2 = new.reset_index()

In [37]:
new2

state,Date,Arizona,California,Florida,Illinois,Michigan,New York,Texas
0,1995-06-15,6.0,4.6,5.3,6.7,5.5,5.5,4.5
1,2000-01-01,4.1,5.0,3.7,4.2,3.3,4.7,4.6
2,2000-02-01,4.1,5.0,3.7,4.2,3.2,4.7,4.6
3,2000-03-01,4.0,5.0,3.7,4.3,3.2,4.6,4.5
4,2000-04-01,4.0,5.1,3.7,4.3,3.3,4.6,4.4
...,...,...,...,...,...,...,...,...
212,2017-08-01,4.7,4.6,4.0,5.0,4.6,4.7,4.0
213,2017-09-01,4.7,4.5,3.9,5.0,4.7,4.7,4.0
214,2017-10-01,4.7,4.5,3.9,4.9,4.7,4.7,3.9
215,2017-11-01,4.7,4.5,3.9,4.9,4.7,4.7,3.9


In [38]:
new2['Date']

0     1995-06-15
1     2000-01-01
2     2000-02-01
3     2000-03-01
4     2000-04-01
         ...    
212   2017-08-01
213   2017-09-01
214   2017-10-01
215   2017-11-01
216   2017-12-01
Name: Date, Length: 217, dtype: datetime64[ns]

In [40]:
new2['Date'].dt.year

0      1995
1      2000
2      2000
3      2000
4      2000
       ... 
212    2017
213    2017
214    2017
215    2017
216    2017
Name: Date, Length: 217, dtype: int64

In [41]:
new2['Date'].dt.isocalendar().week

0      24
1      52
2       5
3       9
4      13
       ..
212    31
213    35
214    39
215    44
216    48
Name: week, Length: 217, dtype: UInt32

In [42]:
new

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1995-06-15,6.0,4.6,5.3,6.7,5.5,5.5,4.5
2000-01-01,4.1,5.0,3.7,4.2,3.3,4.7,4.6
2000-02-01,4.1,5.0,3.7,4.2,3.2,4.7,4.6
2000-03-01,4.0,5.0,3.7,4.3,3.2,4.6,4.5
2000-04-01,4.0,5.1,3.7,4.3,3.3,4.6,4.4
...,...,...,...,...,...,...,...
2017-08-01,4.7,4.6,4.0,5.0,4.6,4.7,4.0
2017-09-01,4.7,4.5,3.9,5.0,4.7,4.7,4.0
2017-10-01,4.7,4.5,3.9,4.9,4.7,4.7,3.9
2017-11-01,4.7,4.5,3.9,4.9,4.7,4.7,3.9


In [45]:
new.drop('1995-06-15',inplace=True)

In [46]:
new.head()

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,4.1,5.0,3.7,4.2,3.3,4.7,4.6
2000-02-01,4.1,5.0,3.7,4.2,3.2,4.7,4.6
2000-03-01,4.0,5.0,3.7,4.3,3.2,4.6,4.5
2000-04-01,4.0,5.1,3.7,4.3,3.3,4.6,4.4
2000-05-01,4.0,5.1,3.7,4.3,3.5,4.6,4.3


### Leads and lags

In [47]:
# Lag the data by 1 month
new.shift()

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,,,,,,,
2000-02-01,4.1,5.0,3.7,4.2,3.3,4.7,4.6
2000-03-01,4.1,5.0,3.7,4.2,3.2,4.7,4.6
2000-04-01,4.0,5.0,3.7,4.3,3.2,4.6,4.5
2000-05-01,4.0,5.1,3.7,4.3,3.3,4.6,4.4
...,...,...,...,...,...,...,...
2017-08-01,4.7,4.7,4.1,5.0,4.5,4.7,4.1
2017-09-01,4.7,4.6,4.0,5.0,4.6,4.7,4.0
2017-10-01,4.7,4.5,3.9,5.0,4.7,4.7,4.0
2017-11-01,4.7,4.5,3.9,4.9,4.7,4.7,3.9


In [48]:
# Lag the data by 2 months
new.shift(2)

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,,,,,,,
2000-02-01,,,,,,,
2000-03-01,4.1,5.0,3.7,4.2,3.3,4.7,4.6
2000-04-01,4.1,5.0,3.7,4.2,3.2,4.7,4.6
2000-05-01,4.0,5.0,3.7,4.3,3.2,4.6,4.5
...,...,...,...,...,...,...,...
2017-08-01,4.8,4.8,4.1,4.9,4.4,4.7,4.2
2017-09-01,4.7,4.7,4.1,5.0,4.5,4.7,4.1
2017-10-01,4.7,4.6,4.0,5.0,4.6,4.7,4.0
2017-11-01,4.7,4.5,3.9,5.0,4.7,4.7,4.0


In [49]:
# Lead the data by 1 month
new.shift(-1)

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,4.1,5.0,3.7,4.2,3.2,4.7,4.6
2000-02-01,4.0,5.0,3.7,4.3,3.2,4.6,4.5
2000-03-01,4.0,5.1,3.7,4.3,3.3,4.6,4.4
2000-04-01,4.0,5.1,3.7,4.3,3.5,4.6,4.3
2000-05-01,4.0,5.1,3.8,4.3,3.7,4.6,4.3
...,...,...,...,...,...,...,...
2017-08-01,4.7,4.5,3.9,5.0,4.7,4.7,4.0
2017-09-01,4.7,4.5,3.9,4.9,4.7,4.7,3.9
2017-10-01,4.7,4.5,3.9,4.9,4.7,4.7,3.9
2017-11-01,4.7,4.5,3.9,4.9,4.7,4.7,4.0


In [52]:
# percentage change from last month
new.pct_change()

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,,,,,,,
2000-02-01,0.00,0.00,0.00,0.00,-0.03,0.00,0.00
2000-03-01,-0.02,0.00,0.00,0.02,0.00,-0.02,-0.02
2000-04-01,0.00,0.02,0.00,0.00,0.03,0.00,-0.02
2000-05-01,0.00,0.00,0.00,0.00,0.06,0.00,-0.02
...,...,...,...,...,...,...,...
2017-08-01,0.00,-0.02,-0.02,0.00,0.02,0.00,-0.02
2017-09-01,0.00,-0.02,-0.03,0.00,0.02,0.00,0.00
2017-10-01,0.00,0.00,0.00,-0.02,0.00,0.00,-0.03
2017-11-01,0.00,0.00,0.00,0.00,0.00,0.00,0.00


### Changing the frequency of data

The user guide shows how the offsetting strings work: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects

In [53]:
# QS: beginning of calendar quarter
new.resample('QS').mean()

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,4.07,5.00,3.70,4.23,3.23,4.67,4.57
2000-04-01,4.00,5.10,3.73,4.30,3.50,4.60,4.33
2000-07-01,3.93,4.97,3.73,4.30,3.83,4.53,4.17
2000-10-01,3.90,4.77,3.70,4.50,4.03,4.30,4.03
2001-01-01,4.10,4.83,3.90,4.90,4.57,4.20,4.20
...,...,...,...,...,...,...,...
2016-10-01,5.20,5.37,4.73,5.50,5.13,4.87,4.80
2017-01-01,5.13,5.10,4.50,5.10,4.80,4.70,4.70
2017-04-01,4.90,4.90,4.20,4.90,4.40,4.70,4.37
2017-07-01,4.70,4.60,4.00,5.00,4.60,4.70,4.03


## Automatic Alignment

In [54]:
Florida_short = unemp_all.loc['2000':'2002']['Florida']

In [55]:
Florida_short

Date
2000-01-01    3.7
2000-02-01    3.7
2000-03-01    3.7
2000-04-01    3.7
2000-05-01    3.7
2000-06-01    3.8
2000-07-01    3.8
2000-08-01    3.7
2000-09-01    3.7
2000-10-01    3.7
2000-11-01    3.7
2000-12-01    3.7
2001-01-01    3.8
2001-02-01    3.9
2001-03-01    4.0
2001-04-01    4.1
2001-05-01    4.2
2001-06-01    4.3
2001-07-01    4.4
2001-08-01    4.5
2001-09-01    4.7
2001-10-01    5.8
2001-11-01    5.9
2001-12-01    6.0
Name: Florida, dtype: float64

In [56]:
unemp

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,4.1,5.0,3.7,4.2,3.3,4.7,4.6
2000-02-01,4.1,5.0,3.7,4.2,3.2,4.7,4.6
2000-03-01,4.0,5.0,3.7,4.3,3.2,4.6,4.5
2000-04-01,4.0,5.1,3.7,4.3,3.3,4.6,4.4
2000-05-01,4.0,5.1,3.7,4.3,3.5,4.6,4.3
...,...,...,...,...,...,...,...
2017-08-01,4.7,4.6,4.0,5.0,4.6,4.7,4.0
2017-09-01,4.7,4.5,3.9,5.0,4.7,4.7,4.0
2017-10-01,4.7,4.5,3.9,4.9,4.7,4.7,3.9
2017-11-01,4.7,4.5,3.9,4.9,4.7,4.7,3.9


In [57]:
unemp['Florida'] = Florida_short

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unemp['Florida'] = Florida_short


In [58]:
unemp

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,4.1,5.0,,4.2,3.3,4.7,4.6
2000-02-01,4.1,5.0,,4.2,3.2,4.7,4.6
2000-03-01,4.0,5.0,,4.3,3.2,4.6,4.5
2000-04-01,4.0,5.1,,4.3,3.3,4.6,4.4
2000-05-01,4.0,5.1,,4.3,3.5,4.6,4.3
...,...,...,...,...,...,...,...
2017-08-01,4.7,4.6,,5.0,4.6,4.7,4.0
2017-09-01,4.7,4.5,,5.0,4.7,4.7,4.0
2017-10-01,4.7,4.5,,4.9,4.7,4.7,3.9
2017-11-01,4.7,4.5,,4.9,4.7,4.7,3.9


In [59]:
unemp['Florida']

Date
2000-01-01   NaN
2000-02-01   NaN
2000-03-01   NaN
2000-04-01   NaN
2000-05-01   NaN
              ..
2017-08-01   NaN
2017-09-01   NaN
2017-10-01   NaN
2017-11-01   NaN
2017-12-01   NaN
Name: Florida, Length: 216, dtype: float64

In [60]:
Florida_short.index

Index(['2000-01-01', '2000-02-01', '2000-03-01', '2000-04-01', '2000-05-01',
       '2000-06-01', '2000-07-01', '2000-08-01', '2000-09-01', '2000-10-01',
       '2000-11-01', '2000-12-01', '2001-01-01', '2001-02-01', '2001-03-01',
       '2001-04-01', '2001-05-01', '2001-06-01', '2001-07-01', '2001-08-01',
       '2001-09-01', '2001-10-01', '2001-11-01', '2001-12-01'],
      dtype='object', name='Date')

In [61]:
unemp.index

DatetimeIndex(['2000-01-01', '2000-02-01', '2000-03-01', '2000-04-01',
               '2000-05-01', '2000-06-01', '2000-07-01', '2000-08-01',
               '2000-09-01', '2000-10-01',
               ...
               '2017-03-01', '2017-04-01', '2017-05-01', '2017-06-01',
               '2017-07-01', '2017-08-01', '2017-09-01', '2017-10-01',
               '2017-11-01', '2017-12-01'],
              dtype='datetime64[ns]', name='Date', length=216, freq=None)

In [66]:
temp = pd.DataFrame(Florida_short).set_index(pd.to_datetime(Florida_short.index))

In [71]:
unemp['Florida'] = temp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unemp['Florida'] = temp


In [72]:
unemp

state,Arizona,California,Florida,Illinois,Michigan,New York,Texas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,4.1,5.0,3.7,4.2,3.3,4.7,4.6
2000-02-01,4.1,5.0,3.7,4.2,3.2,4.7,4.6
2000-03-01,4.0,5.0,3.7,4.3,3.2,4.6,4.5
2000-04-01,4.0,5.1,3.7,4.3,3.3,4.6,4.4
2000-05-01,4.0,5.1,3.7,4.3,3.5,4.6,4.3
...,...,...,...,...,...,...,...
2017-08-01,4.7,4.6,,5.0,4.6,4.7,4.0
2017-09-01,4.7,4.5,,5.0,4.7,4.7,4.0
2017-10-01,4.7,4.5,,4.9,4.7,4.7,3.9
2017-11-01,4.7,4.5,,4.9,4.7,4.7,3.9


In [73]:
unemp['Florida']

Date
2000-01-01    3.7
2000-02-01    3.7
2000-03-01    3.7
2000-04-01    3.7
2000-05-01    3.7
             ... 
2017-08-01    NaN
2017-09-01    NaN
2017-10-01    NaN
2017-11-01    NaN
2017-12-01    NaN
Name: Florida, Length: 216, dtype: float64

## Hierarchical Index

In [74]:
url = "https://datascience.quantecon.org/assets/data/wdi_data.csv"
df = pd.read_csv(url)

In [75]:
df

Unnamed: 0,country,year,GovExpend,Consumption,Exports,Imports,GDP
0,Canada,2017,0.37,1.10,0.58,0.60,1.87
1,Canada,2016,0.36,1.06,0.58,0.58,1.81
2,Canada,2015,0.36,1.04,0.57,0.58,1.79
3,Canada,2014,0.35,1.01,0.55,0.57,1.78
4,Canada,2013,0.35,0.99,0.52,0.56,1.73
...,...,...,...,...,...,...,...
67,United States,2004,2.27,9.31,1.34,2.11,13.85
68,United States,2003,2.23,8.97,1.22,1.89,13.34
69,United States,2002,2.19,8.70,1.19,1.80,12.97
70,United States,2001,2.11,8.48,1.21,1.74,12.75


In [76]:
new4 = df.set_index(['country','year'])

In [77]:
new4

Unnamed: 0_level_0,Unnamed: 1_level_0,GovExpend,Consumption,Exports,Imports,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Canada,2017,0.37,1.10,0.58,0.60,1.87
Canada,2016,0.36,1.06,0.58,0.58,1.81
Canada,2015,0.36,1.04,0.57,0.58,1.79
Canada,2014,0.35,1.01,0.55,0.57,1.78
Canada,2013,0.35,0.99,0.52,0.56,1.73
...,...,...,...,...,...,...
United States,2004,2.27,9.31,1.34,2.11,13.85
United States,2003,2.23,8.97,1.22,1.89,13.34
United States,2002,2.19,8.70,1.19,1.80,12.97
United States,2001,2.11,8.48,1.21,1.74,12.75


### Slicing

In [79]:
new4.loc['Canada']

Unnamed: 0_level_0,GovExpend,Consumption,Exports,Imports,GDP
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017,0.37,1.1,0.58,0.6,1.87
2016,0.36,1.06,0.58,0.58,1.81
2015,0.36,1.04,0.57,0.58,1.79
2014,0.35,1.01,0.55,0.57,1.78
2013,0.35,0.99,0.52,0.56,1.73
2012,0.35,0.96,0.51,0.55,1.69
2011,0.35,0.94,0.49,0.53,1.66
2010,0.35,0.92,0.47,0.5,1.61
2009,0.34,0.89,0.44,0.44,1.57
2008,0.33,0.89,0.51,0.5,1.61


In [80]:
# put the multi-index in a tuple, not a list
new4.loc[('Canada',2010)]

GovExpend      0.35
Consumption    0.92
Exports        0.47
Imports        0.50
GDP            1.61
Name: (Canada, 2010), dtype: float64

In [82]:
# new4.loc[['Canada',2010]]
# won't work

In [83]:
new4.loc[['Canada','United States']]

Unnamed: 0_level_0,Unnamed: 1_level_0,GovExpend,Consumption,Exports,Imports,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Canada,2017,0.37,1.1,0.58,0.6,1.87
Canada,2016,0.36,1.06,0.58,0.58,1.81
Canada,2015,0.36,1.04,0.57,0.58,1.79
Canada,2014,0.35,1.01,0.55,0.57,1.78
Canada,2013,0.35,0.99,0.52,0.56,1.73
Canada,2012,0.35,0.96,0.51,0.55,1.69
Canada,2011,0.35,0.94,0.49,0.53,1.66
Canada,2010,0.35,0.92,0.47,0.5,1.61
Canada,2009,0.34,0.89,0.44,0.44,1.57
Canada,2008,0.33,0.89,0.51,0.5,1.61


In [84]:
new4.loc[['Canada','United States'],'GDP']

country        year
Canada         2017     1.87
               2016     1.81
               2015     1.79
               2014     1.78
               2013     1.73
               2012     1.69
               2011     1.66
               2010     1.61
               2009     1.57
               2008     1.61
               2007     1.60
               2006     1.56
               2005     1.52
               2004     1.48
               2003     1.43
               2002     1.41
               2001     1.37
               2000     1.34
United States  2017    17.35
               2016    16.97
               2015    16.71
               2014    16.24
               2013    15.85
               2012    15.57
               2011    15.22
               2010    14.99
               2009    14.62
               2008    15.00
               2007    15.02
               2006    14.74
               2005    14.33
               2004    13.85
               2003    13.34
               2002    

In [85]:
new4.loc[(['Canada','United States'],[2010,2011]),:]

Unnamed: 0_level_0,Unnamed: 1_level_0,GovExpend,Consumption,Exports,Imports,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Canada,2010,0.35,0.92,0.47,0.5,1.61
Canada,2011,0.35,0.94,0.49,0.53,1.66
United States,2010,2.51,10.19,1.85,2.36,14.99
United States,2011,2.43,10.38,1.98,2.49,15.22


In [86]:
new4.loc[(['Canada','United States'],[2010,2011]),'GDP']

country        year
Canada         2010     1.61
               2011     1.66
United States  2010    14.99
               2011    15.22
Name: GDP, dtype: float64

In [87]:
new4.loc[(['Canada','United States'],[2010,2011]),['GDP','Consumption']]

Unnamed: 0_level_0,Unnamed: 1_level_0,GDP,Consumption
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Canada,2010,1.61,0.92
Canada,2011,1.66,0.94
United States,2010,14.99,10.19
United States,2011,15.22,10.38


In [88]:
# Exercise
# locate imports for the United States for year 2012
new4.loc[('United States',2012),'Imports']

2.5606772224

Find GDP and consumption for Canada in 2010 and United States in 2011
* `df.loc[ multi-index, columns]`
* the multi-index needs to be in a pair of (), a tuple
* whatever data you want should go into a list if there are more than 1


In [89]:
new4.loc[[('Canada',2010),('United States',2011)],['GDP','Consumption']]

Unnamed: 0_level_0,Unnamed: 1_level_0,GDP,Consumption
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Canada,2010,1.61,0.92
United States,2011,15.22,10.38


In [91]:
new4.loc['Canada'].head()

Unnamed: 0_level_0,GovExpend,Consumption,Exports,Imports,GDP
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017,0.37,1.1,0.58,0.6,1.87
2016,0.36,1.06,0.58,0.58,1.81
2015,0.36,1.04,0.57,0.58,1.79
2014,0.35,1.01,0.55,0.57,1.78
2013,0.35,0.99,0.52,0.56,1.73


In [93]:
#new4.loc[2010]

In [94]:
new4.loc[pd.IndexSlice[:,2005],:]

Unnamed: 0_level_0,Unnamed: 1_level_0,GovExpend,Consumption,Exports,Imports,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Canada,2005,0.3,0.79,0.52,0.45,1.52
Germany,2005,0.59,1.87,1.18,1.03,3.21
United Kingdom,2005,0.49,1.58,0.64,0.72,2.4
United States,2005,2.29,9.64,1.43,2.25,14.33


In [95]:
newT = new4.T

In [96]:
newT

country,Canada,Canada,Canada,Canada,Canada,Canada,Canada,Canada,Canada,Canada,...,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
year,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,...,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
GovExpend,0.37,0.36,0.36,0.35,0.35,0.35,0.35,0.35,0.34,0.33,...,2.51,2.41,2.35,2.31,2.29,2.27,2.23,2.19,2.11,2.04
Consumption,1.1,1.06,1.04,1.01,0.99,0.96,0.94,0.92,0.89,0.89,...,10.01,10.14,10.16,9.94,9.64,9.31,8.97,8.7,8.48,8.27
Exports,0.58,0.58,0.57,0.55,0.52,0.51,0.49,0.47,0.44,0.51,...,1.65,1.8,1.7,1.56,1.43,1.34,1.22,1.19,1.21,1.29
Imports,0.6,0.58,0.58,0.57,0.56,0.55,0.53,0.5,0.44,0.5,...,2.09,2.4,2.46,2.4,2.25,2.11,1.89,1.8,1.74,1.79
GDP,1.87,1.81,1.79,1.78,1.73,1.69,1.66,1.61,1.57,1.61,...,14.62,15.0,15.02,14.74,14.33,13.85,13.34,12.97,12.75,12.62


In [97]:
newT.loc[:,'Canada']

year,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
GovExpend,0.37,0.36,0.36,0.35,0.35,0.35,0.35,0.35,0.34,0.33,0.32,0.31,0.3,0.3,0.29,0.29,0.28,0.27
Consumption,1.1,1.06,1.04,1.01,0.99,0.96,0.94,0.92,0.89,0.89,0.86,0.83,0.79,0.76,0.74,0.72,0.69,0.68
Exports,0.58,0.58,0.57,0.55,0.52,0.51,0.49,0.47,0.44,0.51,0.53,0.52,0.52,0.51,0.48,0.49,0.48,0.5
Imports,0.6,0.58,0.58,0.57,0.56,0.55,0.53,0.5,0.44,0.5,0.5,0.47,0.45,0.42,0.38,0.37,0.36,0.38
GDP,1.87,1.81,1.79,1.78,1.73,1.69,1.66,1.61,1.57,1.61,1.6,1.56,1.52,1.48,1.43,1.41,1.37,1.34


In [98]:
newT.loc[:,('Canada',2010)]

GovExpend      0.35
Consumption    0.92
Exports        0.47
Imports        0.50
GDP            1.61
Name: (Canada, 2010), dtype: float64

Exercise: locate Canadian data for years 2010 and 2011

## Cleaning data with an example