
# Setting and removing indexes

pandas allows you to designate columns as an index. This enables cleaner code when taking subsets (as well as providing more efficient lookup under some circumstances).

In this chapter, you'll be exploring temperatures, a DataFrame of average temperatures in cities around the world. pandas is loaded as pd.


* Look at temperatures.
* Set the index of temperatures to "city", assigning to temperatures_ind.
* Look at temperatures_ind. How is it different from temperatures?
* Reset the index of temperatures_ind, keeping its contents.
* Reset the index of temperatures_ind, dropping its contents.

In [2]:
import pandas as pd

temperatures = pd.read_csv("/kaggle/input/temperature-change/Environment_Temperature_change_E_All_Data_NOFLAG.csv", index_col=0, encoding='ISO-8859-1')
temperatures.head()


Unnamed: 0_level_0,Area,Months Code,Months,Element Code,Element,Unit,Y1961,Y1962,Y1963,Y1964,...,Y2010,Y2011,Y2012,Y2013,Y2014,Y2015,Y2016,Y2017,Y2018,Y2019
Area Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,Afghanistan,7001,January,7271,Temperature change,°C,0.777,0.062,2.744,-5.232,...,3.601,1.179,-0.583,1.233,1.755,1.943,3.416,1.201,1.996,2.951
2,Afghanistan,7001,January,6078,Standard Deviation,°C,1.95,1.95,1.95,1.95,...,1.95,1.95,1.95,1.95,1.95,1.95,1.95,1.95,1.95,1.95
2,Afghanistan,7002,February,7271,Temperature change,°C,-1.743,2.465,3.919,-0.202,...,1.212,0.321,-3.201,1.494,-3.187,2.699,2.251,-0.323,2.705,0.086
2,Afghanistan,7002,February,6078,Standard Deviation,°C,2.597,2.597,2.597,2.597,...,2.597,2.597,2.597,2.597,2.597,2.597,2.597,2.597,2.597,2.597
2,Afghanistan,7003,March,7271,Temperature change,°C,0.516,1.336,0.403,1.659,...,3.39,0.748,-0.527,2.246,-0.076,-0.497,2.296,0.834,4.418,0.234


In [3]:
# Look at temperatures
print(temperatures)


                  Area  Months Code               Months  Element Code  \
Area Code                                                                
2          Afghanistan         7001              January          7271   
2          Afghanistan         7001              January          6078   
2          Afghanistan         7002             February          7271   
2          Afghanistan         7002             February          6078   
2          Afghanistan         7003                March          7271   
...                ...          ...                  ...           ...   
5873              OECD         7018          JunJulAug          6078   
5873              OECD         7019          SepOctNov          7271   
5873              OECD         7019          SepOctNov          6078   
5873              OECD         7020  Meteorological year          7271   
5873              OECD         7020  Meteorological year          6078   

                      Element Unit  Y

In [4]:
# Set the index of temperatures to city
temperatures_ind = temperatures.set_index("Area")

# Look at temperatures_ind
print(temperatures_ind)



             Months Code               Months  Element Code  \
Area                                                          
Afghanistan         7001              January          7271   
Afghanistan         7001              January          6078   
Afghanistan         7002             February          7271   
Afghanistan         7002             February          6078   
Afghanistan         7003                March          7271   
...                  ...                  ...           ...   
OECD                7018          JunJulAug          6078   
OECD                7019          SepOctNov          7271   
OECD                7019          SepOctNov          6078   
OECD                7020  Meteorological year          7271   
OECD                7020  Meteorological year          6078   

                        Element Unit  Y1961  Y1962  Y1963  Y1964  Y1965  ...  \
Area                                                                     ...   
Afghanistan  Tempera

In [5]:
# Reset the temperatures_ind index, keeping its contents
print(temperatures_ind.reset_index())



             Area  Months Code               Months  Element Code  \
0     Afghanistan         7001              January          7271   
1     Afghanistan         7001              January          6078   
2     Afghanistan         7002             February          7271   
3     Afghanistan         7002             February          6078   
4     Afghanistan         7003                March          7271   
...           ...          ...                  ...           ...   
9651         OECD         7018          JunJulAug          6078   
9652         OECD         7019          SepOctNov          7271   
9653         OECD         7019          SepOctNov          6078   
9654         OECD         7020  Meteorological year          7271   
9655         OECD         7020  Meteorological year          6078   

                 Element Unit  Y1961  Y1962  Y1963  Y1964  ...  Y2010  Y2011  \
0     Temperature change   °C  0.777  0.062  2.744 -5.232  ...  3.601  1.179   
1     Stand

In [6]:
# Reset the temperatures_ind index, dropping its contents
print(temperatures_ind.reset_index(drop = True) )

      Months Code               Months  Element Code             Element Unit  \
0            7001              January          7271  Temperature change   °C   
1            7001              January          6078  Standard Deviation   °C   
2            7002             February          7271  Temperature change   °C   
3            7002             February          6078  Standard Deviation   °C   
4            7003                March          7271  Temperature change   °C   
...           ...                  ...           ...                 ...  ...   
9651         7018          JunJulAug          6078  Standard Deviation   °C   
9652         7019          SepOctNov          7271  Temperature change   °C   
9653         7019          SepOctNov          6078  Standard Deviation   °C   
9654         7020  Meteorological year          7271  Temperature change   °C   
9655         7020  Meteorological year          6078  Standard Deviation   °C   

      Y1961  Y1962  Y1963  

# Subsetting with .loc[]

The killer feature for indexes is .loc[]: a subsetting method that accepts index values. When you pass it a single argument, it will take a subset of rows.

The code for subsetting using .loc[] can be easier to read than standard square bracket subsetting, which can make your code less burdensome to maintain.

pandas is loaded as pd. temperatures and temperatures_ind are available; the latter is indexed by city.


* Create a list called cities that contains "Moscow" and "Saint Petersburg".
* Use [] subsetting to filter temperatures for rows where the city column takes a value in the cities list.
* Use .loc[] subsetting to filter temperatures_ind for rows where the city is in the cities list.





In [7]:
# Make a list of cities to subset on
countries = ["India", "China"]

# Subset temperatures using square brackets
print(temperatures[temperatures["Area"].isin(countries)])



            Area  Months Code               Months  Element Code  \
Area Code                                                          
351        China         7001              January          7271   
351        China         7001              January          6078   
351        China         7002             February          7271   
351        China         7002             February          6078   
351        China         7003                March          7271   
...          ...          ...                  ...           ...   
100        India         7018          JunJulAug          6078   
100        India         7019          SepOctNov          7271   
100        India         7019          SepOctNov          6078   
100        India         7020  Meteorological year          7271   
100        India         7020  Meteorological year          6078   

                      Element Unit  Y1961  Y1962  Y1963  Y1964  ...  Y2010  \
Area Code                            

In [8]:
# Subset temperatures_ind using .loc[]
print(temperatures_ind.loc[countries])

       Months Code               Months  Element Code             Element  \
Area                                                                        
India         7001              January          7271  Temperature change   
India         7001              January          6078  Standard Deviation   
India         7002             February          7271  Temperature change   
India         7002             February          6078  Standard Deviation   
India         7003                March          7271  Temperature change   
...            ...                  ...           ...                 ...   
China         7018          JunJulAug          6078  Standard Deviation   
China         7019          SepOctNov          7271  Temperature change   
China         7019          SepOctNov          6078  Standard Deviation   
China         7020  Meteorological year          7271  Temperature change   
China         7020  Meteorological year          6078  Standard Deviation   

**.loc[] is used by all the best folk! Setting an index allows more concise code for subsetting rows via .loc[].**

**Setting multi-level indexes**

Indexes can also be made out of multiple columns, forming a multi-level index (sometimes called a hierarchical index). There is a trade-off to using these.

The benefit is that multi-level indexes make it more natural to reason about nested categorical variables. For example, in a clinical trial, you might have control and treatment groups. Then each test subject belongs to one or another group, and we can say that a test subject is nested inside the treatment group. Similarly, in the temperature dataset, the city is located in the country, so we can say a city is nested inside the country.

The main downside is that the code for manipulating indexes is different from the code for manipulating columns, so you have to learn two syntaxes and keep track of how your data is represented.


* Set the index of temperatures to the "country" and "city" columns, and assign this to temperatures_ind.
* Specify two country/city pairs to keep: "Brazil"/"Rio De Janeiro" and "Pakistan"/"Lahore", assigning to rows_to_keep.
* Print and subset temperatures_ind for rows_to_keep using .loc[]

In [None]:
# Index temperatures by country & city
temperatures_ind = temperatures.set_index(["country","city"])
# List of tuples: Brazil, Rio De Janeiro & Pakistan, Lahore
rows_to_keep = [("Brazil", "Rio De Janeiro"),("Pakistan", "Lahore")]

# Subset for rows to keep
print(temperatures_ind.loc[rows_to_keep])

![image.png](attachment:258811d1-9578-4630-b5c6-1b18b47a62b7.png)

Magnificent multi-level indexing! Multi-level indexes can make it easy to comprehend your dataset when one category is nested inside another category.

**Sorting by index values**

Previously, you changed the order of the rows in a DataFrame by calling .sort_values(). It's also useful to be able to sort by elements in the index. For this, you need to use .sort_index().

pandas is loaded as pd. temperatures_ind has a multi-level index of country and city, and is available.


* Sort temperatures_ind by the index values.
* Sort temperatures_ind by the index values at the "city" level.
* Sort temperatures_ind by ascending country then descending city

In [10]:
# Sort temperatures_ind by index values
print(temperatures_ind.sort_index())



             Months Code       Months  Element Code             Element Unit  \
Area                                                                           
Afghanistan         7001      January          7271  Temperature change   °C   
Afghanistan         7010      October          6078  Standard Deviation   °C   
Afghanistan         7011     November          7271  Temperature change   °C   
Afghanistan         7011     November          6078  Standard Deviation   °C   
Afghanistan         7012     December          7271  Temperature change   °C   
...                  ...          ...           ...                 ...  ...   
Zimbabwe            7011     November          6078  Standard Deviation   °C   
Zimbabwe            7011     November          7271  Temperature change   °C   
Zimbabwe            7010      October          6078  Standard Deviation   °C   
Zimbabwe            7017  MarAprMay          7271  Temperature change   °C   
Zimbabwe            7001      January   

In [11]:
# Sort temperatures_ind by index values at the city level
print(temperatures_ind.sort_index(level = "Months Code"))



             Months Code       Months  Element Code             Element Unit  \
Area                                                                           
Afghanistan         7001      January          7271  Temperature change   °C   
Afghanistan         7010      October          6078  Standard Deviation   °C   
Afghanistan         7011     November          7271  Temperature change   °C   
Afghanistan         7011     November          6078  Standard Deviation   °C   
Afghanistan         7012     December          7271  Temperature change   °C   
...                  ...          ...           ...                 ...  ...   
Zimbabwe            7011     November          6078  Standard Deviation   °C   
Zimbabwe            7011     November          7271  Temperature change   °C   
Zimbabwe            7010      October          6078  Standard Deviation   °C   
Zimbabwe            7017  MarAprMay          7271  Temperature change   °C   
Zimbabwe            7001      January   

In [16]:
# Ensure MultiIndex is set correctly
temperatures_ind = temperatures_ind.set_index(["Element Code", "Months Code"])

# Now sort by 'Area' and then descending 'Months Code'
print(temperatures_ind.sort_index(level=["Element Code", "Months Code"], ascending=[True, False]))


                                       Months             Element Unit  Y1961  \
Element Code Months Code                                                        
6078         7020         Meteorological year  Standard Deviation   °C  0.548   
             7020         Meteorological year  Standard Deviation   °C  0.425   
             7020         Meteorological year  Standard Deviation   °C  0.394   
             7020         Meteorological year  Standard Deviation   °C  0.224   
             7020         Meteorological year  Standard Deviation   °C  0.415   
...                                       ...                 ...  ...    ...   
7271         7001                     January  Temperature change   °C  0.350   
             7001                     January  Temperature change   °C  0.266   
             7001                     January  Temperature change   °C  0.808   
             7001                     January  Temperature change   °C  0.108   
             7001           

**Sorted! Sorting index values is similar to sorting values in columns, except that you call .sort_index() instead of .sort_values().**