# Index DataFrame

In [1]:
import pandas as pd

## Create dataset

In [40]:
prices_dict = {
    "fruits": ["apples", "oranges", "bananas", "strawberries"],
    "prices": [1.5, 2, 2.5, 3],
    "suppliers": ["supplier1", "supplier2", "supplier4", "supplier3"],
}

prices_df = pd.DataFrame(prices_dict, index = [1,2,3,4])
prices_df

Unnamed: 0,fruits,prices,suppliers
1,apples,1.5,supplier1
2,oranges,2.0,supplier2
3,bananas,2.5,supplier4
4,strawberries,3.0,supplier3


In [41]:
prices_df.columns

Index(['fruits', 'prices', 'suppliers'], dtype='object')

## Basic indexing

df[colname] -> Series corresponding to colname

### Select Single Column 

In [3]:
## select single column - square bracket notation:
prices_col = prices_df['prices']
prices_col

1    1.5
2    2.0
3    2.5
4    3.0
Name: prices, dtype: float64

Remember, that a DataFrame Column is a Series object

In [4]:
print(type(prices_col))

<class 'pandas.core.series.Series'>


In [5]:
## select single column - attribute (dot) notation:
prices_df.prices

1    1.5
2    2.0
3    2.5
4    3.0
Name: prices, dtype: float64

#### square bracket vs dot notation
Note that square bracket notation is more canonical (can be used for 1 or multiple columns selection) and allows for any string to be used as selector. I.e you can't use the dot notation, if the column name contains spaces, or is a reserverd word (like max, min, etc.)


In [9]:
demo_df = pd.DataFrame([[1,2,3],[4,5,6]], columns=['col 1', 'col 2', 'col 3'])

# the line bellow will raise an error:

# but next is ok:
demo_df['col 1']


0    1
1    4
Name: col 1, dtype: int64

### Slicing ranges with [] operator

Slicing inside of [] **slices the rows**. This is provided largely as a convenience since it is such a common operation.

In [10]:
# get the first two rows:
prices_df[0:2]

Unnamed: 0,fruits,prices,suppliers
1,apples,1.5,supplier1
2,oranges,2.0,supplier2


In [11]:
# get all odd rows
prices_df[::2]

Unnamed: 0,fruits,prices,suppliers
1,apples,1.5,supplier1
3,bananas,2.5,supplier4


### Select List of Columns

Note, that the columns will be selected in the order specified in the list

In [12]:
prices_df[['prices', 'fruits']]

Unnamed: 0,prices,fruits
1,1.5,apples
2,2.0,oranges
3,2.5,bananas
4,3.0,strawberries


The returned slice is a DataFrame object!

In [13]:
type(prices_df[['prices', 'fruits']])

pandas.core.frame.DataFrame

In [14]:
type(prices_df['prices'])

pandas.core.series.Series


## Access data with the loc method

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.loc.html

Access a group of rows and columns by **label**(s) or a boolean array.

**Syntax**: df.loc[row_label,column_label]

In [15]:
prices_df

Unnamed: 0,fruits,prices,suppliers
1,apples,1.5,supplier1
2,oranges,2.0,supplier2
3,bananas,2.5,supplier4
4,strawberries,3.0,supplier3


In [21]:
# select value in row with label '1' and column  'prices'
prices_df.loc[1, 'prices']

np.float64(1.5)

In [22]:
# get all rows for columns 'fruits' and 'prices'
prices_df.loc[:, ['fruits', 'prices']]

# equivalent to:
# prices_df[['fruits', 'prices']]

Unnamed: 0,fruits,prices
1,apples,1.5
2,oranges,2.0
3,bananas,2.5
4,strawberries,3.0


In [32]:
prices_df.columns

Index(['prices', 'suppliers'], dtype='object')

If we have meaniningfull labeled indexes, we can see the real power of loc method. So lets set the fruits column data as index

In [43]:
prices_df.set_index('fruits', inplace=True)
prices_df.columns

Index(['prices', 'suppliers'], dtype='object')

In [46]:
prices_df.index.name

'fruits'

In [14]:
# get the price of 'oranges':
prices_df.loc['oranges', 'prices']

2.0

In [15]:
# get the price of 'oranges' and 'bananas':
prices_df.loc[['oranges','bananas'], 'prices']

fruits
oranges    2.0
bananas    2.5
Name: prices, dtype: float64

In [16]:
# lets reset the index back
prices_df.reset_index(inplace=True)
prices_df

Unnamed: 0,fruits,prices,suppliers
0,apples,1.5,supplier1
1,oranges,2.0,supplier2
2,bananas,2.5,supplier4
3,strawberries,3.0,supplier3


## Access data with the iloc method

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iloc.html

Purely **integer-location** based indexing for selection by position.

**Syntax**: df.loc[row_indexer,column_indexer]

In [17]:
prices_df

Unnamed: 0,fruits,prices,suppliers
0,apples,1.5,supplier1
1,oranges,2.0,supplier2
2,bananas,2.5,supplier4
3,strawberries,3.0,supplier3


In [18]:
# get the data in first row, second column
prices_df.iloc[0,1]

1.5

In [19]:
# get the data from second row till the end for all columns
prices_df.iloc[1:,]

Unnamed: 0,fruits,prices,suppliers
1,oranges,2.0,supplier2
2,bananas,2.5,supplier4
3,strawberries,3.0,supplier3


In [20]:
# get the cells from second row till the end, ant the last column (using the -1 index)
prices_df.iloc[1:,-1]

1    supplier2
2    supplier4
3    supplier3
Name: suppliers, dtype: object

#### pass Boolean array to loc/iloc method

As with Series, we can pass a Boolean array as index/column value in loc and iloc.
Note, that the index/column Boolean array must have the same shape as the DF index/columns

In [21]:
columns_mask = [False, True, False]
row_mask = [False, False, True, True]
prices_df.loc[row_mask, columns_mask]

Unnamed: 0,prices
2,2.5
3,3.0


In [22]:
# get all trows for data which have price > 2:
mask = prices_df.prices>2
prices_df.loc[mask]

# the same can be done with:
# prices_df[prices_df.prices>2]

Unnamed: 0,fruits,prices,suppliers
2,bananas,2.5,supplier4
3,strawberries,3.0,supplier3


In [23]:
# gat all fruit names starting with letter 'a'
mask = prices_df.fruits.str.startswith('a')
prices_df.loc[mask, 'fruits']

0    apples
Name: fruits, dtype: object

## Examples

In [24]:
# Load the dataset
url = "https://raw.githubusercontent.com/geekcourses/JupyterNotebooksExamples/master/datasets/various/drinks.csv"
drinks = pd.read_csv(url)

# Display the first few rows of the dataframe to understand its structure
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF


In [25]:
# Selecting the 'beer_servings' column
beer_servings = drinks['beer_servings']
beer_servings


0        0
1       89
2       25
3      245
4      217
      ... 
188    333
189    111
190      6
191     32
192     64
Name: beer_servings, Length: 193, dtype: int64

In [26]:
# Selecting the 'country' and 'wine_servings' columns
country_wine = drinks[['country', 'wine_servings']]
country_wine


Unnamed: 0,country,wine_servings
0,Afghanistan,0
1,Albania,54
2,Algeria,14
3,Andorra,312
4,Angola,45
...,...,...
188,Venezuela,3
189,Vietnam,1
190,Yemen,0
191,Zambia,4


In [27]:
# Selecting the first 5 rows
first_five_rows = drinks[:5]
first_five_rows


Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF


In [28]:
# Selecting rows for 'Andorra' and 'Angola' and columns 'beer_servings' and 'wine_servings'
andorra_angola = drinks.loc[[3, 4], ['beer_servings', 'wine_servings']]
andorra_angola

Unnamed: 0,beer_servings,wine_servings
3,245,312
4,217,45


In [29]:
# Selecting the first 5 rows and first 3 columns
first_five_rows_cols = drinks.iloc[:5, :3]
first_five_rows_cols


Unnamed: 0,country,beer_servings,spirit_servings
0,Afghanistan,0,0
1,Albania,89,132
2,Algeria,25,0
3,Andorra,245,138
4,Angola,217,57


In [30]:
# Selecting rows where 'beer_servings' is greater than 300
high_beer_servings = drinks[drinks['beer_servings'] > 300]
high_beer_servings


Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
45,Czech Republic,361,170,134,11.8,EU
62,Gabon,347,98,59,8.9,AF
65,Germany,346,117,175,11.3,EU
81,Ireland,313,118,165,11.4,EU
98,Lithuania,343,244,56,12.9,EU
117,Namibia,376,3,1,6.8,AF
129,Palau,306,63,23,6.9,OC
135,Poland,343,215,56,10.9,EU
188,Venezuela,333,100,3,7.7,SA


In [31]:
# Filter the rows where beer_servings is greater than 100 and wine_servings is less than 200.
mask = (drinks['beer_servings'] > 100) & (drinks['wine_servings'] < 200)
filtered_data = drinks[mask]
filtered_data

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
4,Angola,217,57,45,5.9,AF
5,Antigua & Barbuda,102,128,45,4.9,
9,Austria,279,75,191,9.7,EU
11,Bahamas,122,176,51,6.3,
14,Barbados,143,173,36,6.3,
...,...,...,...,...,...,...
180,Ukraine,206,237,45,8.9,EU
182,United Kingdom,219,126,195,10.4,EU
184,USA,249,158,84,8.7,
188,Venezuela,333,100,3,7.7,SA
