# Lesson 19b: Pandas - Data Frame - cont-1

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math

## Load the data

In [2]:
frame = pd.read_csv("Mcdonalds.csv", usecols = ["Item", "Category", "Serving Size", "Calories", "TotalFat"])
frame.head()

Unnamed: 0,Category,Item,Serving Size,Calories,TotalFat
0,Breakfast,Egg McMuffin,4.8 oz (136 g),300,13.0
1,Breakfast,Egg White Delight,4.8 oz (135 g),250,8.0
2,Breakfast,Sausage McMuffin,3.9 oz (111 g),370,23.0
3,Breakfast,Sausage McMuffin with Egg,5.7 oz (161 g),450,28.0
4,Breakfast,Sausage McMuffin with Egg Whites,5.7 oz (161 g),400,23.0


## Sorting and comparing data

In [3]:
frame.sort_values(by = "Calories", ascending = False).head()

Unnamed: 0,Category,Item,Serving Size,Calories,TotalFat
82,Chicken & Fish,Chicken McNuggets (40 piece),22.8 oz (646 g),1880,118.0
32,Breakfast,Big Breakfast with Hotcakes (Large Biscuit),15.3 oz (434 g),1150,60.0
31,Breakfast,Big Breakfast with Hotcakes (Regular Biscuit),14.8 oz (420 g),1090,56.0
34,Breakfast,Big Breakfast with Hotcakes and Egg Whites (La...,15.4 oz (437 g),1050,50.0
33,Breakfast,Big Breakfast with Hotcakes and Egg Whites (Re...,14.9 oz (423 g),990,46.0


In [4]:
# To convert int type to float type we use:

frame["Calories"] = frame["Calories"].astype("float")

# Now we spoil one value:

frame.loc[2, "Calories"] = np.NaN

frame.head()

Unnamed: 0,Category,Item,Serving Size,Calories,TotalFat
0,Breakfast,Egg McMuffin,4.8 oz (136 g),300.0,13.0
1,Breakfast,Egg White Delight,4.8 oz (135 g),250.0,8.0
2,Breakfast,Sausage McMuffin,3.9 oz (111 g),,23.0
3,Breakfast,Sausage McMuffin with Egg,5.7 oz (161 g),450.0,28.0
4,Breakfast,Sausage McMuffin with Egg Whites,5.7 oz (161 g),400.0,23.0


In [5]:
# What to do if we have NaN among values:
# Note that NaN is treated as the biggest number

frame.sort_values(by = "Calories").tail()

Unnamed: 0,Category,Item,Serving Size,Calories,TotalFat
34,Breakfast,Big Breakfast with Hotcakes and Egg Whites (La...,15.4 oz (437 g),1050.0,50.0
31,Breakfast,Big Breakfast with Hotcakes (Regular Biscuit),14.8 oz (420 g),1090.0,56.0
32,Breakfast,Big Breakfast with Hotcakes (Large Biscuit),15.3 oz (434 g),1150.0,60.0
82,Chicken & Fish,Chicken McNuggets (40 piece),22.8 oz (646 g),1880.0,118.0
2,Breakfast,Sausage McMuffin,3.9 oz (111 g),,23.0


In [6]:
# To show NaN at the beginning we use

frame.sort_values(by = "Calories", na_position = "first").head()


Unnamed: 0,Category,Item,Serving Size,Calories,TotalFat
2,Breakfast,Sausage McMuffin,3.9 oz (111 g),,23.0
114,Beverages,Diet Coke (Small),16 fl oz cup,0.0,0.0
139,Coffee & Tea,Iced Tea (Large),30 fl oz cup,0.0,0.0
138,Coffee & Tea,Iced Tea (Medium),21 fl oz cup,0.0,0.0
137,Coffee & Tea,Iced Tea (Small),16 fl oz cup,0.0,0.0


In [7]:
# To sort more than one column, and to make first column in ascending order and the second one 
# in descending order, we use

frame.sort_values(by = ["Category", "Item"], ascending = [True, False]).head(20)

Unnamed: 0,Category,Item,Serving Size,Calories,TotalFat
43,Beef & Pork,Quarter Pounder with Cheese,7.1 oz (202 g),520.0,26.0
45,Beef & Pork,Quarter Pounder with Bacon Habanero Ranch,8.3 oz (235 g),610.0,31.0
44,Beef & Pork,Quarter Pounder with Bacon & Cheese,8 oz (227 g),600.0,29.0
46,Beef & Pork,Quarter Pounder Deluxe,8.6 oz (244 g),540.0,27.0
56,Beef & Pork,McRib,7.3 oz (208 g),500.0,26.0
52,Beef & Pork,McDouble,5.2 oz (147 g),380.0,17.0
55,Beef & Pork,Jalapeño Double,5.6 oz (159 g),430.0,23.0
48,Beef & Pork,Hamburger,3.5 oz (98 g),240.0,8.0
47,Beef & Pork,Double Quarter Pounder with Cheese,10 oz (283 g),750.0,43.0
50,Beef & Pork,Double Cheeseburger,5.7 oz (161 g),430.0,21.0


In [8]:
# Now we will try to sort by index.

frame = pd.read_csv("Mcdonalds.csv", usecols = ["Item", "Category", "Serving Size", "Calories", "TotalFat"], 
                    index_col = "Item")
frame.head()

Unnamed: 0_level_0,Category,Serving Size,Calories,TotalFat
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Egg McMuffin,Breakfast,4.8 oz (136 g),300,13.0
Egg White Delight,Breakfast,4.8 oz (135 g),250,8.0
Sausage McMuffin,Breakfast,3.9 oz (111 g),370,23.0
Sausage McMuffin with Egg,Breakfast,5.7 oz (161 g),450,28.0
Sausage McMuffin with Egg Whites,Breakfast,5.7 oz (161 g),400,23.0


In [9]:
frame.sort_index(ascending = True).head()

Unnamed: 0_level_0,Category,Serving Size,Calories,TotalFat
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1% Low Fat Milk Jug,Beverages,1 carton (236 ml),100,2.5
Apple Slices,Snacks & Sides,1.2 oz (34 g),15,0.0
Bacon Buffalo Ranch McChicken,Chicken & Fish,5.7 oz (161 g),430,21.0
Bacon Cheddar McChicken,Chicken & Fish,6 oz (171 g),480,24.0
Bacon Clubhouse Burger,Beef & Pork,9.5 oz (270 g),720,40.0


## Control over types

In [10]:
frame.dtypes

Category         object
Serving Size     object
Calories          int64
TotalFat        float64
dtype: object

In [11]:
# Memory usage control:

frame.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
Index: 260 entries, Egg McMuffin to McFlurry with Reese's Peanut Butter Cups (Snack)
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Category      260 non-null    object 
 1   Serving Size  260 non-null    object 
 2   Calories      260 non-null    int64  
 3   TotalFat      260 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 61.5 KB


In [12]:
frame.loc[1, "Calories"] = np.NaN

frame.info(memory_usage = "deep")

# Note that if an int type value is changed to NaN, then the whole column type becomes float:

<class 'pandas.core.frame.DataFrame'>
Index: 261 entries, Egg McMuffin to 1
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Category      260 non-null    object 
 1   Serving Size  260 non-null    object 
 2   Calories      260 non-null    float64
 3   TotalFat      260 non-null    float64
dtypes: float64(2), object(2)
memory usage: 69.8 KB


In [13]:
# Now it is not possible to change the type of this column by using the method "astype()".
# We have to replace NaN by some number.

frame["Calories"].fillna(value = 0, inplace = True)
frame.head()

Unnamed: 0_level_0,Category,Serving Size,Calories,TotalFat
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Egg McMuffin,Breakfast,4.8 oz (136 g),300.0,13.0
Egg White Delight,Breakfast,4.8 oz (135 g),250.0,8.0
Sausage McMuffin,Breakfast,3.9 oz (111 g),370.0,23.0
Sausage McMuffin with Egg,Breakfast,5.7 oz (161 g),450.0,28.0
Sausage McMuffin with Egg Whites,Breakfast,5.7 oz (161 g),400.0,23.0


In [14]:
# Now we can change the type of this column:

frame["Calories"] = frame["Calories"].astype(int)
frame

Unnamed: 0_level_0,Category,Serving Size,Calories,TotalFat
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Egg McMuffin,Breakfast,4.8 oz (136 g),300,13.0
Egg White Delight,Breakfast,4.8 oz (135 g),250,8.0
Sausage McMuffin,Breakfast,3.9 oz (111 g),370,23.0
Sausage McMuffin with Egg,Breakfast,5.7 oz (161 g),450,28.0
Sausage McMuffin with Egg Whites,Breakfast,5.7 oz (161 g),400,23.0
...,...,...,...,...
McFlurry with Oreo Cookies (Medium),Smoothies & Shakes,13.4 oz (381 g),690,23.0
McFlurry with Oreo Cookies (Snack),Smoothies & Shakes,6.7 oz (190 g),340,11.0
McFlurry with Reese's Peanut Butter Cups (Medium),Smoothies & Shakes,14.2 oz (403 g),810,32.0
McFlurry with Reese's Peanut Butter Cups (Snack),Smoothies & Shakes,7.1 oz (202 g),410,16.0


### Reducing memory for repeated values within a column

In [15]:
frame["Category"].value_counts()

Coffee & Tea          95
Breakfast             42
Smoothies & Shakes    28
Chicken & Fish        27
Beverages             27
Beef & Pork           15
Snacks & Sides        13
Desserts               7
Salads                 6
Name: Category, dtype: int64

In [16]:
# Note that I can use a parameter "category" of the method "astype()"

frame["Category"] = frame["Category"].astype("category")

frame.head()

frame.info(memory_usage = "deep")

# Look at the memory saved.

# We can obviously repreat this method to other columns, if possible, and save more memory.

<class 'pandas.core.frame.DataFrame'>
Index: 261 entries, Egg McMuffin to 1
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Category      260 non-null    category
 1   Serving Size  260 non-null    object  
 2   Calories      261 non-null    int64   
 3   TotalFat      260 non-null    float64 
dtypes: category(1), float64(1), int64(1), object(1)
memory usage: 53.4 KB


## Ranking

In [17]:
# Ranking means ascribing positions to values:

frame.head()

Unnamed: 0_level_0,Category,Serving Size,Calories,TotalFat
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Egg McMuffin,Breakfast,4.8 oz (136 g),300,13.0
Egg White Delight,Breakfast,4.8 oz (135 g),250,8.0
Sausage McMuffin,Breakfast,3.9 oz (111 g),370,23.0
Sausage McMuffin with Egg,Breakfast,5.7 oz (161 g),450,28.0
Sausage McMuffin with Egg Whites,Breakfast,5.7 oz (161 g),400,23.0


In [18]:
frame["Calories"].rank().head()

Item
Egg McMuffin                        116.0
Egg White Delight                    83.5
Sausage McMuffin                    147.0
Sausage McMuffin with Egg           179.5
Sausage McMuffin with Egg Whites    157.0
Name: Calories, dtype: float64

In [19]:
frame["CaloriesRank"] = frame["Calories"].rank()

In [20]:
frame

Unnamed: 0_level_0,Category,Serving Size,Calories,TotalFat,CaloriesRank
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Egg McMuffin,Breakfast,4.8 oz (136 g),300,13.0,116.0
Egg White Delight,Breakfast,4.8 oz (135 g),250,8.0,83.5
Sausage McMuffin,Breakfast,3.9 oz (111 g),370,23.0,147.0
Sausage McMuffin with Egg,Breakfast,5.7 oz (161 g),450,28.0,179.5
Sausage McMuffin with Egg Whites,Breakfast,5.7 oz (161 g),400,23.0,157.0
...,...,...,...,...,...
McFlurry with Oreo Cookies (Medium),Smoothies & Shakes,13.4 oz (381 g),690,23.0,241.0
McFlurry with Oreo Cookies (Snack),Smoothies & Shakes,6.7 oz (190 g),340,11.0,133.5
McFlurry with Reese's Peanut Butter Cups (Medium),Smoothies & Shakes,14.2 oz (403 g),810,32.0,250.0
McFlurry with Reese's Peanut Butter Cups (Snack),Smoothies & Shakes,7.1 oz (202 g),410,16.0,160.5


In [21]:
frame.sort_values(by = "Calories").head()

# This does not look nice.

Unnamed: 0_level_0,Category,Serving Size,Calories,TotalFat,CaloriesRank
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,,,0,,9.0
Diet Dr Pepper (Medium),Beverages,21 fl oz cup,0,0.0,9.0
Diet Dr Pepper (Large),Beverages,30 fl oz cup,0,0.0,9.0
Diet Dr Pepper (Child),Beverages,12 fl oz cup,0,0.0,9.0
Diet Coke (Large),Beverages,30 fl oz cup,0,0.0,9.0


In [22]:
# Try again

frame["CaloriesRank"] = frame["Calories"].rank(ascending = False)
frame.sort_values(by = "Calories", ascending = False).head()

Unnamed: 0_level_0,Category,Serving Size,Calories,TotalFat,CaloriesRank
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Chicken McNuggets (40 piece),Chicken & Fish,22.8 oz (646 g),1880,118.0,1.0
Big Breakfast with Hotcakes (Large Biscuit),Breakfast,15.3 oz (434 g),1150,60.0,2.0
Big Breakfast with Hotcakes (Regular Biscuit),Breakfast,14.8 oz (420 g),1090,56.0,3.0
Big Breakfast with Hotcakes and Egg Whites (Large Biscuit),Breakfast,15.4 oz (437 g),1050,50.0,4.0
Big Breakfast with Hotcakes and Egg Whites (Regular Biscuit),Breakfast,14.9 oz (423 g),990,46.0,5.0


In [27]:
# Extract 3 the biggest values:

frame.sort_values(by = ["Calories", "Item"], ascending = [False,True], inplace = True)
frame.head(3)

Unnamed: 0_level_0,Category,Serving Size,Calories,TotalFat,CaloriesRank
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Chicken McNuggets (40 piece),Chicken & Fish,22.8 oz (646 g),1880,118.0,1.0
Big Breakfast with Hotcakes (Large Biscuit),Breakfast,15.3 oz (434 g),1150,60.0,2.0
Big Breakfast with Hotcakes (Regular Biscuit),Breakfast,14.8 oz (420 g),1090,56.0,3.0


In [28]:
# Note that there is a special function to do this:

frame.nlargest(n = 3, columns = "Calories")

Unnamed: 0_level_0,Category,Serving Size,Calories,TotalFat,CaloriesRank
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Chicken McNuggets (40 piece),Chicken & Fish,22.8 oz (646 g),1880,118.0,1.0
Big Breakfast with Hotcakes (Large Biscuit),Breakfast,15.3 oz (434 g),1150,60.0,2.0
Big Breakfast with Hotcakes (Regular Biscuit),Breakfast,14.8 oz (420 g),1090,56.0,3.0


In [31]:
frame.tail(5)

Unnamed: 0_level_0,Category,Serving Size,Calories,TotalFat,CaloriesRank
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Diet Dr Pepper (Small),Beverages,16 fl oz cup,0,0.0,253.0
Iced Tea (Child),Coffee & Tea,12 fl oz cup,0,0.0,253.0
Iced Tea (Large),Coffee & Tea,30 fl oz cup,0,0.0,253.0
Iced Tea (Medium),Coffee & Tea,21 fl oz cup,0,0.0,253.0
Iced Tea (Small),Coffee & Tea,16 fl oz cup,0,0.0,253.0


In [32]:
frame.nsmallest(n = 5, columns = "Calories")

Unnamed: 0_level_0,Category,Serving Size,Calories,TotalFat,CaloriesRank
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,,,0,,253.0
Coffee (Large),Coffee & Tea,16 fl oz cup,0,0.0,253.0
Coffee (Medium),Coffee & Tea,16 fl oz cup,0,0.0,253.0
Coffee (Small),Coffee & Tea,12 fl oz cup,0,0.0,253.0
Dasani Water Bottle,Beverages,16.9 fl oz,0,0.0,253.0


In [33]:
# Note that if the values are not unique, the two methods above give different results, 
# although they all belong to the same group of "CaloriesRank".