# Pandas 101: Solutions Notebook


**Important**: For the sake of your learning, we strongly advise you to look at this notebook only to compare it with your own answers, or if you are really stuck somewhere.

First things first, the imports:

In [1]:
import pandas as pd
import numpy as np
from hashlib import sha256
import json

In this notebook the following is tested:

- Pandas Series
- Pandas DataFrames
- Printing the columns
- Load a dataset
- Preview a dataframe
- Make use of info and describe

## Exercise 1: Series

In this first exercise the goal is to get used to creating series.

#### 1.1) Create a series with the name of the football club champions of 2019:

In [2]:
# Champions of 2019, in the order provided by the exercise.
football_clubs_list = [
    'Barcelona',
    'Benfica',
    'Juventus',
    'PSG',
    'Manchester City',
]

# No index is passed, so pandas generates the default integer index (0..4).
football_clubs_series = pd.Series(data=football_clubs_list)

In [3]:
# Run this cell to see the result: the bare last expression is rendered as the cell output.
football_clubs_series

0          Barcelona
1            Benfica
2           Juventus
3                PSG
4    Manchester City
dtype: object

In [4]:
# Sanity checks for exercise 1.1.
# The type check comes first so that a wrong type fails with a clear
# AssertionError instead of an AttributeError raised by the checks below.
assert isinstance(football_clubs_series, pd.Series)
assert len(football_clubs_series.index) == 5
# One assert per expected value (rather than joining the comparisons with the
# bitwise `&` operator) so a failure points at the exact value that is wrong.
assert football_clubs_series.values[3] == 'PSG'
assert football_clubs_series.values[-1] == 'Manchester City'

In [5]:
#### 1.2) Create a series for the countries of each football club (using the football clubs as index):

In [6]:
# Clubs (used as index labels) and countries (used as values), aligned by position.
football_clubs_list = ['Barcelona', 'Benfica', 'Juventus', 'PSG', 'Manchester City']
country_list = ['Spain', 'Portugal', 'Italy', 'France', 'England']

# Passing `index=` labels each country with its football club.
country_series = pd.Series(
    country_list,
    index=football_clubs_list,
)

In [7]:
# Run this cell to see the result: the countries, labelled by football club.
country_series

Barcelona             Spain
Benfica            Portugal
Juventus              Italy
PSG                  France
Manchester City     England
dtype: object

In [8]:
# Sanity checks for exercise 1.2.
assert isinstance(country_series, pd.Series)
# The comparison must sit outside len(): `len(country_series == 5)` measures an
# element-wise boolean Series, which is truthy for any non-empty input, so the
# original check could never fail on a wrong length.
assert len(country_series) == 5
# Use .iloc for positional access: the positional fallback of [] with an
# integer key on a label-indexed Series is deprecated in pandas 2.x.
assert country_series.iloc[-3] == 'Italy'
assert country_series.index[3] == 'PSG'

#### 1.3) Get only the index values of a given series

In [9]:
# Countries labelled by football club.
series = pd.Series(
    data=['Spain', 'Portugal', 'Italy', 'France', 'England'],
    index=['Barcelona', 'Benfica', 'Juventus', 'PSG', 'Manchester City'],
)

# The `.index` attribute holds the labels of the series as a pandas Index.
index_series = series.index

In [10]:
# Run this cell to see the result: the index labels of the series.
index_series

Index(['Barcelona', 'Benfica', 'Juventus', 'PSG', 'Manchester City'], dtype='object')

In [11]:
# Sanity checks for exercise 1.3.
# pd.Index is the public name of the class; the private module path
# pd.core.indexes.base is an implementation detail.
assert isinstance(index_series, pd.Index)
# The comparison must sit outside len(): `len(index_series == 5)` measures an
# element-wise boolean array, which is truthy for any non-empty input.
assert len(index_series) == 5
assert index_series[2] == 'Juventus'

## Exercise 2: DataFrames

In this exercise the goal is to create a simple DataFrame.

#### 2.1) Create a DataFrame with FootballClub/Country/TotalMarketValue columns

In [12]:
# Column data, aligned by position: entry i of every list describes the same club.
football_clubs_list = ['Barcelona', 'Benfica', 'Juventus', 'PSG', 'Manchester City']
country_list = ['Spain', 'Portugal', 'Italy', 'France', 'England']
totalmarketvalue_list = [1070.0, 365.50, 745.50, 922.25, 1170.0]

# The dictionary keys become the column names, in insertion order.
# No index is passed, so pandas generates the default 0..4 row index.
winners_2019 = pd.DataFrame(
    data={
        'FootballClub': football_clubs_list,
        'Country': country_list,
        'TotalMarketValue': totalmarketvalue_list,
    }
)

In [13]:
# Run this cell to see the result.
# The expected output is a dataframe with columns named
# "FootballClub", "Country" and "TotalMarketValue", indexed from 0 to 4.
winners_2019

Unnamed: 0,FootballClub,Country,TotalMarketValue
0,Barcelona,Spain,1070.0
1,Benfica,Portugal,365.5
2,Juventus,Italy,745.5
3,PSG,France,922.25
4,Manchester City,England,1170.0


In [14]:
# Sanity checks for exercise 2.1: type, dimensions, one cell and one column name.
assert isinstance(winners_2019, pd.DataFrame)
assert winners_2019.shape == (5, 3)
assert winners_2019.iloc[3, 1] == 'France'
assert winners_2019.columns[2] == 'TotalMarketValue'

#### 2.2) Create a DataFrame with the Country/TotalMarketValue columns, but with FootballClub as row indexes

In [15]:
# Raw data, aligned by position: entry i of every list describes the same club.
football_clubs_list = ['Barcelona', 'Benfica', 'Juventus', 'PSG', 'Manchester City']
country_list = ['Spain', 'Portugal', 'Italy', 'France', 'England']
totalmarketvalue_list = [1070.0, 365.50, 745.50, 922.25, 1170.0]

# Both series use the club names as their index, so the dataframe built from
# them is indexed by football club.
country_series = pd.Series(data=country_list, index=football_clubs_list)
totalmarketvalue_series = pd.Series(data=totalmarketvalue_list, index=football_clubs_list)

# The dictionary keys become the column names, in insertion order.
winners_2019_ind = pd.DataFrame(
    {
        'Country': country_series,
        'TotalMarketValue': totalmarketvalue_series,
    }
)

In [16]:
# Run this cell to see the result.
# The expected output is a dataframe with columns named "Country" and
# "TotalMarketValue", indexed by the football clubs.
winners_2019_ind

Unnamed: 0,Country,TotalMarketValue
Barcelona,Spain,1070.0
Benfica,Portugal,365.5
Juventus,Italy,745.5
PSG,France,922.25
Manchester City,England,1170.0


In [17]:
# Sanity checks for exercise 2.2: type, dimensions, one index label and one column name.
assert isinstance(winners_2019_ind, pd.DataFrame)
assert winners_2019_ind.shape == (5, 2)
assert winners_2019_ind.index[3] == 'PSG'
assert winners_2019_ind.columns[1] == 'TotalMarketValue'

# Keep a separate copy of the checked dataframe.
backup_winners_2019_ind = winners_2019_ind.copy()

## Exercise 3: Load the zomato dataset


In this exercise you will load the zomato data set.

You will then preview it and retrieve some information.

#### 3.1) Load the data set

In [18]:
# Load the zomato data set into a dataframe.
# It is located locally at: data/zomato_restaurants.csv
# It is located remotely at: https://raw.githubusercontent.com/vohcolab/PandaViz-Workshop/main/Pandas/Pandas%20101/data/zomato_restaurants.csv

# The remote path is used here so that the notebook also works in Colab.
# Note that this cell therefore needs network access to run.
data_path = "https://raw.githubusercontent.com/vohcolab/PandaViz-Workshop/main/Pandas/Pandas%20101/data/zomato_restaurants.csv"
zomato_df = pd.read_csv(data_path)

In [19]:
# Show the first rows to get an idea of what was just loaded.
zomato_df.head()

Unnamed: 0,id,name,locality,city,cuisines,establishment,average_cost_for_two,all_reviews_count,photo_count,has_table_booking,has_online_delivery,aggregate_rating,votes,timings
0,18487870,Boa-Bao,Chiado,Lisboa,"Oriental, Thai, Malaysian, Filipino, Vietnames...",Casual Dining,40,1521,3733,0,0,4.9,3943,"12:00 a 23:00 (Seg, Ter, Qua, Qui, Dom), 12:00..."
1,18550207,SOI,Cais do Sodré,Lisboa,"Chinese, Oriental, Japanese, Vietnamese, Korea...",Casual Dining,45,480,1363,0,0,4.3,1168,"12 Noon to 12 Midnight (Mon-Thu, Sun), 12 Noon..."
2,8212998,Ground Burger,Praça de Espanha,Lisboa,"Burger, American",Casual Dining,40,1430,2337,0,0,4.5,3000,12 Noon a 12 Midnight
3,18301225,Amaterasu Pateo do Sushi,Algés,Lisboa,"Sushi, Japanese",Casual Dining,40,676,1094,0,0,4.5,1615,"12:00 a 15:30, 19:00 a 23:30 (Seg-Dom)"
4,18605050,Contrabando,Santos,Lisboa,"American, Latin American, BBQ, Steak",Casual Dining,30,372,1117,1,0,4.3,869,"Seg-Qui, Dom (12 Noon to 11:59 PM), Sex-Sáb (1..."


In [20]:
# Sanity checks for exercise 3.1: a dataframe with 600 rows and 14 columns.
assert isinstance(zomato_df, pd.DataFrame)
assert zomato_df.shape == (600, 14)

#### 3.2) Set an index

In [21]:
# Use the "id" column as the row index.
# set_index returns a new dataframe, so the result is stored under a new name.
# (Alternative: pass index_col when loading the file, see the documentation:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)
zomato_df_ind = zomato_df.set_index(keys='id')

In [22]:
# Show the first rows of the re-indexed dataframe; "id" is now the row index.
zomato_df_ind.head()

Unnamed: 0_level_0,name,locality,city,cuisines,establishment,average_cost_for_two,all_reviews_count,photo_count,has_table_booking,has_online_delivery,aggregate_rating,votes,timings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
18487870,Boa-Bao,Chiado,Lisboa,"Oriental, Thai, Malaysian, Filipino, Vietnames...",Casual Dining,40,1521,3733,0,0,4.9,3943,"12:00 a 23:00 (Seg, Ter, Qua, Qui, Dom), 12:00..."
18550207,SOI,Cais do Sodré,Lisboa,"Chinese, Oriental, Japanese, Vietnamese, Korea...",Casual Dining,45,480,1363,0,0,4.3,1168,"12 Noon to 12 Midnight (Mon-Thu, Sun), 12 Noon..."
8212998,Ground Burger,Praça de Espanha,Lisboa,"Burger, American",Casual Dining,40,1430,2337,0,0,4.5,3000,12 Noon a 12 Midnight
18301225,Amaterasu Pateo do Sushi,Algés,Lisboa,"Sushi, Japanese",Casual Dining,40,676,1094,0,0,4.5,1615,"12:00 a 15:30, 19:00 a 23:30 (Seg-Dom)"
18605050,Contrabando,Santos,Lisboa,"American, Latin American, BBQ, Steak",Casual Dining,30,372,1117,1,0,4.3,869,"Seg-Qui, Dom (12 Noon to 11:59 PM), Sex-Sáb (1..."


In [23]:
# Sanity checks for exercise 3.2: "id" moved to the index, leaving 13 columns.
assert isinstance(zomato_df_ind, pd.DataFrame)
assert zomato_df_ind.shape == (600, 13)

#### 3.3) Get general information about the dataframe

In [24]:
# Print a summary of zomato_df_ind: index range, columns, non-null counts and dtypes.
# info() writes the report to the cell output by itself,
# so you don't need to use a print statement.
zomato_df_ind.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 600 entries, 18487870 to 17877644
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   name                  600 non-null    object 
 1   locality              600 non-null    object 
 2   city                  600 non-null    object 
 3   cuisines              596 non-null    object 
 4   establishment         517 non-null    object 
 5   average_cost_for_two  600 non-null    int64  
 6   all_reviews_count     600 non-null    int64  
 7   photo_count           600 non-null    int64  
 8   has_table_booking     600 non-null    int64  
 9   has_online_delivery   600 non-null    int64  
 10  aggregate_rating      600 non-null    float64
 11  votes                 600 non-null    int64  
 12  timings               572 non-null    object 
dtypes: float64(1), int64(6), object(6)
memory usage: 65.6+ KB


In [25]:
# How many columns of type "int" exist?
# Entered manually from the info() report, which lists 6 columns with dtype int64.
count_int = 6

In [26]:
# Expected answer, stored as a SHA-256 hex digest
# (presumably so that the answer is not readable in plain text).
hash_count = 'e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683'

# Hash the string form of the answer and compare the hex digests.
assert sha256(str(count_int).encode()).hexdigest() == hash_count

#### 3.4) Preview the top 7 entries

In [27]:
# Keep the top 7 entries of zomato_df_ind.
zomato_df_ind_top = zomato_df_ind.head(n=7)

In [28]:
# 7 rows, all 13 columns.
assert zomato_df_ind_top.shape == (7, 13)

#### 3.5) Preview the bottom 3 entries

In [29]:
# Keep the bottom 3 entries of zomato_df_ind.
zomato_df_ind_bottom = zomato_df_ind.tail(n=3)

In [30]:
# 3 rows, all 13 columns.
assert zomato_df_ind_bottom.shape == (3, 13)

#### 3.6) Use a method to determine all the types of data in zomato_df_ind

In [31]:
# Show the data type of every column in zomato_df_ind.
# dtypes is an attribute (no parentheses); the result lists one dtype per column.
zomato_df_ind.dtypes

name                     object
locality                 object
city                     object
cuisines                 object
establishment            object
average_cost_for_two      int64
all_reviews_count         int64
photo_count               int64
has_table_booking         int64
has_online_delivery       int64
aggregate_rating        float64
votes                     int64
timings                  object
dtype: object

#### 3.7) Print the number of rows and the number of columns

In [32]:
# Get nr_rows and nr_cols from zomato_df_ind.
# shape is an attribute holding a (rows, columns) tuple; unpack it directly.
nr_rows, nr_cols = zomato_df_ind.shape

print("There are {} rows and {} columns.".format(nr_rows, nr_cols))

There are 600 rows and 13 columns.


In [33]:
# Sanity checks for exercise 3.7.
assert nr_rows == 600
assert nr_cols == 13

#### 3.8) Get the array of column names

In [34]:
# Get the column names of zomato_df_ind.
# The `.columns` attribute holds them as a pandas Index.
zomato_df_columns = zomato_df_ind.columns

In [35]:
# Print the column names.
print(zomato_df_columns)

Index(['name', 'locality', 'city', 'cuisines', 'establishment',
       'average_cost_for_two', 'all_reviews_count', 'photo_count',
       'has_table_booking', 'has_online_delivery', 'aggregate_rating', 'votes',
       'timings'],
      dtype='object')


The expected output is an array with 'name', 'locality', 'city', 'cuisines', 'establishment','average_cost_for_two', 'all_reviews_count', 'photo_count', 'has_table_booking', 'has_online_delivery', 'aggregate_rating', 'votes','timings'.

In [36]:
# Sanity checks for exercise 3.8: second column name and total number of columns.
assert zomato_df_columns[1] == 'locality'
assert len(zomato_df_columns) == 13

#### 3.9) Using describe(), what is the standard deviation of the votes?

In [37]:
# Show summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns of zomato_df_ind.
zomato_df_ind.describe()

Unnamed: 0,average_cost_for_two,all_reviews_count,photo_count,has_table_booking,has_online_delivery,aggregate_rating,votes
count,600.0,600.0,600.0,600.0,600.0,600.0,600.0
mean,2327.295,170.311667,450.48,0.118333,0.12,3.910667,367.405
std,7716.362878,258.212511,621.216338,0.323272,0.325233,1.263836,594.930687
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,5.0,12.75,0.0,0.0,3.9,12.75
50%,40.0,78.0,223.0,0.0,0.0,4.3,143.5
75%,100.0,241.25,616.75,0.0,0.0,4.5,461.75
max,50000.0,2640.0,4464.0,1.0,1.0,4.9,5703.0


In [38]:
# Manually input the standard deviation of the votes,
# read from the "std" row of the "votes" column in the describe() output.
# Yep, that simple. No tricks here :)
votes_std = 594.930687

In [39]:
# Print the value that was entered.
print("Standard deviation value of the votes is {}.".format(votes_std))

Standard deviation value of the votes is 594.930687.


In [40]:
# A range check rather than an exact comparison, so the number of decimals
# typed in for the manually entered value does not matter.
assert 594 < votes_std < 595