In [1]:
import requests
import pandas as pd
# from datetime import datetime

In [2]:

# Toronto Open Data is stored in a CKAN instance. Its APIs are documented here:
# https://docs.ckan.org/en/latest/api/

# To hit our API, you'll be making requests to:
base_url = "https://ckan0.cf.opendata.inter.prod-toronto.ca"

# Datasets are called "packages". Each package can contain many "resources".
# To retrieve the metadata for this package and its resources, use the package name in this page's URL:
url = base_url + "/api/3/action/package_show"
params = {"id": "licensed-dog-and-cat-names"}
# requests has no default timeout, so a stalled connection would hang this cell
# forever; the (connect, read) limits in seconds make it fail fast instead.
package = requests.get(url, params=params, timeout=(10, 60)).json()


In [3]:
# Show the "success" field of the package_show response fetched above.
f'Package success: {package["success"]}'

'Package success: True'

In [4]:
# Summarise the package: the reported resource count first, then one entry per
# resource giving its position, its name and its datastore_active flag.
result = package["result"]
print(f'Number of resources: {result["num_resources"]}\n')
for position, entry in enumerate(result["resources"]):
    print(f'{position}: {entry["name"]}\n**Active: {entry["datastore_active"]}\n')

Number of resources: 22

0: licensed-names-for-dogs-2012
**Active: False

1: licensed-names-for-cats-2012
**Active: False

2: licensed-names-for-dogs-2013
**Active: False

3: licensed-names-for-cats-2013
**Active: False

4: licensed-names-for-dogs-2014
**Active: False

5: licensed-names-for-cats-2014
**Active: False

6: licensed-names-for-dogs-2015
**Active: False

7: licensed-names-for-cats-2015
**Active: False

8: licensed-names-for-dogs-2016
**Active: False

9: licensed-names-for-cats-2016
**Active: False

10: licensed-names-for-dogs-2017
**Active: False

11: licensed-names-for-cats-2017
**Active: False

12: licensed-names-for-dogs-2018
**Active: False

13: licensed-names-for-cats-2018
**Active: False

14: licensed-names-for-dogs-2019
**Active: False

15: licensed-names-for-cats-2019
**Active: False

16: licensed-names-for-dog-2020	
**Active: False

17: licensed-names-for-cats-2020	
**Active: False

18: licensed-names-for-dogs-2021
**Active: False

19: licensed-names-for-cats-2021
*

In [5]:
# Build one long table of licensed pet names with the columns
# name / count / year / species, one row per name per resource file.

# Values of the name column that all mean "no name was recorded";
# they are normalised to 'NO NAME' while reading.
no_name_values = ['', 'N/A', 'NO NAME LISTED']

# Collect one frame per resource and concatenate once after the loop.
# Appending to a DataFrame inside the loop re-copies every accumulated row on
# each iteration, and DataFrame._append is a private, unsupported method.
frames = []

for resource in package["result"]["resources"]:
    # Only resources that are not in the datastore are read here, from
    # their download URL.
    if resource["datastore_active"]:
        continue

    print('**getting data from url**')
    print(f'\t{resource["url"]}')

    # The sheet is read without a header row; blank names become 'NO NAME'
    # and blank counts become 0 so the row filter below can remove them.
    tmp_df = pd.read_excel(
        resource["url"],
        header=None,
        names=['name', 'count'],
        converters={
            'name': lambda x: 'NO NAME' if x in no_name_values else x,
            'count': lambda x: 0 if x == '' else x
        }
    )

    # Resource names look like "licensed-names-for-dogs-2012"; a few carry
    # stray trailing whitespace (e.g. a tab), so trim before splitting.
    title = resource['name'].strip().split('-')

    # Keep only rows with a positive count, then tag them with year and species.
    # .assign() returns a new frame, so no columns are set on a slice of the
    # original (which would trigger SettingWithCopyWarning).
    # rstrip('s') only removes the plural ending ("dogs" -> "dog") and leaves
    # a leading "s" in the species name alone.
    tmp_df = tmp_df.loc[tmp_df['count'] > 0].assign(
        year=title[-1],
        species=title[-2].rstrip('s'),
    )

    print(tmp_df.head())
    frames.append(tmp_df)

# pd.concat raises on an empty list, so fall back to an empty frame that still
# has every column the later cells expect.
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['name', 'count', 'year', 'species'])

# Tip: if the columns do not display completely, run
# pd.set_option('display.max_columns', None)

df.head()

**getting data from url**
	https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/37b2915c-f291-4b0e-bdd4-d6ccb070119a/resource/d57249a8-a7df-4974-9bdb-08c58a3b1b18/download/licensed-names-for-dogs-2012.xls
      name  count  year species
1  CHARLIE    674  2012     dog
2      MAX    660  2012     dog
3    BUDDY    534  2012     dog
4    MOLLY    506  2012     dog
5    BELLA    489  2012     dog
**getting data from url**
	https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/37b2915c-f291-4b0e-bdd4-d6ccb070119a/resource/04b1765c-c509-473b-8fa9-9642db9d7fec/download/licensed-names-for-cats-2012.xls
      name  count  year species
1  NO NAME    658  2012     cat
2      MAX    198  2012     cat
3    TIGER    183  2012     cat
4  CHARLIE    181  2012     cat
5   SMOKEY    178  2012     cat
**getting data from url**
	https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/37b2915c-f291-4b0e-bdd4-d6ccb070119a/resource/158c165b-6a58-4090-84e0-b802c9206c1e/download/licensed-names-for-dogs-2

Unnamed: 0,name,count,year,species
0,CHARLIE,674,2012,dog
1,MAX,660,2012,dog
2,BUDDY,534,2012,dog
3,MOLLY,506,2012,dog
4,BELLA,489,2012,dog


In [6]:
# Cast counts to nullable integers, total them per (year, species, name), and
# order the rows: years ascending, species in reverse alphabetical order
# (dog before cat), highest counts first. The row index is renumbered.
df = (
    df
    .astype({'count': 'Int64'})
    .groupby(['year', 'species', 'name'], as_index=False)
    .sum()
    .sort_values(
        by=['year', 'species', 'count'],
        ascending=[True, False, False],
        ignore_index=True,
    )
)
df.head()

Unnamed: 0,year,species,name,count
0,2012,dog,CHARLIE,674
1,2012,dog,MAX,660
2,2012,dog,BUDDY,534
3,2012,dog,MOLLY,506
4,2012,dog,BELLA,489


In [7]:
# Rank names by count within each (year, species) group. ascending=False gives
# the highest count rank 1, and method='min' gives tied names the same
# (lowest) rank.
# The 'count' column is selected explicitly so that the result is a Series and
# the cell can be re-run safely: ranking "everything except name" would also
# rank an already existing 'rank' column and produce a two-column frame that
# cannot be assigned to the single 'rank' column.
df['rank'] = (df
    .groupby(['year', 'species'])['count']
    .rank(method='min', ascending=False)
)
df.head()

Unnamed: 0,year,species,name,count,rank
0,2012,dog,CHARLIE,674,1.0
1,2012,dog,MAX,660,2.0
2,2012,dog,BUDDY,534,3.0
3,2012,dog,MOLLY,506,4.0
4,2012,dog,BELLA,489,5.0


In [8]:
# Summarise the frame: columns, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201615 entries, 0 to 201614
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   year     201615 non-null  object 
 1   species  201615 non-null  object 
 2   name     201615 non-null  object 
 3   count    201615 non-null  Int64  
 4   rank     201615 non-null  Float64
dtypes: Float64(1), Int64(1), object(3)
memory usage: 8.1+ MB


In [9]:
# List the distinct year labels present (taken from the resource names, so they are strings).
df['year'].unique()

array(['2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019',
       '2020', '2021', '2022'], dtype=object)

In [10]:
# List the distinct species labels present.
df['species'].unique()

array(['dog', 'cat'], dtype=object)

In [11]:
# Write the ranked table to disk, replacing any earlier export. The row index
# is omitted; the column header line is written by default.
df.to_csv('Licensed_pets.csv', index=False)