# Append climate zone data

In [1]:
import pandas as pd
import geopandas as gp
import os

## Obtain Koppen climate zone

In [2]:
# Load the per-city table; info() lists its columns, dtypes and non-null counts.
cities = pd.read_csv(filepath_or_buffer='./sample_data/cities.csv')
cities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   city          2 non-null      object 
 1   city_ascii    2 non-null      object 
 2   city_lat      2 non-null      float64
 3   city_lon      2 non-null      float64
 4   country       2 non-null      object 
 5   iso2          2 non-null      object 
 6   iso3          2 non-null      object 
 7   admin_name    2 non-null      object 
 8   capital       2 non-null      object 
 9   population    2 non-null      float64
 10  city_id       2 non-null      int64  
 11  img_count     2 non-null      int64  
 12  timezone      2 non-null      object 
 13  utc_offset_s  2 non-null      float64
dtypes: float64(4), int64(2), object(8)
memory usage: 352.0+ bytes


In [3]:
import requests
from tqdm import tqdm

# Seconds to wait for the climate API before giving up on a single city.
REQUEST_TIMEOUT_S = 30

# Column-oriented accumulator: each list receives one entry per city.
d = {
    'city_id': [],
    'koppen_geiger_zone': [],
    'zone_description': []
}

# Query the climate API once per city, using the city's latitude/longitude.
for index, row in tqdm(cities.iterrows(), total=cities.shape[0]):
    city_id = row['city_id']
    lat = row['city_lat']
    lon = row['city_lon']
    url = f"http://climateapi.scottpinkelman.com/api/v1/location/{lat}/{lon}"
    try:
        # Without a timeout a stalled connection would hang the cell indefinitely.
        response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
        # Turn HTTP error statuses into exceptions instead of parsing an error page.
        response.raise_for_status()
        values = response.json()['return_values'][0]
        zone = values['koppen_geiger_zone']
        desc = values['zone_description']
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as exc:
        # A failed or malformed lookup must not abort the whole run: record the
        # city with null values so it is listed by the null check on
        # 'koppen_geiger_zone' and can be filled in manually.
        tqdm.write(f"city_id {city_id}: climate lookup failed ({exc!r})")
        zone, desc = None, None
    d['city_id'].append(city_id)
    d['koppen_geiger_zone'].append(zone)
    d['zone_description'].append(desc)

100%|████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.29it/s]


In [4]:
# Turn the dict of equal-length lists into a frame: one column per key.
temp = pd.DataFrame(d)
temp

Unnamed: 0,city_id,koppen_geiger_zone,zone_description
0,1702341327,Af,Tropical rainforest
1,1276171358,Cfb,"Marine west coast, warm summer"


In [5]:
# Attach the fetched climate zone to each city; the left join keeps every city
# even when no zone was found for it.
#
# './sample_data/cities.csv' is overwritten with the merged table further down,
# so on a re-run `cities` may already contain the two climate columns. Drop them
# first: otherwise merge() would rename the clashing columns with _x/_y suffixes
# and the later 'koppen_geiger_zone' / 'zone_description' lookups would raise KeyError.
climate_cols = ['koppen_geiger_zone', 'zone_description']
cities_zones = (
    cities
    .drop(columns=climate_cols, errors='ignore')
    .merge(temp, on='city_id', how='left')
)
cities_zones.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2 entries, 0 to 1
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   city                2 non-null      object 
 1   city_ascii          2 non-null      object 
 2   city_lat            2 non-null      float64
 3   city_lon            2 non-null      float64
 4   country             2 non-null      object 
 5   iso2                2 non-null      object 
 6   iso3                2 non-null      object 
 7   admin_name          2 non-null      object 
 8   capital             2 non-null      object 
 9   population          2 non-null      float64
 10  city_id             2 non-null      int64  
 11  img_count           2 non-null      int64  
 12  timezone            2 non-null      object 
 13  utc_offset_s        2 non-null      float64
 14  koppen_geiger_zone  2 non-null      object 
 15  zone_description    2 non-null      object 
dtypes: float64(4

In [6]:
# Show the cities that ended up without a climate zone (empty result = nothing missing).
cities_zones.loc[lambda frame: frame['koppen_geiger_zone'].isna()]

Unnamed: 0,city,city_ascii,city_lat,city_lon,country,iso2,iso3,admin_name,capital,population,city_id,img_count,timezone,utc_offset_s,koppen_geiger_zone,zone_description


In [None]:
# If the check above lists any cities with a null climate zone, look up the Köppen climate zone for those cities manually,
# fill in the 'missing' dictionary below, then uncomment and run the code in this cell to fill in the gaps.
# missing = {
#     # city id: {'koppen_geiger_zone': insert zone code, 'zone_description': insert zone description}
#     # for example,
#     # 1296152641: {'koppen_geiger_zone': 'Af', 
#     #             'zone_description': 'Tropical rainforest'}
# }

# def fill_missing_code(row):
#     if pd.isna(row['koppen_geiger_zone']):
#         city_id = row['city_id']
#         return missing[city_id]['koppen_geiger_zone']
#     else:
#         return row['koppen_geiger_zone']

# def fill_missing_desc(row):
#     if pd.isna(row['zone_description']):
#         city_id = row['city_id']
#         return missing[city_id]['zone_description']
#     else:
#         return row['zone_description']

# cities_zones['koppen_geiger_zone'] = cities_zones.apply(lambda row: fill_missing_code(row), axis=1)
# cities_zones['zone_description'] = cities_zones.apply(lambda row: fill_missing_desc(row), axis=1)

In [10]:
# Tally how many cities fall under each zone description.
cities_zones['zone_description'].value_counts()

Tropical rainforest               1
Marine west coast, warm summer    1
Name: zone_description, dtype: int64

In [8]:
def correct_typo(row):
    """Return the row's zone description with the 'Subartic' misspelling fixed.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a 'zone_description' entry.

    Returns
    -------
    The corrected description when the value is a string containing
    'Subartic'; otherwise the value unchanged. Non-string values (NaN / None,
    i.e. a city without a climate zone) are passed through as-is, because a
    substring test on them would raise TypeError.
    """
    zone = row['zone_description']
    if isinstance(zone, str) and 'Subartic' in zone:
        # NOTE(review): every description containing the misspelling is replaced
        # by this one fixed text -- confirm the API has only one such description.
        return 'Subarctic, severe winter, no dry season, cool summer'
    return zone

# Run the typo fix over every row (axis=1) and store the result back in the column.
cities_zones['zone_description'] = cities_zones.apply(correct_typo, axis=1)

In [9]:
# Tally the zone descriptions again to inspect the values after the typo fix.
cities_zones['zone_description'].value_counts()

Tropical rainforest               1
Marine west coast, warm summer    1
Name: zone_description, dtype: int64

In [11]:
# Persist the cities table with its climate columns, without the index.
# NOTE: this writes to the same path the cities table was originally read from.
cities_zones.to_csv(path_or_buf='./sample_data/cities.csv', index=False)

## Join data

In [12]:
# Load the table the climate columns will be joined onto.
sm = pd.read_csv(filepath_or_buffer='./sample_data/01_simplemaps.csv')

In [14]:
# Bring over only the join key and the two climate columns, then left-join on
# city_id so every row of `sm` is kept.
cols = [
    'city_id',
    'koppen_geiger_zone',
    'zone_description',
]
climate = pd.merge(sm, cities_zones[cols], on='city_id', how='left')
climate.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2319 entries, 0 to 2318
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   uuid                2319 non-null   object 
 1   source              2319 non-null   object 
 2   orig_id             2319 non-null   int64  
 3   city                2319 non-null   object 
 4   city_ascii          2319 non-null   object 
 5   city_id             2319 non-null   int64  
 6   city_lat            2319 non-null   float64
 7   city_lon            2319 non-null   float64
 8   country             2319 non-null   object 
 9   iso2                2319 non-null   object 
 10  iso3                2319 non-null   object 
 11  admin_name          2319 non-null   object 
 12  capital             2319 non-null   object 
 13  population          2319 non-null   float64
 14  continent           2319 non-null   object 
 15  koppen_geiger_zone  2319 non-null   object 
 16  zone_d

In [17]:
# Keep just the record identifiers plus the climate columns for the output file.
cols = [
    'uuid',
    'source',
    'orig_id',
    'koppen_geiger_zone',
    'zone_description',
]
climate = climate.loc[:, cols]
climate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2319 entries, 0 to 2318
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   uuid                2319 non-null   object
 1   source              2319 non-null   object
 2   orig_id             2319 non-null   int64 
 3   koppen_geiger_zone  2319 non-null   object
 4   zone_description    2319 non-null   object
dtypes: int64(1), object(4)
memory usage: 108.7+ KB


In [18]:
# Write the climate table to disk, leaving the DataFrame index out of the file.
climate.to_csv(path_or_buf='./sample_data/03_climate.csv', index=False)