### Import dependencies and API keys

In [1]:
import pandas as pd
from sqlalchemy import create_engine, inspect 
import numpy as np
import requests
import json

# Google API Key
from config import gkey, weatherkey

# Import .csv Data

### First Store .csv data from worldcities and World Happiness Report data into DataFrames

In [41]:
# World Happiness Report data:
happiness_file = "Resources/WorldHappiness_2019.csv"
raw_happiness_data_df = pd.read_csv(happiness_file)
raw_happiness_data_df.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


In [42]:
# World Cities data, has 42905 cities:
cities_file = "Resources/worldcities_2022.csv"
raw_cities_data_df = pd.read_csv(cities_file)
# len(cities_data_df)
raw_cities_data_df.head()

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6839,139.7744,Japan,JP,JPN,Tōkyō,primary,39105000.0,1392685764
1,Jakarta,Jakarta,-6.2146,106.8451,Indonesia,ID,IDN,Jakarta,primary,35362000.0,1360771077
2,Delhi,Delhi,28.6667,77.2167,India,IN,IND,Delhi,admin,31870000.0,1356872604
3,Manila,Manila,14.6,120.9833,Philippines,PH,PHL,Manila,primary,23971000.0,1608618140
4,São Paulo,Sao Paulo,-23.5504,-46.6339,Brazil,BR,BRA,São Paulo,admin,22495000.0,1076532519


# Clean .csv Data

### Create new dataframes with selected columns from raw data

In [None]:
# Create a new df with cols we need from raw_happiness_data_df:
# list(happiness_data_df.columns)
happiness_data_df = raw_happiness_data_df[['Overall rank','Country or region','Score', 'Generosity', 'Perceptions of corruption']].copy()
happiness_data_df.head()

# Set "city_ID" as index for the primary key later in the database for each table with city data:
happiness_df = happiness_data_df.rename_axis('country_ID').reset_index()
# Clean up column names:
happiness_data_df = happiness_data_df.rename(columns={'Overall rank': 'overall_rank', 'Country or region': 'country', 'Score':'score','Generosity':'generosity','Perceptions of corruption':'perceptions_of_corruption'})

happiness_data_df.head()

In [6]:
# Create a new df with cols we need from raw_cities_data_df but has all the rows (42905 cities):
# list(raw_cities_data_df.columns)
big_cities_data_df = raw_cities_data_df[['city', 'lat', 'lng', 'country', 'population']].copy()
big_cities_data_df.head()

Unnamed: 0,city,lat,lng,country,population
0,Tokyo,35.6839,139.7744,Japan,39105000.0
1,Jakarta,-6.2146,106.8451,Indonesia,35362000.0
2,Delhi,28.6667,77.2167,India,31870000.0
3,Manila,14.6,120.9833,Philippines,23971000.0
4,São Paulo,-23.5504,-46.6339,Brazil,22495000.0


### Filter data:

In [28]:
# Filter big_cities_data_df to only cities that are in the countries from the happiness_data_df
# First, filter happiness_data_df to get the top 40 countries by rank for that data:
top40_happiness_df = happiness_data_df.loc[(happiness_data_df["overall_rank"] <= 40)]
# This will be one table in database:
top40_happiness_df
# Next, get a list of countries from top40_happiness_df:
top40_happ_cntry_list = list(top40_happiness_df["country"])

# Then, filter big_cities_data_df by the col of "country" using the happiness_cntry_list inside isin():
cities_data_df = big_cities_data_df[big_cities_data_df["country"].isin(top40_happ_cntry_list)]
cities_data_df

Unnamed: 0,city,lat,lng,country,population
4,São Paulo,-23.5504,-46.6339,Brazil,22495000.0
8,Mexico City,19.4333,-99.1333,Mexico,21505000.0
12,New York,40.6943,-73.9249,United States,18713220.0
27,Los Angeles,34.1139,-118.4068,United States,12750807.0
28,Rio de Janeiro,-22.9083,-43.1964,Brazil,12486000.0
...,...,...,...,...,...
42832,Villa O’Higgins,-48.4669,-72.5930,Chile,250.0
42835,Al Qunfudhah,19.1264,41.0789,Saudi Arabia,157.0
42850,Cuya,-19.1597,-70.1794,Chile,20.0
42854,Chuquicamata,-22.3169,-68.9301,Chile,0.0


In [29]:
# Next, sort cities_data_df by country and population
top_cities_data_df = cities_data_df.sort_values(by=['country', 'population'], ascending=[True, False])
# Finally, groupby country and take the top 50 cities based on population
top_cities_data_df = top_cities_data_df.groupby(by=['country'], as_index=False, sort=False).head(50).reset_index(drop=True)
top_cities_data_df

Unnamed: 0,city,lat,lng,country,population
0,Sydney,-33.8650,151.2094,Australia,4840600.0
1,Melbourne,-37.8136,144.9631,Australia,4529500.0
2,Brisbane,-27.4678,153.0281,Australia,2360241.0
3,Perth,-31.9522,115.8589,Australia,2039200.0
4,Adelaide,-34.9275,138.6000,Australia,1295714.0
...,...,...,...,...,...
1518,Joaquín Suárez,-34.7336,-56.0367,Uruguay,6570.0
1519,Sauce,-34.6469,-56.0628,Uruguay,6132.0
1520,Sarandí Grande,-33.7250,-56.3303,Uruguay,6130.0
1521,Atlántida,-34.7701,-55.7613,Uruguay,5562.0


In [31]:
# Set "city_ID" as index for the primary key later in the database for each table with city data:
top_cities_data_df = top_cities_data_df.rename_axis('city_ID').reset_index()
top_cities_data_df

Unnamed: 0,city_ID,city,lat,lng,country,population
0,0,Sydney,-33.8650,151.2094,Australia,4840600.0
1,1,Melbourne,-37.8136,144.9631,Australia,4529500.0
2,2,Brisbane,-27.4678,153.0281,Australia,2360241.0
3,3,Perth,-31.9522,115.8589,Australia,2039200.0
4,4,Adelaide,-34.9275,138.6000,Australia,1295714.0
...,...,...,...,...,...,...
1518,1518,Joaquín Suárez,-34.7336,-56.0367,Uruguay,6570.0
1519,1519,Sauce,-34.6469,-56.0628,Uruguay,6132.0
1520,1520,Sarandí Grande,-33.7250,-56.3303,Uruguay,6130.0
1521,1521,Atlántida,-34.7701,-55.7613,Uruguay,5562.0


In [35]:
# This will be one table in database:
# top40_happiness_df

# This will be another table in database:
# top_cities_data_df

# Need to do API call to get weather and places data and those will each be an additional table, respectively.
# So we will have 4 tables (or more if we get more data) in our database. 

## Google Places API Call

### Airport API Call:

In [50]:
# First call to get the airport data:
# Airports- names, address and rating within radius of 50,000 m (about 31 miles):
# Set up a dataframe to save the API info into, this will include city_ID, city,lat,lng, country from the cities data:
airport_df = top_cities_data_df[['city_ID','city', 'lat', 'lng', 'country']].copy()

# Add the cols that we want to get from the API call:
airport_df["airport_name"] = ""
airport_df["airport_address"] = ""
airport_df["airport_rating"] = ""
# check df:
airport_df

Unnamed: 0,city_ID,city,lat,lng,country,airport_name,airport_address,airport_rating
0,0,Sydney,-33.8650,151.2094,Australia,,,
1,1,Melbourne,-37.8136,144.9631,Australia,,,
2,2,Brisbane,-27.4678,153.0281,Australia,,,
3,3,Perth,-31.9522,115.8589,Australia,,,
4,4,Adelaide,-34.9275,138.6000,Australia,,,
...,...,...,...,...,...,...,...,...
1518,1518,Joaquín Suárez,-34.7336,-56.0367,Uruguay,,,
1519,1519,Sauce,-34.6469,-56.0628,Uruguay,,,
1520,1520,Sarandí Grande,-33.7250,-56.3303,Uruguay,,,
1521,1521,Atlántida,-34.7701,-55.7613,Uruguay,,,


In [56]:
# Google Places API Call
# params dictionary to update each iteration 
params = {
    "radius": 50000,
    "types": "airport",
    "keyword": "international airport",
    "key": gkey
}

# Use the lat/lng we recovered to identify airports
for index, row in second_airport_df.iterrows():
    # get lat, lng from df
    lat = row["lat"]
    lng = row["lng"]

    # change location each iteration while leaving original params in place
    params["location"] = f"{lat},{lng}"

    # Use the search term: "International Airport" and our lat/lng
    base_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

    # make request and print url
    name_address = requests.get(base_url, params=params)
    
#     print the name_address url, avoid doing for public github repos in order to avoid exposing key
#     print(name_address.url)

    # convert to json
    name_address = name_address.json()
    # print(json.dumps(name_address, indent=4, sort_keys=True))

    # Since some data may be missing we incorporate a try-except to skip any that are missing a data point.
    try:
        second_airport_df.loc[index, "airport_name"] = name_address["results"][0]["name"]
        second_airport_df.loc[index, "airport_address"] = name_address["results"][0]["vicinity"]
        second_airport_df.loc[index, "airport_rating"] = name_address["results"][0]["rating"]
    except (KeyError, IndexError):
        print("Missing field/result... skipping.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/

In [5]:
# Save airport data to csv after API call:
airport_df.to_csv("Resources/airport_cities.csv", index=False)

# Visualize to confirm airport data appears
# This has 139 missing values- we should ask if we should remove them now or if that would be after it is in the database?
airport_df

Unnamed: 0,city_ID,city,lat,lng,country,airport_name,airport_address,airport_rating
0,0,Sydney,-33.8650,151.2094,Australia,Sydney Airport,Sydney,3.7
1,1,Melbourne,-37.8136,144.9631,Australia,,,
2,2,Brisbane,-27.4678,153.0281,Australia,Brisbane Airport,"Airport Dr, Brisbane Airport",4.0
3,3,Perth,-31.9522,115.8589,Australia,Perth Airport,Perth Airport,3.6
4,4,Adelaide,-34.9275,138.6000,Australia,Adelaide Airport,"Adelaide Airport Terminal, Sir Richard William...",4.2
...,...,...,...,...,...,...,...,...
1518,1518,Joaquín Suárez,-34.7336,-56.0367,Uruguay,Carrasco International Airport,"5X7M+5G8, Capitán Juan Antonio Artigas, Ciudad...",4.6
1519,1519,Sauce,-34.6469,-56.0628,Uruguay,Carrasco International Airport,"5X7M+5G8, Capitán Juan Antonio Artigas, Ciudad...",4.6
1520,1520,Sarandí Grande,-33.7250,-56.3303,Uruguay,Centro de Aviacion Civil de Florida,"WR67+P3J, Florida",4.5
1521,1521,Atlántida,-34.7701,-55.7613,Uruguay,Carrasco International Airport,"5X7M+5G8, Capitán Juan Antonio Artigas, Ciudad...",4.6


### Beaches API call:

In [70]:
# First call to get the airport data:
# Airports- names, address and rating within radius of 50,000 m (about 31 miles):
# Set up a dataframe to save the API info into, this will include city_ID, city,lat,lng, country from the cities data:
beaches_df = top_cities_data_df[['city_ID','city', 'lat', 'lng', 'country']].copy()

# Add the cols that we want to get from the API call:
beaches_df["beach_name"] = ""
beaches_df["beach_address"] = ""
beaches_df["beach_rating"] = ""
beaches_df

beaches_df.head()

Unnamed: 0,city_ID,city,lat,lng,country,beach_name,beach_address,beach_rating
0,0,Sydney,-33.865,151.2094,Australia,,,
1,1,Melbourne,-37.8136,144.9631,Australia,,,
2,2,Brisbane,-27.4678,153.0281,Australia,,,
3,3,Perth,-31.9522,115.8589,Australia,,,
4,4,Adelaide,-34.9275,138.6,Australia,,,


In [71]:
# Google Places API Call for beaches data:
# params dictionary to update each iteration 
params = {
    "radius": 50000,
    "types": "tourist_attraction",
    "keyword": "beach",
    "key": gkey
}

# Use the lat/lng we recovered to identify airports
for index, row in beaches_df.iterrows():
    # get lat, lng from df
    lat = row["lat"]
    lng = row["lng"]

    # change location each iteration while leaving original params in place
    params["location"] = f"{lat},{lng}"

    # Use the search term: "International Airport" and our lat/lng
    base_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

    # make request and print url
    name_address = requests.get(base_url, params=params)
    
#     print the name_address url, avoid doing for public github repos in order to avoid exposing key
#     print(name_address.url)

    # convert to json
    name_address = name_address.json()
    # print(json.dumps(name_address, indent=4, sort_keys=True))

    # Since some data may be missing we incorporate a try-except to skip any that are missing a data point.
    try:
        beaches_df.loc[index, "beach_name"] = name_address["results"][0]["name"]
        beaches_df.loc[index, "beach_address"] = name_address["results"][0]["vicinity"]
        beaches_df.loc[index, "beach_rating"] = name_address["results"][0]["rating"]
    except (KeyError, IndexError):
        print("Missing field/result... skipping.")

Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/

In [11]:
# Save airport data to csv after API call:
beaches_df.to_csv("Resources/beaches_cities.csv", index=False)

# Visualize to confirm airport data appears
beaches_df.head(10)

Unnamed: 0,city_ID,city,lat,lng,country,beach_name,beach_address,beach_rating
0,0,Sydney,-33.865,151.2094,Australia,Barrenjoey Lighthouse,"1199D Barrenjoey Rd, Palm Beach",4.7
1,1,Melbourne,-37.8136,144.9631,Australia,Brighton Bathing Boxes,"Esplanade, Brighton",4.4
2,2,Brisbane,-27.4678,153.0281,Australia,Streets Beach,"Stanley St Plaza, South Brisbane",4.7
3,3,Perth,-31.9522,115.8589,Australia,Parsa the Beach,Scarborough,0.0
4,4,Adelaide,-34.9275,138.6,Australia,Moana Beach,"404 Esplanade, Moana",4.7
5,5,Gold Coast,-28.0167,153.4,Australia,Snapper Rocks,"Snapper Rocks Rd, Coolangatta",4.8
6,6,Cranbourne,-38.0996,145.2834,Australia,Koonya Ocean Beach,"93 Hughes Rd, Blairgowrie",4.6
7,7,Canberra,-35.2931,149.1269,Australia,Casuarina Sands Reserve,Stromlo,4.6
8,8,Central Coast,-33.3,151.2,Australia,Barrenjoey Lighthouse,"1199D Barrenjoey Rd, Palm Beach",4.7
9,9,Wollongong,-34.4331,150.8831,Australia,Little Garie Beach,Lilyvale,4.6


## Read in Data After API Calls:

In [7]:
# Read in airport data from csv:
airport_file = "Resources/airport_cities.csv"
airport_df = pd.read_csv(airport_file)
airport_df.head()

Unnamed: 0,city_ID,city,lat,lng,country,airport_name,airport_address,airport_rating
0,0,Sydney,-33.865,151.2094,Australia,Sydney Airport,Sydney,3.7
1,1,Melbourne,-37.8136,144.9631,Australia,,,
2,2,Brisbane,-27.4678,153.0281,Australia,Brisbane Airport,"Airport Dr, Brisbane Airport",4.0
3,3,Perth,-31.9522,115.8589,Australia,Perth Airport,Perth Airport,3.6
4,4,Adelaide,-34.9275,138.6,Australia,Adelaide Airport,"Adelaide Airport Terminal, Sir Richard William...",4.2


In [12]:
# Read in beaches data from csv:
beaches_file = "Resources/beaches_cities.csv"
beaches_df = pd.read_csv(beaches_file)
beaches_df.head()

Unnamed: 0,city_ID,city,lat,lng,country,beach_name,beach_address,beach_rating
0,0,Sydney,-33.865,151.2094,Australia,Barrenjoey Lighthouse,"1199D Barrenjoey Rd, Palm Beach",4.7
1,1,Melbourne,-37.8136,144.9631,Australia,Brighton Bathing Boxes,"Esplanade, Brighton",4.4
2,2,Brisbane,-27.4678,153.0281,Australia,Streets Beach,"Stanley St Plaza, South Brisbane",4.7
3,3,Perth,-31.9522,115.8589,Australia,Parsa the Beach,Scarborough,0.0
4,4,Adelaide,-34.9275,138.6,Australia,Moana Beach,"404 Esplanade, Moana",4.7


### So far 4 tables for the database:
- top40_happiness_df
- top_cities_data_df
- airport_df
- beaches_df

### To DO:
- Need to do API call to get weather data which will be the 5th table, possibly 6 if we want air pollution/UV data?
- So we will have 5 tables (or more if we get more data) in our database. 

## SQL Work: