### OpenWeather api to collect weather data
Now I will collect weather forecast data for the selected cities from the OpenWeather api ([API Documentation](https://openweathermap.org/forecast5)). The api allows us a free access to **5 Day / 3 Hour Forecast**: a 5 day forecast data with 3-hour step. 


Relevant for the Gans use case, I will collect information on the following:
- *'Temp'* - Average temperature
- *'Feels_like'* - Avg. temp felt. 
- *'Sunrise'* - Datetime for the sunrise 
- *'Sunset'* - Datetime for the sunset 
- *'Weather'* - Overall description of the weather (Rain, Snow, Clouds etc.)
- *'Snow'* - Avg. snow
- *'Rain'* - Avg. rain
- *'Wind'* -  Avg. wind speed

In [1]:
import pandas as pd
import requests
import sqlalchemy
from datetime import datetime
from pytz import timezone

# Get the api keys
from keys import MySQL_bootcamp, OW_API_Key

#### Read cleaned webscraped city_df

In [2]:
df_sql_city_id = pd.read_csv("data/df_sql_city_id.csv")
df_sql_city_id

Unnamed: 0,City,country_2c,latitude,longitude,is_capital,Country,Elevation (in m),Population,city_id,Elevation
0,Berlin,DE,52.5167,13.3833,True,Germany,34.0,3576873,1,34
1,Hamburg,DE,53.55,10.0,False,Germany,23.0,1945532,2,23
2,Munich,DE,48.1372,11.5755,False,Germany,520.0,1512491,3,520
3,Cologne,DE,50.9422,6.9578,False,Germany,37.0,1073096,4,37
4,Paris,FR,48.8566,2.3522,True,France,35.0,2102650,5,35
5,Nice,FR,43.7034,7.2663,False,France,10.0,348085,6,10
6,Rome,IT,41.8931,12.4828,True,Italy,21.0,2860009,7,21
7,Milan,IT,45.4669,9.19,False,Italy,120.0,1371498,8,120
8,Warsaw,PL,52.2167,21.0333,True,Poland,100.0,1863056,9,100
9,Barcelona,ES,41.3825,2.1769,False,Spain,12.0,1620343,10,12


#### Open weather map data collection

In [3]:
API_key = OW_API_Key 
city_weather = df_sql_city_id["City"].unique()
cities_weather_json = []

berlin_timezone = timezone('Europe/Berlin')

def cities_dataframe(cities):
    cities_data = []
    for i in df_sql_city_id["City"]:
        cities_weather = requests.get(f"http://api.openweathermap.org/data/2.5/forecast?q={i}&appid={API_key}&units=metric")
        cities_weather_json = cities_weather.json()
        # As we are now using the data from our relational database
        # the city should reflect the city_id and not the city name
        city_id = df_sql_city_id.loc[df_sql_city_id["City"] == i, "city_id"].values[0]
        retrieval_time = datetime.now(berlin_timezone).strftime("%Y-%m-%d %H:%M:%S")
        
        for entry in cities_weather_json["list"]:
            city_data_df = {}
            city_data_df["city_id"]= city_id
            # city_data_df["City"] = cities_weather_json["city"]["name"]
            city_data_df["Datetime"] = entry['dt_txt'] if 'dt_txt' in entry else 0 # Check if 'rain' key exists in the entry, set to 0 if not present
            city_data_df["Temp"] = entry["main"]["temp"]
            city_data_df["Feels_like"] = entry["main"]["feels_like"]
            # city_data_df["Coordinates"] = cities_weather_json["city"]["coord"]
            city_data_df["Sunrise"] = pd.to_datetime(cities_weather_json["city"]["sunrise"], unit='s')
            city_data_df["Sunset"] = pd.to_datetime(cities_weather_json["city"]["sunset"], unit='s')
            # city_data_df["Population"] = cities_weather_json["city"]["population"]
            city_data_df["Weather"] = entry['weather'][0]['description'] 
            city_data_df["Snow"] = entry['snow']['3h'] if 'snow' in entry and '3h' in entry['snow'] else 0 # Check if 'rain' key exists in the entry, set to 0 if not present
            city_data_df["Rain"] = entry['rain']['3h'] if 'rain' in entry and '3h' in entry['rain'] else 0 # Check if 'rain' key exists in the entry, set to 0 if not present            
            city_data_df["Wind"] = entry['wind']['speed'] if 'wind' in entry and 'speed' in entry['wind'] else 0 # Check if 'rain' key exists in the entry, set to 0 if not present
            city_data_df["data_retrieved_at"]= retrieval_time
            cities_data.append(city_data_df.copy())  # Make sure to copy the dictionary to avoid overwriting
                 
    return pd.DataFrame(cities_data)

cities_weather_df = cities_dataframe(city_weather).sort_values(by=["city_id", "Datetime"], ascending=[True, True])
cities_weather_df.head()

Unnamed: 0,city_id,Datetime,Temp,Feels_like,Sunrise,Sunset,Weather,Snow,Rain,Wind,data_retrieved_at
0,1,2024-02-15 15:00:00,12.33,11.97,2024-02-15 06:24:13,2024-02-15 16:17:04,light rain,0,0.15,2.71,2024-02-15 13:58:16
1,1,2024-02-15 18:00:00,11.39,11.01,2024-02-15 06:24:13,2024-02-15 16:17:04,overcast clouds,0,0.0,2.81,2024-02-15 13:58:16
2,1,2024-02-15 21:00:00,10.29,9.83,2024-02-15 06:24:13,2024-02-15 16:17:04,overcast clouds,0,0.0,2.49,2024-02-15 13:58:16
3,1,2024-02-16 00:00:00,9.38,7.99,2024-02-15 06:24:13,2024-02-15 16:17:04,overcast clouds,0,0.0,2.65,2024-02-15 13:58:16
4,1,2024-02-16 03:00:00,8.74,7.01,2024-02-15 06:24:13,2024-02-15 16:17:04,overcast clouds,0,0.0,2.98,2024-02-15 13:58:16


In [4]:
cities_weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   city_id            800 non-null    int64         
 1   Datetime           800 non-null    object        
 2   Temp               800 non-null    float64       
 3   Feels_like         800 non-null    float64       
 4   Sunrise            800 non-null    datetime64[ns]
 5   Sunset             800 non-null    datetime64[ns]
 6   Weather            800 non-null    object        
 7   Snow               800 non-null    int64         
 8   Rain               800 non-null    float64       
 9   Wind               800 non-null    float64       
 10  data_retrieved_at  800 non-null    object        
dtypes: datetime64[ns](2), float64(4), int64(2), object(3)
memory usage: 68.9+ KB


#### Merge with city_df

In [5]:
df_city_id_weather = cities_weather_df.merge(df_sql_city_id, how="outer", on="city_id")

#### Save df as a csv

In [6]:
df_city_id_weather.to_csv("data/df_city_id_weather.csv", sep=',', index=False, encoding='utf-8')

#### Create a weather table in SQL DB

In [7]:
# Create a connecting link
schema = "gans_cities"
host = "127.0.0.1"
user = "root"
password = MySQL_bootcamp
port = 3306

connection_string = f'mysql+pymysql://{user}:{password}@{host}:{port}/{schema}'

In [8]:
Weather_df = df_city_id_weather[['city_id', 'Datetime', 'Temp', 'Feels_like', 'Sunrise', 'Sunset', 'Weather', 'Snow', 'Rain', 'Wind']]
Weather_df.to_sql('weather',
                  if_exists='append',
                  con=connection_string,
                  index=False)

800