# Get data on new-years 18/19 and 19/20 for several locations

In [34]:
import pandas as pd
import numpy as np

import requests
import pickle
import datetime

import seaborn as sns
import matplotlib.pyplot as plt

## load locations of interest and all boxes

In [35]:
# Load the mapping city -> boxes of interest.
# NOTE: pickle can run arbitrary code while loading; only open trusted files.
with open('data/relevant_censors_cities.pickle', 'rb') as cities_file:
    cities_sensors = pickle.load(cities_file)

In [36]:
# Table of boxes; later cells look up sensor ids in it via the box `_id` column.
boxes = pd.read_pickle("data/boxes_18-19_19-20.pkl")

## get data for relevant boxes

In [37]:
# Request windows for the two New Year's Eves covered here (18-19 and 19-20).
# Each entry is [from, to]: 24 hours from 05:00Z on 31 December to 05:00Z on
# 1 January.
time_frames = {
    "18_19": ["2018-12-31T05:00:00Z", "2019-01-01T05:00:00Z"],
    "19_20": ["2019-12-31T05:00:00Z", "2020-01-01T05:00:00Z"],
}

### functions

In [38]:
def get_box_sensor_data(box_id: str, sensor_id: str, timeframe: list):
    """
    Download the measurements of one sensor from the openSenseMap API.

    Parameters
    ----------
    box_id : id of the box the sensor belongs to
    sensor_id : id of the sensor whose measurements are requested
    timeframe : two-element sequence [from_date, to_date]; both entries are
        inserted verbatim into the `from-date` / `to-date` query parameters

    Returns
    -------
    DataFrame built from the JSON response with the "location" column
    removed, or an empty DataFrame when the request fails or the response
    cannot be turned into such a table (best effort, never raises for those).
    """
    url = ("https://api.opensensemap.org/boxes/"
           + f"{box_id}/data/"
           + sensor_id
           + f"?from-date={timeframe[0]}&to-date={timeframe[1]}"
           + "&download=false&format=json")
    try:
        # a timeout keeps one unresponsive request from stalling the whole run
        payload = requests.get(url, timeout=60).json()
        if not isinstance(payload, list):
            # anything that is not a list of measurements (e.g. an error
            # object) cannot be turned into the expected table
            return pd.DataFrame()
        return pd.DataFrame(payload).drop(columns="location")
    except (requests.RequestException, ValueError, KeyError):
        # RequestException: network / timeout / invalid JSON (newer requests)
        # ValueError: invalid JSON (older requests) or un-tabular payload
        # KeyError: no "location" column, i.e. not the expected record layout
        return pd.DataFrame()
    
    

In [39]:
def get_transformed_box_sensor_data(box_id: str, sensor_id: str, timeframe: list, all_times: pd.DataFrame):
    """
    Fetch one sensor's measurements and align them to a reference time grid.

    Parameters
    ----------
    box_id, sensor_id, timeframe : passed on to `get_box_sensor_data`
    all_times : DataFrame with a "date_time" column holding the reference
        timestamps; the measurements are left-merged onto it

    Returns
    -------
    DataFrame with the rows of `all_times` plus a float "value" column (mean
    of the measurements in the 5-minute bin starting at that timestamp, NaN
    where there is none) and "date_time" shifted forward by one hour, or
    None when no data could be fetched.
    """
    sensor_df = get_box_sensor_data(box_id, sensor_id, timeframe)
    if sensor_df.empty:
        return None

    # Pick the columns by name instead of renaming by position, so the result
    # does not depend on the key order of the JSON records.
    sensor_df = pd.DataFrame({
        "value": sensor_df["value"].astype(float),
        # parse as UTC, then drop the timezone so the timestamps are naive UTC
        "date_time": pd.to_datetime(sensor_df["createdAt"], utc=True).dt.tz_localize(None),
    })

    # assign every measurement to the start of its 5-minute bin and average
    sensor_df["date_time"] = sensor_df["date_time"].dt.floor("5min")
    sensor_df = sensor_df.groupby("date_time", as_index=False)["value"].mean()

    # merge onto the reference grid so that missing times show up as NaN
    # NOTE(review): only bins whose timestamp occurs in `all_times` survive
    # this left merge; with a grid coarser than 5 minutes the bins in between
    # are discarded - confirm the grid resolution is the intended one.
    sensor_df = all_times.merge(right=sensor_df, on="date_time", how="left")

    # since times are in UTC, and Germany local time is UTC+1, add one hour
    sensor_df["date_time"] = sensor_df["date_time"] + datetime.timedelta(hours=1)

    return sensor_df

In [40]:
def get_sensor_id_from_box_id(box_id: str, boxes, phenom="PM10"):
    """
    Return the sensor id stored for a phenomenon (e.g. "PM10" or "PM2.5").

    Selects the rows of the `boxes` DataFrame whose `_id` equals `box_id` and
    returns the entry of the `phenom` column for the first of them.
    """
    is_requested_box = boxes["_id"] == box_id
    sensor_ids = boxes.loc[is_requested_box, phenom]
    return sensor_ids.values[0]

In [41]:
def transform_city_data_in_dfs(city_data_dict: dict, times2018, times2019):
    """
    Combine the per-box data of every city into one DataFrame per season.

    Parameters
    ----------
    city_data_dict : {city: {season: {"regular": [...], "zone": [...]}}} with
        season in ("18_19", "19_20"); every list entry is a dict with a
        "values" DataFrame (columns "date_time" and "value") or None, and,
        for "regular" entries, a "box_id"
    times2018 : DataFrame with a "date_time" column, base frame for "18_19"
    times2019 : DataFrame with a "date_time" column, base frame for "19_20"

    Returns
    -------
    {city: {"18_19": DataFrame, "19_20": DataFrame}}. Each frame starts as a
    copy of the season's base frame; every box with data is merged in on
    "date_time" and its "value" column is renamed to the box id ("regular")
    or to "zone" ("zone"). Entries whose "values" is None are skipped.
    """
    # the base frames come from the parameters, not from notebook globals
    base_frames = {"18_19": times2018, "19_20": times2019}
    # (kind, fixed column name); None means "name the column after the box id"
    kinds = (("regular", None), ("zone", "zone"))

    city_data_dfs = {}

    for city, city_data in city_data_dict.items():
        city_data_dfs[city] = {}

        for season, base in base_frames.items():
            # copy so a city without any data never shares (and later mutates)
            # the caller's base frame
            merged = base.copy()

            for kind, fixed_name in kinds:
                # a missing / empty list simply contributes no columns
                for data in city_data[season][kind] or []:
                    if data["values"] is None:
                        continue
                    column = data["box_id"] if fixed_name is None else fixed_name
                    # default (inner) merge: rows without a matching
                    # date_time in the box data are dropped
                    merged = (merged
                              .merge(right=data["values"], on="date_time")
                              .rename(columns={"value": column}))

            city_data_dfs[city][season] = merged

    return city_data_dfs

In [42]:
def add_agg_2_city_data_dfs(city_data_dfs):
    """
    Return the city frames with a row-wise "mean" column added.

    Parameters
    ----------
    city_data_dfs : {city: {"18_19": DataFrame, "19_20": DataFrame}}; every
        frame has a "date_time" column plus the columns to average

    Returns
    -------
    New nested dict of the same layout. Each frame is a new DataFrame with an
    additional "mean" column holding the mean over all columns except
    "date_time" (NaN values are skipped by DataFrame.mean). The frames passed
    in are left untouched; a plain dict.copy() would be shallow and would
    still write the new column into the caller's frames.
    """
    dfs = {}
    for city, seasons in city_data_dfs.items():
        # keep any additional keys of the inner dict as they are
        dfs[city] = dict(seasons)
        for season in ("18_19", "19_20"):
            frame = seasons[season]
            row_mean = frame.drop(columns="date_time").mean(axis=1)
            dfs[city][season] = frame.assign(mean=row_mean)
    return dfs

### request data for all the PM2.5 sensors

In [43]:
# Reference time grids used to catch missing data: one row every 10 minutes
# for 24 hours (24 * 6 = 144 rows), starting at 05:00 on 31 December.
minutes1819 = pd.DataFrame({
    "date_time": pd.date_range(start=pd.to_datetime("2018-12-31-05:00:00"),
                               periods=24 * 1 * 6,
                               freq=datetime.timedelta(minutes=10)),
})

minutes1920 = pd.DataFrame({
    "date_time": pd.date_range(start=pd.to_datetime("2019-12-31-05:00:00"),
                               periods=24 * 1 * 6,
                               freq=datetime.timedelta(minutes=10)),
})

#### Germany

In [44]:
# Download the PM2.5 measurements of every box of interest for both seasons.
# Result layout: city_data_dicts[city][season][kind] is a list of
# {"box_id": ..., "values": DataFrame or None}, with season in
# ("18_19", "19_20") and kind in ("zone", "regular").

city_data_dicts = {}

# reference time grid the measurements of each season are aligned to
season_grids = {"18_19": minutes1819, "19_20": minutes1920}

for city, city_data in cities_sensors.items():
    city_data_dicts[city] = {season: {"zone": [], "regular": []}
                             for season in season_grids}
    print(city)

    for kind in ("zone", "regular"):
        # a city may have no boxes of a kind (empty or None): nothing to fetch
        for box_id in city_data[kind] or []:
            # the sensor id is the same for both seasons, look it up once
            sensor_id = get_sensor_id_from_box_id(box_id, boxes, "PM2.5")
            for season, grid in season_grids.items():
                city_data_dicts[city][season][kind].append({
                    # one key name for all entries (the 19_20 zone entries
                    # used to be stored under the misspelled key "box_d")
                    "box_id": box_id,
                    "values": get_transformed_box_sensor_data(box_id,
                                                              sensor_id,
                                                              time_frames[season],
                                                              grid),
                })
    
    


Hamburg
Berlin
Duesseldorf
München
Dresden
Köln
Frankfurt
Stuttgart
Nuernberg
Leipzig
Muenster
Duisburg


In [45]:
# Put all measurements of a city into one DataFrame per season: the box data
# is merged on "date_time", one column per box with data.
city_data_dfs = transform_city_data_in_dfs(city_data_dicts,minutes1819,minutes1920)

In [46]:
# Keep only the sensors that are present in both seasons, so that the two
# frames of a city end up with the same columns.
for city in city_data_dfs:
    sensors1819 = city_data_dfs[city]["18_19"].drop(columns="date_time").columns.tolist()
    sensors1920 = city_data_dfs[city]["19_20"].drop(columns="date_time").columns.tolist()
    # columns that exist in one season only
    drop1819 = list(set(sensors1819).difference(sensors1920))
    drop1920 = list(set(sensors1920).difference(sensors1819))
    # drop in place, the frames stay the same objects
    city_data_dfs[city]["18_19"].drop(columns=drop1819, inplace=True)
    city_data_dfs[city]["19_20"].drop(columns=drop1920, inplace=True)

del sensors1819
del sensors1920

In [47]:
# Clearing NaNs: remove sensors with too many gaps.
#
# A column is dropped from BOTH seasons of a city as soon as it has more than
# 10 missing values in either season.
# NOTE(review): the previous remark here spoke of 288 time points and a 20 %
# cut-off (more than 50 missing), while the code uses "> 10" and the
# reference grids are built with 24 * 6 = 144 rows - confirm which threshold
# is intended.

for city in city_data_dfs:
    # number of missing values per column
    isna18_19 = city_data_dfs[city]["18_19"].isna().sum()
    isna19_20 = city_data_dfs[city]["19_20"].isna().sum()

    # columns exceeding the threshold in at least one season
    drop_cols = (isna18_19[isna18_19.gt(10)].index.tolist()
                 + isna19_20[isna19_20.gt(10)].index.tolist())

    # replace both frames by versions without those columns
    city_data_dfs[city].update({
        season: city_data_dfs[city][season].drop(columns=drop_cols)
        for season in ("18_19", "19_20")
    })

del drop_cols
del isna18_19
del isna19_20
del city

In [48]:
## add a "mean" column (row-wise mean over all columns except date_time) to each city df
city_data_dfs =add_agg_2_city_data_dfs(city_data_dfs)

### Inspect data

In [50]:
# Names of all cities in city_data_dfs.
# NOTE(review): the remark that used to stand here ("only two sensors
# remained, kick them out!") does not match the code - nothing is removed at
# this point; confirm whether a filter step is missing.

cities = list(city_data_dfs)

#### handling nans

In [51]:
# interpolation: fill the remaining gaps in both seasons of every city with
# DataFrame.interpolate() (default settings)
for city in cities:
    city_data_dfs[city].update({
        season: city_data_dfs[city][season].interpolate()
        for season in ("18_19", "19_20")
    })


## Save Data

In [52]:
# Write the per-city frames to disk with the highest pickle protocol.
with open('data/city_data_dfs_PM25.pickle', 'wb') as out_file:
    pickle.dump(city_data_dfs, out_file, protocol=pickle.HIGHEST_PROTOCOL)