# get hourly data for 2019 for the sensor boxes in central Berlin

In [1]:
import datetime
import pandas as pd
import numpy as np
import requests
import pickle
import geopy.distance

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import clear_output

## define functions

In [2]:
def get_weekly_data(box_id: str,sensor_id:str,week:list,save_mode = False):
    """
    Download the raw measurements of one sensor for one time window.

    - takes box_id, sensor_id and the window specified in week
      (week[0] = from-date, week[1] = to-date, inserted verbatim into the URL)
    - requests the data for that sensor in that timeframe from opensensemap
    - if save_mode is set, pickles the response exactly as received (before
      "location" is removed) to data/one_year_berlin/<box_id>_<from-date>.pkl,
      where <from-date> is the first 10 characters of week[0]
    - returns a dataframe with the values of the sensor for the specified
      times, without the "location" column; the dataframe is empty if the
      API returned no measurements

    raises:
        requests.HTTPError if the API answers with an error status
        requests.Timeout if the API does not respond in time
    """

    # 1) get data from API
    url = ("https://api.opensensemap.org/boxes/"
           + f"{box_id}/data/"
           + sensor_id
           + f"?from-date={week[0]}&to-date={week[1]}"
           + "&download=false&format=json")

    # one request only; the timeout keeps a stalled connection from blocking
    # a long multi-week download forever
    response = requests.get(url, timeout=60)
    # fail with the HTTP status instead of an obscure DataFrame construction
    # error when the API returns an error document
    response.raise_for_status()
    df = pd.DataFrame(response.json())

    # 2) optionally keep a copy of the raw response on disk
    if save_mode:
        df.to_pickle(f"data/one_year_berlin/{box_id}_{week[0][:10]}.pkl")

    # 3) drop the location column; an empty response has no columns at all
    if len(df) > 0:
        return df.drop(columns = "location", errors = "ignore")
    return df
    
    
    

In [3]:
def transform_2_hourly_data(df: pd.DataFrame,week:list):
    """
    Aggregate raw sensor readings to hourly means on a fixed one-week grid.

    input:
        df:   dataframe with sensor data in the columns "createdAt"
              (timestamps) and "value" (readings, anything castable to
              float); all other columns are ignored. May be empty.
        week: [start, end] strings; only the first 10 characters of week[0]
              (the start date) are used.
    return: dataframe with the columns "date_time" and "value" and exactly
            24*7 rows, one per hour, starting at 00:00:00 of the start date.
            "value" is the mean of all readings whose timestamp falls into
            that hour and NaN (float) where there is none. "date_time" is
            timezone-naive; timezone-aware readings are converted to UTC
            before the timezone is dropped.
    """

    # (1) define hourly time-stamps for the week
    week_start = pd.to_datetime(f"{week[0][:10]} 00:00:00")
    hours = pd.DataFrame(
        {"date_time": pd.date_range(start=week_start, periods=24 * 7, freq="h")}
    )

    # (2) nothing measured: return the bare grid with a float NaN column so the
    #     result has the same dtypes as a week that does contain data
    if len(df) == 0:
        hours["value"] = float("nan")
        return hours

    # (3) assign every reading to the start of its hour; the grid above is
    #     timezone-naive, so the timestamps are brought to UTC and stripped
    readings = pd.DataFrame({
        "date_time": (pd.to_datetime(df["createdAt"], utc=True)
                      .dt.floor("h")
                      .dt.tz_localize(None)),
        "value": df["value"].astype(float),
    })

    # (4) hourly mean, then left-join onto the grid so hours without any
    #     reading stay in the result as NaN
    df_ret = readings.groupby("date_time", as_index=False)["value"].mean()
    return hours.merge(right=df_ret,on="date_time",how="left")

In [5]:
def get_data_4_sensor(box_id: str, sensor_id:str,dates):
    """
    for the times specified in dates, returns a df with hourly data for all the weeks

    input:
        box_id, sensor_id: passed on to get_weekly_data for every entry
        dates: iterable of time windows; each entry is handed unchanged to
               get_weekly_data and transform_2_hourly_data
    return: dataframe with the columns "date_time" and "value" holding the
            hourly frames of all entries stacked in the order of dates, with
            a fresh 0..n-1 index; empty (same two columns) if dates is empty
    """
    weekly_frames = []
    for date in dates:
        # progress output, one line per downloaded window
        print(date)
        df_dates = get_weekly_data(box_id,sensor_id,date)
        weekly_frames.append(transform_2_hourly_data(df_dates,date))

    # nothing requested: keep returning an empty frame with the usual columns
    if not weekly_frames:
        return pd.DataFrame(columns = ["date_time","value"])

    # concatenate once at the end: growing a frame inside the loop copies all
    # previous rows on every iteration, and starting from an empty
    # object-dtype frame makes the result dtypes depend on how pandas treats
    # empty inputs
    return pd.concat(weekly_frames,axis=0,sort=False).reset_index(drop=True)

## get data for the PM10 sensor of every box in central Berlin

In [6]:
# [start, end] timestamp pairs for the 53 Monday-to-Sunday weeks that run from
# Monday 2018-12-31 to Sunday 2020-01-05; a week starts at 00:00:00 on its
# Monday and ends at 23:59:59 on its Sunday
weeks2019 = [
    [
        f"{datetime.date(2018, 12, 31) + datetime.timedelta(weeks=week_num)}T00:00:00Z",
        f"{datetime.date(2019, 1, 6) + datetime.timedelta(weeks=week_num)}T23:59:59Z",
    ]
    for week_num in range(53)
]

In [7]:
# load all boxes from a local pickle file
# NOTE(review): unpickling executes code embedded in the file -- only load
# this file if it comes from a trusted source
boxes = pd.read_pickle("data/boxes_18-19_19-20.pkl")

In [8]:
# only consider boxes in Berlin, keeping just the columns needed further down
# .copy() makes this an independent frame: a "dist" column is added to it
# later, which on a slice of `boxes` can raise pandas' SettingWithCopyWarning
boxes_berlin = boxes.loc[boxes.City.eq("Berlin"),["_id","PM10","coordinates","District"]].copy()

In [9]:
# reference coordinates (city center): the coordinates of the box with the id below
ref_coord = boxes_berlin.loc[
    boxes_berlin["_id"] == "592ca4b851d3460011ea2635", "coordinates"
].iloc[0]

In [10]:
# distance in km from every box to the reference point
# the two entries of each coordinate pair are swapped before being handed to
# geopy -- presumably they are stored as (lon, lat); confirm against the data
boxes_berlin["dist"] = [
    geopy.distance.distance((ref_coord[1], ref_coord[0]), (coord[1], coord[0])).km
    for coord in boxes_berlin["coordinates"]
]

In [11]:
# boxes in the center of Berlin: those at most 8 km away from the reference point
boxes_berlin_center = boxes_berlin[boxes_berlin["dist"] <= 8].reset_index(drop=True)

In [12]:
# start with an empty frame; the loop below replaces it with the data of the
# first box and then adds one column per further box
boxes_berlin_center_values = pd.DataFrame()

In [18]:
for i in boxes_berlin_center.index:
    # progress output: row number of the box being downloaded
    print(i)

    # hourly data of this box for all weeks, fetched from the API
    box_id = boxes_berlin_center.loc[i,"_id"]
    box_hourly = get_data_4_sensor(box_id,
                                   boxes_berlin_center.loc[i,"PM10"],
                                   weeks2019)

    if len(boxes_berlin_center_values) > 0:
        # every further box contributes one column named after its id
        boxes_berlin_center_values[box_id] = box_hourly["value"]
    else:
        # first box: take the whole frame ("date_time" and "value")
        boxes_berlin_center_values = box_hourly

    # wipe the cell output so the progress lines do not pile up
    clear_output()


In [14]:
# the first box's data is still in a column called "value": name it after that box's id
boxes_berlin_center_values = boxes_berlin_center_values.rename(
    columns={"value": boxes_berlin_center.at[0, "_id"]},
)

In [15]:
# share of missing values (NaN) in every column
boxes_nans = boxes_berlin_center_values.isna().mean()

In [18]:
# keep only the columns whose share of NaNs is at most 20 %
# NOTE(review): this comment used to say "less than 10% nans", but the code
# compares against 0.2 with an inclusive test -- confirm the intended threshold
boxes_nans = boxes_nans[boxes_nans.le(0.2)]

In [17]:
# save the complete table as pickle; at this point in the file no columns
# have been removed from it yet
boxes_berlin_center_values.to_pickle("data/data_berlin_2019_all.pkl")

In [19]:
# keep only the columns that are still listed in boxes_nans, i.e. those that
# passed the missing-value filter
boxes_berlin_center_values = boxes_berlin_center_values.loc[:, boxes_nans.index.tolist()]

In [20]:
# save the table as pickle under a second file name, after the column
# selection by boxes_nans
boxes_berlin_center_values.to_pickle("data/data_berlin_2019.pkl")