In [1]:
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

## Defining methods to join bike_data and weather data

In [2]:
def cut_timestamp(timestamp_col):
    return timestamp_col.map(lambda x:int(str(x)[:10]))

def assemble_timestamp_cut(day_frame, time_frame):
    day_clean = day_frame.map(lambda x: x.replace("-", ""))
    hour = time_frame.map(lambda x: x.replace(":", "")[:2])
    return (day_clean + hour).astype("int64")

def join_by_hour(bike_frame, weather_frame, column_name):
    bike_frame["timestamp_cut"] = assemble_timestamp_cut(bike_frame["day"], bike_frame["time"])
    weather_frame["timestamp_cut"] = cut_timestamp(weather_frame["Zeitstempel"])
    
    joined = bike_frame.join(weather_frame[["Wert", "timestamp_cut"]].set_index("timestamp_cut"), on="timestamp_cut")
    joined[column_name] = joined["Wert"]
    
    joined = joined.drop("Wert", axis=1)
    joined = joined.drop("timestamp_cut", axis=1)
    
    return joined

In [3]:
brm_frame = pd.read_csv("resources/bremen.csv")
brm_precipitation = pd.read_csv("resources/bremen_precipitation.csv")
brm_temperature = pd.read_csv("resources/bremen_temperature.csv")
schwarme_precipitation = pd.read_csv("resources/schwarme_precipitation.csv")
bassum_precipitation = pd.read_csv("resources/bassum_precipitation.csv")

brm_precipitation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8775 entries, 0 to 8774
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Produkt_Code      8775 non-null   object 
 1   SDO_ID            8775 non-null   int64  
 2   Zeitstempel       8775 non-null   int64  
 3   Wert              8775 non-null   float64
 4   Qualitaet_Niveau  8775 non-null   int64  
 5   Qualitaet_Byte    8775 non-null   int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 411.5+ KB


Identifying missing values

In [4]:
# loading precipitation data of bremen and mark missing values
brm_precipitation_full = brm_precipitation.join(all_dates_frame.set_index("Zeitstempel"), on="Zeitstempel", how="outer")
brm_precipitation_full = brm_precipitation_full.set_index("Zeitstempel")
brm_precipitation_full = brm_precipitation_full.sort_values("Zeitstempel")
brm_precipitation_full = brm_precipitation_full.reset_index()

brm_precipitation_full[brm_precipitation_full["Wert"].isnull()]

NameError: name 'all_dates_frame' is not defined

In [5]:
brm_sub_frame = brm_precipitation_full[(brm_precipitation_full["Zeitstempel"] >= 201902070000) & (brm_precipitation_full["Zeitstempel"] <= 201902080300)]
brm_sub_frame

NameError: name 'brm_precipitation_full' is not defined

## Getting data from station of schwarme and station of bassum

In [6]:
# loading precipitation data of schwarme and mark missing values
schwarme_precipitation_full = schwarme_precipitation.join(all_dates_frame.set_index("Zeitstempel"), on="Zeitstempel", how="outer")
schwarme_precipitation_full = schwarme_precipitation_full.set_index("Zeitstempel")
schwarme_precipitation_full = schwarme_precipitation_full.sort_values("Zeitstempel")
schwarme_precipitation_full = schwarme_precipitation_full.reset_index()

bassum_precipitation_full = bassum_precipitation.join(all_dates_frame.set_index("Zeitstempel"), on="Zeitstempel", how="outer")
bassum_precipitation_full = bassum_precipitation_full.set_index("Zeitstempel")
bassum_precipitation_full = bassum_precipitation_full.sort_values("Zeitstempel")
bassum_precipitation_full = bassum_precipitation_full.reset_index()

NameError: name 'all_dates_frame' is not defined

## Finding the station which has approximately the same precipitation than the station of bremen

In [7]:
schwarme_sub_frame = schwarme_precipitation_full[(schwarme_precipitation_full["Zeitstempel"] >= 201902070000) & (schwarme_precipitation_full["Zeitstempel"] <= 201902080300)]

NameError: name 'schwarme_precipitation_full' is not defined

In [8]:
bassum_sub_frame = bassum_precipitation_full[(bassum_precipitation_full["Zeitstempel"] >= 201902070000) & (bassum_precipitation_full["Zeitstempel"] <= 201902080300)]

NameError: name 'bassum_precipitation_full' is not defined

In [9]:

plt.plot(bassum_sub_frame["Zeitstempel"], bassum_sub_frame["Wert"], '.-', label="bassum")
plt.plot(schwarme_sub_frame["Zeitstempel"], schwarme_sub_frame["Wert"], '-', label="schwarme")
plt.plot(brm_sub_frame["Zeitstempel"], brm_sub_frame["Wert"], '+-', label="bremen")


plt.legend()

plt.show()


NameError: name 'bassum_sub_frame' is not defined

Since there are missing hours in the weather data, we want to fill these gaps with weater data from the nearest station. 


## combine missing values of brm_precipitation with precipitation values from the city of schwarme

In [10]:
# create a dataframe which contains all dates from 2019-01-20 until 2020-01-20
all_dates_frame = pd.DataFrame(pd.date_range(
    start = datetime.strptime("2019-01-20 00", "%Y-%m-%d %H"), 
    end = datetime.strptime("2020-01-20 00", "%Y-%m-%d %H"),
    freq="1h"), columns=["date"])

all_dates_frame["Zeitstempel"] = all_dates_frame["date"].map(lambda x: int(x.strftime("%Y%m%d%H%M")))
all_dates_frame = all_dates_frame.drop("date", 1)

# loading precipitation data of schwarme and mark missing values
schwarme_precipitation_full = schwarme_precipitation.join(all_dates_frame.set_index("Zeitstempel"), on="Zeitstempel", how="outer")
schwarme_precipitation_full = schwarme_precipitation_full.set_index("Zeitstempel")
schwarme_precipitation_full = schwarme_precipitation_full.sort_values("Zeitstempel")

# loading precipitation data of bremen and mark missing values
brm_precipitation_full = brm_precipitation.join(all_dates_frame.set_index("Zeitstempel"), on="Zeitstempel", how="outer")
brm_precipitation_full = brm_precipitation_full.set_index("Zeitstempel")
brm_precipitation_full = brm_precipitation_full.sort_values("Zeitstempel")

# fill missing values from bremen with values of schwerme
brm_precipitation_full[brm_precipitation_full["Wert"].isnull()] = schwarme_precipitation_full

brm_precipitation_combined = brm_precipitation_full
brm_precipitation_combined = brm_precipitation_combined.reset_index()

In [11]:
brm_frame_precipitation = join_by_hour(brm_frame, brm_precipitation_combined, "precipitation")
brm_frame_precipitation

brm_frame_weather = join_by_hour(brm_frame_precipitation, brm_temperature, "temperature")

brm_frame_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157576 entries, 0 to 157575
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   day            157576 non-null  object 
 1   time           157576 non-null  object 
 2   b_number       157576 non-null  int64  
 3   city           157576 non-null  object 
 4   trip_duration  157576 non-null  object 
 5   orig_lat       157576 non-null  float64
 6   orig_lng       157576 non-null  float64
 7   dest_lat       157576 non-null  float64
 8   dest_lng       157576 non-null  float64
 9   precipitation  157576 non-null  float64
 10  temperature    157576 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 13.2+ MB


In [12]:
frb_frame = pd.read_csv("resources/freiburg.csv")
frb_precipitation = pd.read_csv("resources/freiburg_precipitation.csv")
frb_temperature = pd.read_csv("resources/freiburg_temperature.csv")

frb_frame_precipitation = join_by_hour(frb_frame, frb_precipitation, "precipitation")
frb_frame_weather = join_by_hour(frb_frame_precipitation, frb_temperature, "temperature")

frb_frame_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127529 entries, 0 to 127528
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   day            127529 non-null  object 
 1   time           127529 non-null  object 
 2   b_number       127529 non-null  int64  
 3   city           127529 non-null  object 
 4   trip_duration  127529 non-null  object 
 5   orig_lat       127529 non-null  float64
 6   orig_lng       127529 non-null  float64
 7   dest_lat       127529 non-null  float64
 8   dest_lng       127529 non-null  float64
 9   precipitation  127529 non-null  float64
 10  temperature    127529 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 10.7+ MB


In [13]:
frb_frame_weather

Unnamed: 0,day,time,b_number,city,trip_duration,orig_lat,orig_lng,dest_lat,dest_lng,precipitation,temperature
0,2019-05-06,14:22:00,32560,freiburg,0 days 00:07:00.000000000,47.993178,7.795708,47.994027,7.796084,0.0,11.2
1,2019-05-07,10:42:00,32560,freiburg,0 days 00:07:00.000000000,47.994191,7.796853,47.991960,7.797405,0.0,11.2
2,2019-05-07,11:02:00,32560,freiburg,0 days 00:09:00.000000000,47.992044,7.797352,47.992000,7.797478,0.0,12.5
3,2019-05-07,12:48:00,32560,freiburg,0 days 00:23:00.000000000,47.992107,7.797499,47.994271,7.796668,0.0,12.9
4,2019-05-08,09:00:00,32560,freiburg,0 days 00:03:00.000000000,47.994178,7.796721,47.992307,7.796708,0.0,13.2
...,...,...,...,...,...,...,...,...,...,...,...
127524,2020-01-20,08:14:00,32999,freiburg,0 days 00:12:00.000000000,48.012050,7.854987,47.994729,7.846862,0.0,0.5
127525,2020-01-20,09:10:00,32999,freiburg,0 days 00:04:00.000000000,47.996100,7.846160,48.000858,7.849587,0.0,2.0
127526,2020-01-20,10:03:00,32999,freiburg,0 days 00:24:00.000000000,48.000858,7.849587,48.002664,7.851253,0.0,3.8
127527,2020-01-20,13:55:00,32999,freiburg,0 days 00:07:00.000000000,48.002664,7.851253,47.997430,7.842500,0.0,6.6
