In [1]:
import pandas as pd
import seaborn as sns
# Plot styling for the notebook: seaborn defaults, then the "whitegrid"
# style and the "GnBu_d" colour palette.
sns.set()
sns.set_style("whitegrid")
sns.set_palette("GnBu_d")
from haversine import haversine  # install with: conda install -c conda-forge haversine

In [2]:
# Load the Bremen and Freiburg trip data from the shared resources folder.
brm = pd.read_csv("../resources/bremen.csv")
frb = pd.read_csv("../resources/freiburg.csv")

In [3]:
# Bundle each city's coordinate columns as (latitude, longitude) tuples of
# Series: one tuple for the trip origins and one for the destinations.
brm_origin = (brm["orig_lat"], brm["orig_lng"])
brm_destination = (brm["dest_lat"], brm["dest_lng"])

frb_origin = (frb["orig_lat"], frb["orig_lng"])
frb_destination = (frb["dest_lat"], frb["dest_lng"])

In [4]:
# Haversine distance between origin and destination for every Bremen trip,
# walking the four coordinate columns in lockstep.
brm_distance = [
    haversine((start_lat, start_lng), (end_lat, end_lng))
    for start_lat, start_lng, end_lat, end_lng in zip(
        brm_origin[0], brm_origin[1], brm_destination[0], brm_destination[1]
    )
]

# Attach the result to the frame as a new "distance" column.
brm["distance"] = brm_distance

In [5]:
# Haversine distance between origin and destination for every Freiburg trip,
# walking the four coordinate columns in lockstep.
frb_distance = [
    haversine((start_lat, start_lng), (end_lat, end_lng))
    for start_lat, start_lng, end_lat, end_lng in zip(
        frb_origin[0], frb_origin[1], frb_destination[0], frb_destination[1]
    )
]

# Attach the result to the frame as a new "distance" column.
frb["distance"] = frb_distance

We checked the longest trip duration for both Bremen and Freiburg and found that, in each city, it is just under 3 hours. Because no trip comes close to lasting a full day, the day component of the duration carries no information, and the sub-second (millisecond) precision is redundant. We therefore convert every duration to a decimal number of hours, which makes the data easier to grasp for the average reader and easier to work with for the data scientist.

In [6]:
def timedelta_to_hours(timedelta_array):
    """Convert a Series of durations into a float number of hours.

    Parameters
    ----------
    timedelta_array : pandas.Series
        Durations given as ``Timedelta`` values or as timedelta-like strings
        such as ``"0 days 00:05:00"``. Missing entries are allowed.

    Returns
    -------
    pandas.Series
        The durations in hours as floats, keeping the input's index and name.
        Missing durations come back as ``NaN``.

    Raises
    ------
    TypeError
        If the input already holds numbers. ``pd.to_timedelta`` would read
        them as nanoseconds, so a second conversion of the same column would
        silently produce wrong values instead of failing.
    """
    # An empty column has nothing to misinterpret, so only guard real data.
    if len(timedelta_array) > 0 and pd.api.types.is_numeric_dtype(timedelta_array):
        raise TypeError(
            "timedelta_to_hours expects timedelta-like values, got numeric data "
            "(has this column already been converted to hours?)"
        )
    # Parse once for the whole column, then divide vectorised by one hour.
    return pd.to_timedelta(timedelta_array) / pd.Timedelta("1 hour")
def drop_noise(frame, min_distance=0.1):
    """Return a copy of ``frame`` without trips shorter than ``min_distance``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Trip table with a numeric ``"distance"`` column.
    min_distance : float, default 0.1
        Rows whose distance is strictly below this value are removed. The
        value is in the same unit as the ``"distance"`` column.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with a fresh 0..n-1 index. The input frame is left
        untouched, so the call can be repeated safely.
    """
    too_short = frame["distance"] < min_distance
    # Negate the "too short" mask instead of testing ">=" so that rows with a
    # missing distance are kept, exactly as a plain "< threshold" drop would.
    return frame.loc[~too_short].reset_index(drop=True)
def drop_kmh(frame, min_kmh=5):
    """Return a copy of ``frame`` without trips slower than ``min_kmh``.

    The average speed of a trip is ``distance / trip_duration``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Trip table with numeric ``"distance"`` and ``"trip_duration"`` columns.
    min_kmh : float, default 5
        Rows whose average speed is strictly below this value are removed.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with a fresh 0..n-1 index and the same columns as
        the input. The input frame is left untouched, and no helper column is
        added to either frame.
    """
    # Keep the speed as a local Series: nothing has to be added to the frame
    # and removed again afterwards.
    speed = frame["distance"] / frame["trip_duration"]
    too_slow = speed < min_kmh
    # Negating the mask keeps rows whose speed is NaN or infinite (e.g. a zero
    # duration), matching a plain "< threshold" drop.
    return frame.loc[~too_slow].reset_index(drop=True)


In [7]:
# Overwrite each city's trip_duration column with its value in hours.
frb["trip_duration"] = timedelta_to_hours(frb["trip_duration"])
brm["trip_duration"] = timedelta_to_hours(brm["trip_duration"])

As part of the data cleaning, we assume that every trip with a total distance under 100 m is the result of some kind of error, such as an unintended log-in, and we therefore remove these trips from the data set.  
After this step the dataset for the city of Bremen is reduced from 157 575 to 151 728 data points (5 847 rows were deleted).
The dataset for the city of Freiburg shrinks from 127 528 to 126 970 data points (558 rows were deleted).

In [8]:
# Remove the very short trips (distance below 0.1) from both cities.
frb = drop_noise(frb)
brm = drop_noise(brm)

Another assumption made for the purpose of data cleaning is that trips with an average speed under 5 km/h are of no interest to us either.
This assumption reduces both datasets further.
We are now left with 116 244 data points in Bremen (35 484 rows were deleted) and with 100 879 data points in Freiburg (26 091 rows were deleted).
These are the final datasets on which we will perform our analysis.

In [9]:
# Remove the trips whose average speed (distance / trip_duration) is below 5.
brm = drop_kmh(brm)
frb = drop_kmh(frb)

In [10]:
# Preview the last four rows of the cleaned Bremen frame.
brm.tail(4)

Unnamed: 0,day,time,b_number,city,trip_duration,orig_lat,orig_lng,dest_lat,dest_lng,distance
116241,2019-11-17,18:39:00,20999,bremen,0.083333,53.072071,8.828776,53.070533,8.821335,0.525679
116242,2019-11-18,09:23:00,20999,bremen,0.15,53.070369,8.821749,53.07936,8.813848,1.130559
116243,2019-11-19,15:35:00,20999,bremen,0.716667,53.079591,8.813925,53.145093,8.910599,9.730569
116244,2019-11-21,17:45:00,20999,bremen,0.183333,53.145798,8.909947,53.15816,8.94525,2.726051


In [11]:
# Preview the last four rows of the cleaned Freiburg frame.
frb.tail(4)

Unnamed: 0,day,time,b_number,city,trip_duration,orig_lat,orig_lng,dest_lat,dest_lng,distance
100876,2020-01-20,08:14:00,32999,freiburg,0.2,48.01205,7.854987,47.994729,7.846862,2.018664
100877,2020-01-20,09:10:00,32999,freiburg,0.066667,47.9961,7.84616,48.000858,7.849587,0.587308
100878,2020-01-20,13:55:00,32999,freiburg,0.116667,48.002664,7.851253,47.99743,7.8425,0.873416
100879,2020-01-20,14:16:00,32999,freiburg,0.3,47.99743,7.8425,48.01377,7.80708,3.200782


In [None]:
# Write the cleaned frames next to the raw inputs.
# NOTE(review): to_csv also writes the index as an extra unnamed column by
# default — confirm that downstream readers expect that column.
brm.to_csv("../resources/brm_cleaned.csv")
frb.to_csv("../resources/frb_cleaned.csv")