In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
sns.set_style("whitegrid")
sns.set_palette("GnBu_d")
import folium
from folium import plugins
from folium.plugins import HeatMap
from datetime import datetime
from datetime import timedelta
import math
import random 
import timeit
from haversine import haversine # conda install -c conda-forge haversine ''

In [2]:
# Load the trip data of both cities; the file names are relative to the
# notebook's working directory.
BRM, FRB = (pd.read_csv(csv_name) for csv_name in ("bremen.csv", "freiburg.csv"))

In [3]:
# Bundle each city's coordinate columns as (latitude Series, longitude Series)
# pairs, once for the trip origin and once for the trip destination.
ORIGIN_COLUMNS = ("orig_lat", "orig_lng")
DESTINATION_COLUMNS = ("dest_lat", "dest_lng")

BRM_origin = tuple(BRM[column] for column in ORIGIN_COLUMNS)
BRM_destination = tuple(BRM[column] for column in DESTINATION_COLUMNS)

FRB_origin = tuple(FRB[column] for column in ORIGIN_COLUMNS)
FRB_destination = tuple(FRB[column] for column in DESTINATION_COLUMNS)

In [None]:
# Great-circle (haversine) distance between the origin and destination of every
# Bremen trip, in the haversine package's default unit (kilometres).
#
# zip() walks the four coordinate Series in lockstep by position. This avoids
# four label lookups per row (slow on large frames) and does not depend on the
# frame still having its default 0..n-1 index.
BRM_distance = [
    haversine((orig_lat, orig_lng), (dest_lat, dest_lng))
    for orig_lat, orig_lng, dest_lat, dest_lng in zip(*BRM_origin, *BRM_destination)
]

BRM["distance"] = BRM_distance

In [None]:
# Great-circle (haversine) distance between the origin and destination of every
# Freiburg trip, in the haversine package's default unit (kilometres).
#
# zip() walks the four coordinate Series in lockstep by position. This avoids
# four label lookups per row (slow on large frames) and does not depend on the
# frame still having its default 0..n-1 index.
FRB_distance = [
    haversine((orig_lat, orig_lng), (dest_lat, dest_lng))
    for orig_lat, orig_lng, dest_lat, dest_lng in zip(*FRB_origin, *FRB_destination)
]

FRB["distance"] = FRB_distance


In [None]:
# Display the last three Bremen rows as a quick visual check of the frame.
BRM.tail(3)

In [None]:
# Display the last three Freiburg rows as a quick visual check of the frame.
FRB.tail(3)

We checked the longest trip duration for both Bremen and Freiburg and found that both are just under 3 hours. Since no trip lasts a full day, we remove the day component and the redundant fractional seconds from the trip duration. This makes the data easier to grasp for the average reader and easier to work with for the data scientist.

In [None]:
 # Longest Bremen trip. The column comes straight from read_csv (presumably as
 # text), and max() on strings compares lexicographically, so parse the values
 # to Timedelta first to get the chronologically longest duration.
 pd.to_timedelta(BRM["trip_duration"]).max()

In [None]:
 # Longest Freiburg trip. The column comes straight from read_csv (presumably as
 # text), and max() on strings compares lexicographically, so parse the values
 # to Timedelta first to get the chronologically longest duration.
 pd.to_timedelta(FRB["trip_duration"]).max()

In [None]:
# Keep only the clock part of each duration: drop the leading "<n> days " and
# any fractional seconds.
# - Raw string: "\." in a normal string literal is an invalid escape sequence.
# - (?:\.|$) also matches durations that have no fractional part, which would
#   otherwise become NaN.
# - expand=False returns a Series (one capture group) instead of a DataFrame.
# NOTE: this overwrites the column; re-running the cell on the already
# shortened values yields NaN because they no longer contain "days ".
BRM["trip_duration"] = (
    BRM["trip_duration"].astype(str).str.extract(r"days (.*?)(?:\.|$)", expand=False)
)

In [None]:
# Keep only the clock part of each duration: drop the leading "<n> days " and
# any fractional seconds.
# - Raw string: "\." in a normal string literal is an invalid escape sequence.
# - (?:\.|$) also matches durations that have no fractional part, which would
#   otherwise become NaN.
# - expand=False returns a Series (one capture group) instead of a DataFrame.
# NOTE: this overwrites the column; re-running the cell on the already
# shortened values yields NaN because they no longer contain "days ".
FRB["trip_duration"] = (
    FRB["trip_duration"].astype(str).str.extract(r"days (.*?)(?:\.|$)", expand=False)
)

In [None]:
# Display the first three Freiburg rows as a quick visual check of the frame.
FRB.head(3)

In [None]:
# Display the first three Bremen rows as a quick visual check of the frame.
BRM.head(3)

As part of the data cleaning, we assume that all trips with a total distance under 100 m are due to some kind of error, such as an unintended log-in; hence they are removed from the data set.

In [None]:
#dropping the noise-rows
indexF = FRB[(FRB["distance"] < 0.1)].index
FRB.drop(indexF, inplace=True)
indexB = BRM[(BRM["distance"] < 0.1)].index
BRM.drop(indexB, inplace=True)

#resetting the indices after the cleaning
FRB=FRB.reset_index(drop=True)
BRM=BRM.reset_index(drop=True)

Another assumption made for the purpose of data cleaning is that trips with an average speed under 0.5 km/h are also of no interest to us.

In [None]:
#creating helper-columns for the calculation of the  average kmh
FRB["trip_duration_seconds"] = pd.to_timedelta(FRB['trip_duration'])
FRB["trip_duration_seconds"] = FRB["trip_duration_seconds"].dt.seconds
FRB["trip_duration_hours"] = (FRB["trip_duration_seconds"]/3600)
FRB["kmh"] = (FRB["distance"] / FRB["trip_duration_hours"])

BRM["trip_duration_seconds"] = pd.to_timedelta(BRM['trip_duration'])
BRM["trip_duration_seconds"] = BRM["trip_duration_seconds"].dt.seconds
BRM["trip_duration_hours"] = (BRM["trip_duration_seconds"]/3600)
BRM["kmh"] = (BRM["distance"] / BRM["trip_duration_hours"])

# applying the condition
indexF = FRB[(FRB["kmh"] < 0.5)].index
FRB.drop(indexF, inplace=True) 
indexB = BRM[(BRM["kmh"] < 0.5)].index
BRM.drop(indexB, inplace=True) 

#resetting the indices after the cleaning
FRB=FRB.reset_index(drop=True)
BRM=BRM.reset_index(drop=True)

#removing the helper-columns from the data set
FRB = FRB.drop("trip_duration_seconds", 1)
FRB = FRB.drop("trip_duration_hours", 1)
FRB = FRB.drop("kmh", 1)

BRM = BRM.drop("trip_duration_seconds", 1)
BRM = BRM.drop("trip_duration_hours", 1)
BRM = BRM.drop("kmh", 1)



In [None]:
# Display the last three Bremen rows; with a 0..n-1 index the last label also
# shows how many rows are left.
BRM.tail(3)

In [None]:
# Display the last three Freiburg rows; with a 0..n-1 index the last label also
# shows how many rows are left.
FRB.tail(3)

In conclusion:
- the number of data points for the city of Bremen was reduced from 157 576 (before the cleaning) to 148 514 (after the cleaning);
- the number of data points for the city of Freiburg was reduced from 127 529 (before the cleaning) to 124 763 (after the cleaning).