Author: Harry Yau

Date: Aug 28, 2019

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import glob as glob
import pickle
import re
import seaborn as sns

### Loading the Data

In [2]:
def standardize_colnames(colnames):
    """Rewrite known column-name variants to one canonical spelling.

    Each entry of ``colnames`` is tested against a fixed set of substrings;
    when a substring is found, the entry is replaced by the matching
    canonical name. The rules are applied in order against the *original*
    entry, so if several substrings match, the last matching rule wins.

    Parameters
    ----------
    colnames : list of str
        Column names to normalize. The list is modified in place.

    Returns
    -------
    list of str
        The same list object that was passed in, after normalization.
    """
    # (substring to look for, canonical replacement), checked in this order.
    rename_rules = (
        ('Departure temperature', 'Departure temperature'),
        ('Return temperature', 'Return temperature'),
        ('Stopover duration', 'Stopover duration'),
        ('Membership type', 'Membership Type'),
        ('Formula', 'Membership Type'),
    )

    for position, original_name in enumerate(colnames):
        for needle, canonical in rename_rules:
            if needle in original_name:
                colnames[position] = canonical

    return colnames

In [3]:
pickle_folder = 'pickle'

# Data that has all the bike station info. It was pre-processed.
station_info_df = pd.read_csv(
    'data/station_data.csv',
    dtype={'id': str, 'name': str, 'lat': float, 'lon': float, 'total_slots': int},
)

# Trip data: the raw CSV files were loaded in another notebook and cached as a
# pickled list of data frames.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced locally by the companion notebook.
filename = 'loaded_mobi_data.pkl'
with open(pickle_folder + '/' + filename, 'rb') as infile:
    data_list = pickle.load(infile)

# The provided CSV files all have different column names and different columns.
# Normalize the names of every frame, then align each frame to the (normalized)
# columns of the first frame: reindex keeps only the reference columns and
# fills reference columns a frame does not have with NaN.
col_names_ref_list = standardize_colnames(list(data_list[0].columns))

for i, df in enumerate(data_list):
    df.columns = standardize_colnames(list(df.columns))
    data_list[i] = df.reindex(columns=col_names_ref_list)

del col_names_ref_list

# Combine all the data frames in the list into one frame with a fresh 0..n-1 index.
combined_df = pd.concat(data_list, axis=0, ignore_index=True)

del data_list

# Drop rows whose departure station or return station is NA. The explicit
# copy makes the result an independent frame, so the column assignments below
# do not trigger SettingWithCopyWarning.
combined_df = combined_df.dropna(subset=['Departure station', 'Return station']).copy()

# To save space in memory, keep only the leading token of each station label,
# i.e. the text before the first whitespace character. That token is what is
# compared against station_info_df['id'] below.
for col in ('Departure station', 'Return station'):
    combined_df[col] = combined_df[col].str.split(r'\s', n=1).str[0]

# Remove non-existent stations: keep only trips whose departure AND return
# station both appear in the master station list.
vec = (
    combined_df['Departure station'].isin(station_info_df['id'])
    & combined_df['Return station'].isin(station_info_df['id'])
)
combined_df = combined_df[vec].copy()

del vec

# Convert the trip start/end columns to datetime.
combined_df['Departure'] = pd.to_datetime(combined_df['Departure'])
combined_df['Return'] = pd.to_datetime(combined_df['Return'])

Check the size of the data frame as a sanity check.

In [4]:
# Displays (number of rows, number of columns) of the combined, filtered frame.
combined_df.shape

(1387272, 15)

### Adding Features

Adding two useful features: adjusted duration (trip duration minus stopover duration) and average speed (covered distance divided by adjusted duration, in km/h).

In [5]:
# Adjusted duration: total trip duration minus the stopover duration
# (presumably both in seconds, as the column name suggests -- TODO confirm).
combined_df['Adj Duration (sec.)'] = combined_df['Duration (sec.)'].sub(
    combined_df['Stopover duration']
)

# Average speed: distance converted from m to km, divided by the adjusted
# duration converted from seconds to hours.
combined_df['Average speed (km/h)'] = (
    combined_df['Covered distance (m)']
    .div(1000)
    .div(combined_df['Adj Duration (sec.)'].div(3600))
)