## Imports and definitions

In [1]:
import sys, os, glob, time, requests, json
from time import time, strftime
from tqdm import tqdm, trange

import numpy as np
from numpy.random import *
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import folium
from folium import plugins, Map, CircleMarker

In [2]:
dataDir = '../data/'
buildDir = '../build/'

downloadsDir = '../downloads/'
metadataDir = '../metadata/'
archiveDir = '../archive/'

## Analyze data

In [4]:
stations = pd.read_csv(os.path.join(metadataDir, 'stations.csv'), index_col='id')
stations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 508 entries, 72 to 3249
Data columns (total 5 columns):
latitude       508 non-null float64
longitude      508 non-null float64
stationName    508 non-null object
statusValue    508 non-null object
totalDocks     508 non-null int64
dtypes: float64(2), int64(1), object(2)
memory usage: 23.8+ KB


In [None]:
%%time

dfArray = []

files = glob.glob(os.path.join(downloadsDir, '*.csv'))
for fileName in tqdm(files):
    df = pd.read_csv(fileName)
    dfArray.append(df)

 42%|████▏     | 5/12 [00:06<00:10,  1.50s/it]

In [None]:
%%time
dfFull = pd.concat(dfArray, ignore_index=True)
dfFull.info()

In [None]:
reset_selective dfArray

In [None]:
maxNormalTime = 6 * 60 * 60

fieldsToExtract = ['tripduration', 'start station id', 'start station name',
                   'start station latitude', 'start station longitude']

df = dfFull.loc[dfFull.tripduration <= maxNormalTime, fieldsToExtract
                ].groupby('start station id').agg(    
    {'tripduration': {'dep_count': 'count',
                      'dep_mean': 'mean',
                      'dep_min': 'min',
                      'dep_max': 'max',
                      'dep_median': 'median'
                      },
     'start station name': {'name': 'first'},
     'start station latitude': {'latitude': 'first'},
     'start station longitude': {'longitude': 'first'}})

df.index.name = 'id'
df.columns = df.columns.get_level_values(1)
df = df[['name', 'latitude', 'longitude', 'dep_count', 'dep_mean', 'dep_min', 'dep_max', 'dep_median']]

dfDep = df

In [None]:
fieldsToExtract = ['tripduration', 'end station id']

df = dfFull.loc[dfFull.tripduration <= maxNormalTime, fieldsToExtract
                ].groupby('end station id').agg(    
    {'tripduration': {'arr_count': 'count',
                      'arr_mean': 'mean',
                      'arr_min': 'min',
                      'arr_max': 'max',
                      'arr_median': 'median'
                      }})

df.index.name = 'id'
df.columns = df.columns.get_level_values(1)
df = df[['arr_count', 'arr_mean', 'arr_min', 'arr_max', 'arr_median']]

dfArr = df

In [9]:
df = dfDep.join(dfArr, how='left').join(stations[['statusValue', 'totalDocks']], how='left')

In [10]:
reset_selective dfArr

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [11]:
reset_selective dfDep

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [12]:
df.loc[3120, 'name'] = 'Center Blvd & Borden Ave'

df.loc[df.statusValue.isnull(), 'statusValue'] = 'In Service'
df.loc[(df.totalDocks.isnull()) | (df.totalDocks == 0), 'totalDocks'] = df[df.totalDocks != 0, 'totalDocks'].mean()

df.totalDocks = df.totalDocks.astype(int)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 488 entries, 72 to 3242
Data columns (total 15 columns):
name           488 non-null object
latitude       488 non-null float64
longitude      488 non-null float64
dep_count      488 non-null int64
dep_mean       488 non-null float64
dep_min        488 non-null int64
dep_max        488 non-null int64
dep_median     488 non-null float64
arr_count      488 non-null int64
arr_mean       488 non-null float64
arr_min        488 non-null int64
arr_max        488 non-null int64
arr_median     488 non-null float64
statusValue    488 non-null object
totalDocks     488 non-null int64
dtypes: float64(6), int64(7), object(2)
memory usage: 61.0+ KB


In [13]:
df.to_csv(os.path.join(dataDir, 'trips-pivot.csv'))