# 2.  Compiling the station-level dataset (part 1)

In this section, I add station-level features derived from the Divvy bike historical trip data (April 2021).

In [1]:
#importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import datetime

In [None]:
# Mount Google Drive (Colab only) so the data files under /content/drive can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#load the raw Divvy trip records (April 2021) from Google Drive
trip_csv_path = '/content/drive/MyDrive/Colab Notebooks/Deep Learning/divvy_data/data/202104-divvy-tripdata.csv'
df = pd.read_csv(trip_csv_path)

In [None]:
#only include station to station
# Keep only trips whose start AND end are both named stations. The explicit
# .copy() detaches the filtered frame from the raw one, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
station_trip_mask = df['start_station_name'].notna() & df['end_station_name'].notna()
df = df[station_trip_mask].copy()

# 'started_at' is assumed to be text shaped like 'YYYY-MM-DD HH:MM:SS':
# characters 11-12 are the hour and characters 8-9 the day of the month.
# (The previous hour slice [10:13] also picked up the separator space and
# only worked because int() ignores surrounding whitespace.)
df['hour'] = df['started_at'].str[11:13].astype(int)
df['date'] = df['started_at'].str[8:10].astype(int)
df = df.reset_index(drop=True)

In [None]:
#creating a dataframe for station names and total_count (usage volume)
# total_count = number of trips that start OR end at the station (a round trip
# at one station counts twice); start_count = number of trips that start there.
# The columns are named explicitly with .rename() so the result does not depend
# on how the installed pandas version names the output of value_counts().
all_stations = list(df['start_station_name']) + list(df['end_station_name'])
df_stations = pd.Series(all_stations).value_counts().rename('total_count').to_frame()
df_start = df['start_station_name'].value_counts().rename('start_count').to_frame()

# Left join on the station-name index: every station is kept, in descending
# total_count order; a station that never starts a trip gets NaN in start_count.
dfs = df_stations.join(df_start, how='left')
dfs = dfs.rename_axis('station_name').reset_index()

In [None]:
#create columns that record the proportion of trips that use each station as the end station
# A station that only ever appears as a destination has no start count (NaN
# after the left merge); treat that as 0 so the integer cast cannot fail.
dfs['start_count'] = dfs['start_count'].fillna(0).astype(int)
dfs['end_count'] = dfs['total_count'] - dfs['start_count']
# end_percent is a fraction in [0, 1], not a value scaled to 100.
dfs['end_percent'] = dfs['end_count']/dfs['total_count']

In [None]:
#adding coordinates

# Build a "lat,lng" text value for both ends of every trip. Each coordinate is
# truncated (not rounded) to the first 9 / 10 characters of its string form.
df['start_loc'] = df['start_lat'].map(lambda x: str(x)[:9]) + ',' + df['start_lng'].map(lambda x: str(x)[:10])
df['end_loc'] = df['end_lat'].map(lambda x: str(x)[:9]) + ',' + df['end_lng'].map(lambda x: str(x)[:10])

#create a dictionary from station name to coordinates
# Trips are scanned in row order (start station before end station) and the
# first coordinate seen for a station is kept. Iterating the columns with zip
# avoids four scalar df.loc lookups per trip, which is very slow on a month
# of trip data.
station2coord = {}
trip_endpoints = zip(df['start_station_name'], df['start_loc'],
                     df['end_station_name'], df['end_loc'])
for start_station_name, start_loc, end_station_name, end_loc in trip_endpoints:
  station2coord.setdefault(start_station_name, start_loc)
  station2coord.setdefault(end_station_name, end_loc)

In [None]:
#add the coordinates as a feature of each station
# Look up each station name in station2coord (values are "lat,lng" strings);
# a name that is missing from the dictionary would be mapped to NaN.
dfs['coord'] = dfs['station_name'].map(station2coord)

In [None]:
def ave_dis(df):
  '''
  Summarise the trip lengths of the trips in dataframe df.

  The length of a trip is the straight-line (Euclidean) distance between
  its start and end coordinates, computed directly on the values of the
  'start_lat', 'start_lng', 'end_lat' and 'end_lng' columns, so it is
  expressed in the same unit as those columns.

  Returns a two-element list: [mean trip length, variance of trip length],
  as computed by np.mean and np.var.
  '''
  delta_lng = df['end_lng'].sub(df['start_lng'])
  delta_lat = df['end_lat'].sub(df['start_lat'])
  squared_length = delta_lng.pow(2).add(delta_lat.pow(2))
  trip_length = squared_length.map(math.sqrt)
  mean_length = np.mean(trip_length)
  var_length = np.var(trip_length)
  return [mean_length, var_length]

In [None]:
def getdata(name,
            weekend_dates=(3, 4, 10, 11, 17, 18, 24, 25),
            morning_hours=(6, 7, 8, 9, 10, 11),
            evening_hours=(21, 22, 23, 0, 1, 2, 3, 4, 5)):
  '''
  Build the feature row of one station.

  Selects the trips in the global dataframe df that either start or end
  at the station called name and records, in this order:
    - the station name
    - the proportion of trips made on a classic bike
    - the proportion of trips made on an e-bike
    - the proportion of trips made by casual riders
    - the proportion of trips started on a weekend
    - the proportion of trips started in the morning
    - the proportion of trips started in the evening / overnight
    - the average trip distance and its variance (see ave_dis)

  name: station name to match against 'start_station_name' and
    'end_station_name'.
  weekend_dates: days of the month (values of df['date']) counted as
    weekend. The default lists the Saturdays and Sundays of April 2021 and
    must be changed when another month of data is loaded.
  morning_hours: start hours (values of df['hour']) counted as morning.
  evening_hours: start hours counted as evening; the default also covers
    the overnight hours up to 05:59.

  Returns a list of 9 values. If no trip matches the station, every
  proportion and distance statistic is NaN.
  '''
  row = [name]
  df_temp = df[(df['start_station_name']==name)|(df['end_station_name']==name)]
  # The mean of a boolean Series is the share of True values.
  row.append((df_temp['rideable_type']=='classic_bike').mean()) #classic_bike%
  row.append((df_temp['rideable_type']=='electric_bike').mean()) #ebike%
  row.append((df_temp['member_casual']=='casual').mean()) #casual%
  row.append((df_temp['date'].isin(list(weekend_dates))).mean()) #weekend%
  row.append((df_temp['hour'].isin(list(morning_hours))).mean()) #morning%
  row.append((df_temp['hour'].isin(list(evening_hours))).mean()) #evening%
  row.extend(ave_dis(df_temp)) #average distance and variance

  return row

In [None]:
#build one feature row per station by calling "getdata" on every station name
feature_columns = ['station_name', 'classic_bike_p',
                   'ebike_p', 'casual_p', 'weekend_p',
                   'morning_p', 'evening_p', 'average_distance', 'var_distance']
features = [getdata(station) for station in dfs['station_name']]
df_features = pd.DataFrame(features, columns=feature_columns)

In [None]:
# attach the per-station trip features to the station table (one row per station is kept)
dfs = pd.merge(dfs, df_features, how='left', on='station_name')

In [None]:
#write the station-level table to "stations.csv" on Google Drive
stations_csv_path = '/content/drive/MyDrive/Colab Notebooks/Deep Learning/divvy_data/data/stations.csv'
dfs.to_csv(stations_csv_path)

### Additional feature: average trip duration for each station (not used in the final dataset)

In [None]:
# df['started_at2'] = df['started_at'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
# df['ended_at2'] = df['ended_at'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))

In [None]:
# def timediff(d1, d2):
#   diff = (d2 - d1).total_seconds() / 60
#   return diff

In [None]:
# df['duration'] = 0
# for i in df.index:
#   df.loc[i, 'duration'] = timediff(df.loc[i, 'started_at'], df.loc[i, 'ended_at'])