In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random

%matplotlib inline

# Seed every global RNG this notebook can draw from. pandas' DataFrame.sample
# falls back to NumPy's global random state when no random_state is passed, so
# seeding only the stdlib `random` module does not make pandas sampling
# reproducible.
random.seed(30)
np.random.seed(30)

In [2]:
# Directory holding the input CSVs; the notebook also writes its output CSV here.
# This is a relative path, so it resolves against the kernel's working directory.
data_dir = './data'

In [3]:
# Build the three input paths (stations, trips, weather) under data_dir.
bs_filepath, bt_filepath, bw_filepath = [
    os.path.join(data_dir, csv_name)
    for csv_name in (
        'austin_bikeshare_stations.csv',
        'austin_bikeshare_trips.csv',
        'austin_weather.csv',
    )
]

# Load each CSV into its own raw DataFrame with pandas' default parsing.
station_df, trip_df, weather_df = (
    pd.read_csv(csv_path)
    for csv_path in (bs_filepath, bt_filepath, bw_filepath)
)

In [4]:
# Clean a deep copy so the trips frame loaded from disk stays untouched.
trip_df_clean = trip_df.copy(deep=True)

In [5]:
# Clean a deep copy so the weather frame loaded from disk stays untouched.
weather_df_clean = weather_df.copy(deep=True)

In [6]:
# Mapping from the weather CSV's CamelCase headers to snake_case column names.
# NOTE(review): the 'visiblility_*' and 'precipitaion_*' targets are misspelled,
# and 'WindGustMPH' is mapped to 'wind_low_MPH' although a gust is not a low
# reading. They are kept verbatim because these names flow through the merge
# into the exported CSV — confirm downstream consumers before renaming.
w_column_map = {
    'Date': 'date',

    # Temperature
    'TempHighF': 'temp_high_F',
    'TempAvgF': 'temp_avg_F',
    'TempLowF': 'temp_low_F',

    # Dew point
    'DewPointHighF': 'dew_point_high_F',
    'DewPointAvgF': 'dew_point_avg_F',
    'DewPointLowF': 'dew_point_low_F',

    # Humidity
    'HumidityHighPercent': 'humidity_high_percent',
    'HumidityAvgPercent': 'humidity_avg_percent',
    'HumidityLowPercent': 'humidity_low_percent',

    # Sea-level pressure
    'SeaLevelPressureHighInches': 'sea_level_pressure_high_inches',
    'SeaLevelPressureAvgInches': 'sea_level_pressure_avg_inches',
    'SeaLevelPressureLowInches': 'sea_level_pressure_low_inches',

    # Visibility
    'VisibilityHighMiles': 'visiblility_high_miles',
    'VisibilityAvgMiles': 'visiblility_avg_miles',
    'VisibilityLowMiles': 'visiblility_low_miles',

    # Wind
    'WindHighMPH': 'wind_high_MPH',
    'WindAvgMPH': 'wind_avg_MPH',
    'WindGustMPH': 'wind_low_MPH',

    # Precipitation and weather events
    'PrecipitationSumInches': 'precipitaion_sum_inches',
    'Events': 'events',
}

# Apply the mapping; columns not listed in the map keep their current names.
weather_df_clean = weather_df_clean.rename(columns=w_column_map)

In [8]:
# Remove every trip whose subscriber_type is missing.
subscriber_missing = trip_df_clean['subscriber_type'].isna()
trip_df_clean = trip_df_clean.drop(index=trip_df_clean.index[subscriber_missing])

In [10]:
# Keep only the text before the first '(' of subscriber_type and trim the
# surrounding whitespace, so parenthesised suffixes do not create separate
# categories.
plan_prefix = trip_df_clean['subscriber_type'].str.split('(').str[0]
trip_df_clean['main_subsciber_type'] = plan_prefix.str.strip()

In [11]:
# Show the distinct plan names that remain after stripping the suffixes.
pd.unique(trip_df_clean['main_subsciber_type'])

array(['Walk Up', 'Local365', '24-Hour Kiosk', 'Local30', 'Weekender',
       'Annual Membership', 'Explorer', '7-Day', 'Semester Membership',
       'Local365+Guest Pass', 'Annual', 'Founding Member',
       'ACL Weekend Pass Special', 'Local365 Youth with helmet',
       'Try Before You Buy Special', '7-Day Membership', 'Republic Rider',
       'RideScout Single Ride', '24-Hour-Online', 'Annual Member',
       'Annual Plus', 'RESTRICTED', 'Annual Pass',
       'FunFunFun Fest 3 Day Pass', 'Local365 Youth',
       'Denver B-cycle Founder',
       'Membership: pay once  one-year commitment', 'PROHIBITED',
       '24-Hour Membership'], dtype=object)

In [None]:
# Discard every trip row that has a missing value in any column.
trip_df_clean = trip_df_clean.dropna(axis='index', how='any')

In [None]:
# Index labels of rows whose year compares equal to the literal text "nan".
# NOTE(review): this only matches if year is stored as strings — confirm the dtype.
rows_to_remove = trip_df_clean.query('year == "nan"').index

# Drop those rows by label.
trip_df_clean = trip_df_clean.drop(index=rows_to_remove)

In [None]:
# Convert the calendar columns to numbers, downcasting to the smallest integer
# dtype that can hold the values.
for calendar_col in ('year', 'month'):
    trip_df_clean[calendar_col] = pd.to_numeric(
        trip_df_clean[calendar_col], downcast='integer'
    )

In [None]:
# Split start_time on whitespace once: the first token becomes `date`, the
# second replaces `start_time`.
# NOTE: this overwrites start_time, so re-running the cell without re-running
# the cells above it will not reproduce the same columns.
start_time_tokens = trip_df_clean['start_time'].str.split()
trip_df_clean['date'] = start_time_tokens.str[0]
trip_df_clean['start_time'] = start_time_tokens.str[1]

In [None]:
# Convert both station-id columns to numbers, downcasting to the smallest
# integer dtype that can hold the values.
for station_col in ('start_station_id', 'end_station_id'):
    trip_df_clean[station_col] = pd.to_numeric(
        trip_df_clean[station_col], downcast='integer'
    )

In [None]:
# Attach each day's weather to its trips by joining on `date`. merge() returns
# a new frame, so trip_df_clean itself is not modified. The join is an inner
# join (pandas' default, spelled out here): trips whose date has no weather
# row are dropped.
df_final = trip_df_clean.merge(weather_df_clean, how='inner', on='date')

In [None]:
# Draw a reproducible random subset of the merged trips.
# - random_state pins the draw: seeding the stdlib `random` module has no
#   effect on pandas, which uses NumPy's RNG.
# - the size is capped at the frame length so the cell does not raise when
#   fewer than 10000 rows survive cleaning and merging.
sample_size = min(10000, len(df_final))
df = df_final.sample(n=sample_size, random_state=30)

# Put the sampled rows back in their original order, then renumber them
# 0..n-1 (ignore_index discards the original labels).
df = df.sort_index(ascending=True, ignore_index=True)

In [None]:
# Turn bikeid into a string label: downcast to an integer dtype first, then
# convert to str (presumably so ids render without a float suffix such as
# '.0' — confirm against the raw dtype).
df['bikeid'] = pd.to_numeric(df['bikeid'], downcast='integer').astype(str)

In [None]:
# Write the sampled, merged dataset next to the inputs. index=True also writes
# the 0..n-1 row index as the first, unnamed column.
master_filepath = os.path.join(data_dir, 'austin_data.csv')

df.to_csv(path_or_buf=master_filepath, index=True)