# Analyzing Boston's Rideshare Data

In [1]:
# General imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Data pipeline imports
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer

# Load Data Set

In [2]:
df = pd.read_excel('rideshare.xls')
df.head()

Unnamed: 0,id,timestamp,hour,day,month,datetime,timezone,source,destination,cab_type,...,precipIntensityMax,uvIndexTime,temperatureMin,temperatureMinTime,temperatureMax,temperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime
0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,1544953000.0,9,16,12,2018-12-16 09:30:07,America/New_York,Haymarket Square,North Station,Lyft,...,0.1276,1544979600,39.89,1545012000,43.68,1544968800,33.73,1545012000,38.07,1544958000
1,4bd23055-6827-41c6-b23b-3c491f24e74d,1543284000.0,2,27,11,2018-11-27 02:00:23,America/New_York,Haymarket Square,North Station,Lyft,...,0.13,1543251600,40.49,1543233600,47.3,1543251600,36.2,1543291200,43.92,1543251600
2,981a3613-77af-4620-a42a-0c0866077d1e,1543367000.0,1,28,11,2018-11-28 01:00:22,America/New_York,Haymarket Square,North Station,Lyft,...,0.1064,1543338000,35.36,1543377600,47.55,1543320000,31.04,1543377600,44.12,1543320000
3,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,1543554000.0,4,30,11,2018-11-30 04:53:02,America/New_York,Haymarket Square,North Station,Lyft,...,0.0,1543507200,34.67,1543550400,45.03,1543510800,30.3,1543550400,38.53,1543510800
4,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,1543463000.0,3,29,11,2018-11-29 03:49:20,America/New_York,Haymarket Square,North Station,Lyft,...,0.0001,1543420800,33.1,1543402800,42.18,1543420800,29.11,1543392000,35.75,1543420800


## EDA

In [3]:
# Get shape of the data set
df.shape

(11999, 57)

In [4]:
# Get number of null values
df.isnull().sum()

id                               0
timestamp                        0
hour                             0
day                              0
month                            0
datetime                         0
timezone                         0
source                           0
destination                      0
cab_type                         0
product_id                       0
name                             0
price                          931
distance                         0
surge_multiplier                 0
latitude                         0
longitude                        0
temperature                      0
apparentTemperature              0
short_summary                    0
long_summary                     0
precipIntensity                  0
precipProbability                0
humidity                         0
windSpeed                        0
windGust                         0
windGustTime                     0
visibility                       0
temperatureHigh     

In [5]:
# Number of unique values in each data set
df.nunique()

id                             11999
timestamp                      10878
hour                              24
day                               17
month                              2
datetime                        9253
timezone                           1
source                            12
destination                       12
cab_type                           2
product_id                        13
name                              13
price                            101
distance                         360
surge_multiplier                   6
latitude                          11
longitude                         12
temperature                      308
apparentTemperature              319
short_summary                      9
long_summary                      11
precipIntensity                   63
precipProbability                 29
humidity                          51
windSpeed                        291
windGust                         286
windGustTime                      25
v

In [6]:
# Get column names
df.columns

Index(['id', 'timestamp', 'hour', 'day', 'month', 'datetime', 'timezone',
       'source', 'destination', 'cab_type', 'product_id', 'name', 'price',
       'distance', 'surge_multiplier', 'latitude', 'longitude', 'temperature',
       'apparentTemperature', 'short_summary', 'long_summary',
       'precipIntensity', 'precipProbability', 'humidity', 'windSpeed',
       'windGust', 'windGustTime', 'visibility', 'temperatureHigh',
       'temperatureHighTime', 'temperatureLow', 'temperatureLowTime',
       'apparentTemperatureHigh', 'apparentTemperatureHighTime',
       'apparentTemperatureLow', 'apparentTemperatureLowTime', 'icon',
       'dewPoint', 'pressure', 'windBearing', 'cloudCover', 'uvIndex',
       'visibility.1', 'ozone', 'sunriseTime', 'sunsetTime', 'moonPhase',
       'precipIntensityMax', 'uvIndexTime', 'temperatureMin',
       'temperatureMinTime', 'temperatureMax', 'temperatureMaxTime',
       'apparentTemperatureMin', 'apparentTemperatureMinTime',
       'apparentTemperat

# Data Visualization

# Preprocessing Pipeline

- Drop unneeded columns
- Encode categorical columns
- Impute price column:
    - <strong>Remove column</strong> = Losing potential data values but also means that we aren't guessing the price

In [7]:
# Define drop columns
drop = ['id', 'hour', 'day', 'month', 'timezone', 'timestamp', 'latitude', 'longitude', 'product_id', 
        'uvIndexTime', 'short_summary']

# Add all columns with Time measurements -> we only need one time measurement
for cols in df.columns:
    if "Time" in cols:
        drop.append(cols)

In [8]:
# Drop columns
df = df.drop(drop,axis=1)
df = df.dropna()

In [10]:
# Define categorical columns to process
cat_cols = ['source', 'destination', 'cab_type', 'name', 'short_summary', 'moonPhase']

# Create ColumnTransformer to apply different processing steps for categorical columns
preprocessing_steps = [('encoding', LabelEncoder(), cat_cols)]

transformer = ColumnTransformer(transformers=preprocessing_steps)

In [12]:
# Create two seperate dataframes (Uber & Lyft) for further analysis later on 

# ML Implementation

## Overview

<strong>Executive Summary:</strong> I'd like to take the various inputs using various Regression models to try and predict the price of an Uber and Lyft. Using the same variables, I'd like to create seperate regression models for these two rideshare companies to analyze how their pricing model works. By analyzing their pricing models, we can determine if one company or the other unfairly prices customers.

In [None]:
# Extract features and target variables

In [None]:
# Split dataset into test and train values

In [11]:
# Fit and transform test and train data on ColumnTransformer
transformed_data = transformer.fit_transform(X)

NameError: name 'X' is not defined