In [4]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt

In [10]:
cabs = pd.read_csv('../Cab-Weather-Data/cab_rides.txt')
weather = pd.read_csv('../Cab-Weather-Data/weather.txt')

In [11]:
cabs = cabs.dropna(axis=0).reset_index(drop=True)

In [12]:
weather.isnull().sum()

temp             0
location         0
clouds           0
pressure         0
rain          5382
time_stamp       0
humidity         0
wind             0
dtype: int64

In [16]:
weather = weather.fillna(0)
weather = weather.drop(columns=['time_stamp'])

use average weather as we can't use time stamps

In [21]:
average_weather = weather.groupby("location").mean(numeric_only=True).reset_index()

In [22]:
average_weather

Unnamed: 0,location,temp,clouds,pressure,rain,humidity,wind
0,Back Bay,39.082122,0.678432,1008.44782,0.007925,0.764073,6.778528
1,Beacon Hill,39.047285,0.677801,1008.448356,0.008297,0.765048,6.810325
2,Boston University,39.047744,0.679235,1008.459254,0.007738,0.763786,6.69218
3,Fenway,38.964379,0.679866,1008.453289,0.007343,0.767266,6.711721
4,Financial District,39.410822,0.67673,1008.435793,0.008563,0.754837,6.860019
5,Haymarket Square,39.067897,0.676711,1008.445239,0.00866,0.764837,6.843193
6,North End,39.090841,0.67673,1008.441912,0.008644,0.764054,6.853117
7,North Station,39.035315,0.676998,1008.442811,0.008649,0.765545,6.835755
8,Northeastern University,38.975086,0.678317,1008.444168,0.007358,0.767648,6.749426
9,South Station,39.394092,0.677495,1008.438031,0.00831,0.755468,6.848948


get starting and ending weather dicts for joining

In [23]:
start_w = average_weather.rename(columns=lambda x: f"start_{x}" if x != "location" else "source")
end_w = average_weather.rename(columns=lambda x: f"end_{x}" if x != "location" else "destination")

In [24]:
start_w

Unnamed: 0,source,start_temp,start_clouds,start_pressure,start_rain,start_humidity,start_wind
0,Back Bay,39.082122,0.678432,1008.44782,0.007925,0.764073,6.778528
1,Beacon Hill,39.047285,0.677801,1008.448356,0.008297,0.765048,6.810325
2,Boston University,39.047744,0.679235,1008.459254,0.007738,0.763786,6.69218
3,Fenway,38.964379,0.679866,1008.453289,0.007343,0.767266,6.711721
4,Financial District,39.410822,0.67673,1008.435793,0.008563,0.754837,6.860019
5,Haymarket Square,39.067897,0.676711,1008.445239,0.00866,0.764837,6.843193
6,North End,39.090841,0.67673,1008.441912,0.008644,0.764054,6.853117
7,North Station,39.035315,0.676998,1008.442811,0.008649,0.765545,6.835755
8,Northeastern University,38.975086,0.678317,1008.444168,0.007358,0.767648,6.749426
9,South Station,39.394092,0.677495,1008.438031,0.00831,0.755468,6.848948


In [25]:
cabs.shape

(637976, 10)

In [26]:
cabs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 637976 entries, 0 to 637975
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   distance          637976 non-null  float64
 1   cab_type          637976 non-null  object 
 2   time_stamp        637976 non-null  int64  
 3   destination       637976 non-null  object 
 4   source            637976 non-null  object 
 5   price             637976 non-null  float64
 6   surge_multiplier  637976 non-null  float64
 7   id                637976 non-null  object 
 8   product_id        637976 non-null  object 
 9   name              637976 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 48.7+ MB


join weather data to cab fare data

In [40]:
combined_df = cabs.merge(start_w, on="source", how="left").merge(end_w, on="destination", how="left")

In [28]:
combined_df.shape

(637976, 22)

In [29]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 637976 entries, 0 to 637975
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   distance          637976 non-null  float64
 1   cab_type          637976 non-null  object 
 2   time_stamp        637976 non-null  int64  
 3   destination       637976 non-null  object 
 4   source            637976 non-null  object 
 5   price             637976 non-null  float64
 6   surge_multiplier  637976 non-null  float64
 7   id                637976 non-null  object 
 8   product_id        637976 non-null  object 
 9   name              637976 non-null  object 
 10  start_temp        637976 non-null  float64
 11  start_clouds      637976 non-null  float64
 12  start_pressure    637976 non-null  float64
 13  start_rain        637976 non-null  float64
 14  start_humidity    637976 non-null  float64
 15  start_wind        637976 non-null  float64
 16  end_temp          63

get meaningful info out of the timestamp data

In [41]:
combined_df['pickup_time'] = pd.to_datetime(combined_df['time_stamp'], unit='ms')
combined_df['hour'] = combined_df['pickup_time'].dt.hour
combined_df['day_of_week'] = combined_df['pickup_time'].dt.dayofweek
combined_df['is_weekend'] = combined_df['day_of_week'].isin([5,6]).astype(int)
combined_df['is_night'] = ((combined_df['hour'] >= 22) | (combined_df['hour'] <= 5)).astype(int)

combined_df = combined_df.drop(columns=['pickup_time', 'time_stamp', 'id', 'product_id'])

In [42]:
combined_df['cab_type'] = combined_df['cab_type'].replace({'Uber': 0, 'Lyft': 1})

  combined_df['cab_type'] = combined_df['cab_type'].replace({'Uber': 0, 'Lyft': 1})


one hot encode categorical features

In [48]:
from sklearn.preprocessing import OneHotEncoder

cat_cols = ['destination', 'source', 'name']
encoder = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)

encoded_array = encoder.fit_transform(combined_df[cat_cols])
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(cat_cols), dtype="uint8")

total_df = pd.concat([combined_df.drop(columns=cat_cols), encoded_df], axis=1)

In [49]:
simplified_df = total_df.drop(columns=['start_clouds', 'start_pressure', 'start_rain', 'start_humidity', 'end_clouds', 'end_pressure', 'end_rain', 'end_humidity'])

In [50]:
simplified_df.shape

(637976, 45)

In [51]:
simplified_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 637976 entries, 0 to 637975
Data columns (total 45 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   distance                             637976 non-null  float64
 1   cab_type                             637976 non-null  int64  
 2   price                                637976 non-null  float64
 3   surge_multiplier                     637976 non-null  float64
 4   start_temp                           637976 non-null  float64
 5   start_wind                           637976 non-null  float64
 6   end_temp                             637976 non-null  float64
 7   end_wind                             637976 non-null  float64
 8   hour                                 637976 non-null  int32  
 9   day_of_week                          637976 non-null  int32  
 10  is_weekend                           637976 non-null  int64  
 11  is_night     

## feature selection

i'm going to run a quick random forest to find the best features for our tree-based model

In [54]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=50, random_state=42)
rf.fit(simplified_df.drop(columns=['price']), simplified_df['price'])

In [55]:
imps = pd.Series(rf.feature_importances_, index=simplified_df.drop(columns=['price']).columns)
imp_features = imps[imps > 0.01].index 
# this keeps features with greater than 1% importance

tree_data_df = simplified_df[imp_features]

tree_data_df.to_csv('../processed_data/tree_data.csv', index=False, encoding='utf-8')

here, i'm doing recursive feature elimination for our linear regression

In [56]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

selector = RFE(lr, n_features_to_select=20)
selector.fit(simplified_df.drop(columns=['price']), simplified_df['price'])

In [57]:
selected_feats = simplified_df.drop(columns=['price']).columns[selector.support_]
linear_data_df = simplified_df[selected_feats]

linear_data_df.to_csv('../processed_data/linear_data.csv', index=False, encoding='utf-8')