# NYC Taxi Fare & Duration
## Avg Speed

In [7]:
# Enable IPython's autoreload extension. Mode 2 re-imports all modules
# (except those excluded with %aimport) before each cell is executed, so
# edits to imported modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import wget
import pickle
import glob

sys.path.append("..")

import source.configs as configs
import preprocessing as preprocessing

# Colab detection: True only when the `google.colab` package has already been
# imported into this interpreter session.
IN_COLAB = 'google.colab' in sys.modules.keys()

Download dataset files (Avg Speed)

In [9]:
# Fetch every monthly trip file listed in configs.AVGSPEED_DATASET_URLS,
# skipping files already on disk so the cell is cheap to re-run.
avg_speed_dir = "../dataset/avg_speed"
# wget.download only treats `out` as a folder when it already exists; if it is
# missing, the path is used as the output *file name*, and the existence check
# below would never match. Create the folder up front.
os.makedirs(avg_speed_dir, exist_ok=True)

for url in configs.AVGSPEED_DATASET_URLS:
    # The file name is the last path component of the URL.
    fname = url.split("/")[-1]
    if not os.path.exists(f"{avg_speed_dir}/{fname}"):
        print(f"Downloading {fname}")
        wget.download(url, avg_speed_dir)
    else:
        print(f"{fname} already in disk. Skipping download.")

yellow_tripdata_2023-01.parquet already in disk. Skipping download.
yellow_tripdata_2023-05.parquet already in disk. Skipping download.
yellow_tripdata_2023-09.parquet already in disk. Skipping download.


In [10]:
# Load every downloaded monthly parquet file and stack them into one frame.
# glob returns paths in arbitrary, OS-dependent order; sorting keeps the row
# order of the concatenated frame (and its fresh 0..n-1 index) reproducible.
files_list = sorted(glob.glob("../dataset/avg_speed/*.parquet"))
if not files_list:
    # Fail early with an actionable message instead of pandas'
    # "No objects to concatenate" error further down.
    raise FileNotFoundError(
        "No parquet files found in ../dataset/avg_speed; run the download cell first."
    )
dataset_list = []
for file in files_list:
    print(f"Reading {file}")
    df_month = pd.read_parquet(file)
    dataset_list.append(df_month)
print("Pandas concat for dataframes...")
# ignore_index=True discards the per-file indexes and renumbers the rows.
dataset = pd.concat(dataset_list, axis=0, ignore_index=True)
print("Done!")

Reading ../dataset/avg_speed\yellow_tripdata_2023-01.parquet
Reading ../dataset/avg_speed\yellow_tripdata_2023-05.parquet
Reading ../dataset/avg_speed\yellow_tripdata_2023-09.parquet
Pandas concat for dataframes...
Done!


In [11]:
# Run the project's preprocessing helpers in sequence: targets, outlier
# handling (the "delete" strategy), then feature engineering.
print("Add targets")
dataset = preprocessing.add_targets(dataset)
print("Process outliers")
dataset = preprocessing.process_outliers(dataset, "delete")
print("Add features")
# add_features returns the enriched frame together with avg_speed_dict
# (the printed output shows integer keys 0-23 — presumably hour of day
# mapped to average speed; confirm against preprocessing.add_features).
dataset, avg_speed_dict = preprocessing.add_features(dataset)
print(avg_speed_dict)
# Save avg speed model feature
# Use a context manager so the file is flushed and closed deterministically
# instead of relying on garbage collection of an anonymous file handle.
with open("avg_speed_dict.model", "wb") as model_file:
    pickle.dump(avg_speed_dict, model_file)

Add targets
Process outliers
Add features
Building average speed dictionary
{0: 0.25195469322838193, 1: 0.2503693835995309, 2: 0.24999335352944238, 3: 0.26612218282651856, 4: 0.31345897957385943, 5: 0.3355459231509819, 6: 0.27443853508122146, 7: 0.21423070424047383, 8: 0.18310836751240536, 9: 0.1801333123203435, 10: 0.17675564690190834, 11: 0.16859408041651547, 12: 0.16663810140450216, 13: 0.16766048469717576, 14: 0.16282710598596348, 15: 0.15792099847450106, 16: 0.16202178873469653, 17: 0.16106102095237498, 18: 0.16875892761499706, 19: 0.18793062365816957, 20: 0.207260086062618, 21: 0.21805265270227106, 22: 0.22578244300596678, 23: 0.24366564777182562}


In [12]:
# Preview the first five rows of the processed frame (rich display as the
# cell's last expression).
dataset.head(n=5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,total_amount,congestion_surcharge,airport_fee,Airport_fee,trip_duration,hour_of_day,rush_hour,day_of_week,trip_d2,avg_speed
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,...,14.3,2.5,0.0,,8.433333,0,0,6,0.9409,0.11502
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,...,16.9,2.5,0.0,,6.316667,0,0,6,1.21,0.174142
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,...,34.9,2.5,0.0,,12.75,0,0,6,6.3001,0.196863
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,...,19.68,2.5,0.0,,10.833333,0,0,6,2.0449,0.132
5,2,2023-01-01 00:50:34,2023-01-01 01:02:52,1.0,1.84,1.0,N,161,137,1,...,27.8,2.5,0.0,,12.3,0,0,6,3.3856,0.149593
