# Data wrangling 

Dataset: https://www.kaggle.com/datasets/anandaramg/taxi-trip-data-nyc

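Info (data dictionary): https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf — note that this dataset uses the green-taxi (LPEP) schema (`lpep_*` timestamps, `ehail_fee`, `trip_type`), which is documented in https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_green.pdf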

In [1]:
# Print the Python interpreter version found on the shell PATH, for reproducibility.
!python --version

Python 3.9.7


In [2]:
import pandas as pd
import numpy as np
from datetime import date, datetime, timedelta
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
import seaborn as sns

# Optional quick EDA with pandas-profiling. Kept as plain comments (instead of a
# throwaway string literal) and with the report variable named consistently:
#   %pip install pandas-profiling
#   %pip install markupsafe==2.0.1
#   from pandas_profiling import ProfileReport
#   profile = ProfileReport(df_taxi)
#   profile
#   profile.to_file(output_file='output.html')

# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas deprecation / chained-assignment warnings. Consider narrowing
# the filter to the specific category that is noisy.
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# The autoreload magics below are currently disabled.
# %load_ext autoreload
# %autoreload 2
# Execute src.py; the helper functions called in the cells below
# (renombrar_columnas, df_filter, TLC_Zone, ...) are not defined in this
# notebook, so they are expected to come from that script.
%run src.py

In [4]:
# Load the raw trip records (path is relative to the notebook's working
# directory) and preview the first rows.
df_taxi=pd.read_csv('dataset/taxi_tripdata.csv')
df_taxi.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,1.0,2021-07-01 00:30:52,2021-07-01 00:35:36,N,1.0,74,168,1.0,1.2,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2.0,1.0,0.0
1,2.0,2021-07-01 00:25:36,2021-07-01 01:01:31,N,1.0,116,265,2.0,13.69,42.0,0.5,0.5,0.0,0.0,,0.3,43.3,2.0,1.0,0.0
2,2.0,2021-07-01 00:05:58,2021-07-01 00:12:00,N,1.0,97,33,1.0,0.95,6.5,0.5,0.5,2.34,0.0,,0.3,10.14,1.0,1.0,0.0
3,2.0,2021-07-01 00:41:40,2021-07-01 00:47:23,N,1.0,74,42,1.0,1.24,6.5,0.5,0.5,0.0,0.0,,0.3,7.8,2.0,1.0,0.0
4,2.0,2021-07-01 00:51:32,2021-07-01 00:58:46,N,1.0,42,244,1.0,1.1,7.0,0.5,0.5,0.0,0.0,,0.3,8.3,2.0,1.0,0.0


### DATA WRANGLING

In [5]:
# Rename a few columns: normalise the vendor id casing, drop the `lpep_`
# prefix from the timestamps and make the distance unit part of the name.
rename_columns=dict(
    VendorID='vendorID',
    lpep_pickup_datetime='pickup_datetime',
    lpep_dropoff_datetime='dropoff_datetime',
    trip_distance='trip_distance_miles',
)

# Helper from src.py; its result is displayed as the cell output.
renombrar_columnas(df_taxi,rename_columns)

Unnamed: 0,vendorID,pickup_datetime,dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance_miles,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,1.0,2021-07-01 00:30:52,2021-07-01 00:35:36,N,1.0,74,168,1.0,1.20,6.00,0.50,0.5,0.00,0.00,,0.3,7.30,2.0,1.0,0.0
1,2.0,2021-07-01 00:25:36,2021-07-01 01:01:31,N,1.0,116,265,2.0,13.69,42.00,0.50,0.5,0.00,0.00,,0.3,43.30,2.0,1.0,0.0
2,2.0,2021-07-01 00:05:58,2021-07-01 00:12:00,N,1.0,97,33,1.0,0.95,6.50,0.50,0.5,2.34,0.00,,0.3,10.14,1.0,1.0,0.0
3,2.0,2021-07-01 00:41:40,2021-07-01 00:47:23,N,1.0,74,42,1.0,1.24,6.50,0.50,0.5,0.00,0.00,,0.3,7.80,2.0,1.0,0.0
4,2.0,2021-07-01 00:51:32,2021-07-01 00:58:46,N,1.0,42,244,1.0,1.10,7.00,0.50,0.5,0.00,0.00,,0.3,8.30,2.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83686,,2021-07-02 07:59:00,2021-07-02 08:33:00,,,218,169,,18.04,50.24,2.75,0.0,0.00,6.55,,0.3,59.84,,,
83687,,2021-07-02 07:02:00,2021-07-02 07:18:00,,,74,137,,5.56,19.16,0.00,0.0,3.66,0.00,,0.3,25.87,,,
83688,,2021-07-02 07:53:00,2021-07-02 08:15:00,,,69,75,,5.13,22.45,0.00,0.0,0.00,0.00,,0.3,22.75,,,
83689,,2021-07-02 07:58:00,2021-07-02 08:30:00,,,117,82,,12.58,48.62,2.75,0.0,0.00,2.45,,0.3,54.12,,,


In [6]:
# Drop the columns that are not relevant for this analysis (fare breakdown,
# surcharges and flags); `total_amount` is kept.
# The result is not assigned, yet later cells no longer show these columns, so
# the helper (src.py) appears to modify df_taxi in place — confirm in src.py.
delete_columns=['ehail_fee',
                'store_and_fwd_flag',
                'fare_amount','extra',
                'mta_tax','tip_amount',
                'tolls_amount',
                'improvement_surcharge',
                'congestion_surcharge',
                'trip_type']
eliminar_columnas(df_taxi,delete_columns)

Unnamed: 0,vendorID,pickup_datetime,dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance_miles,total_amount,payment_type
0,1.0,2021-07-01 00:30:52,2021-07-01 00:35:36,1.0,74,168,1.0,1.20,7.30,2.0
1,2.0,2021-07-01 00:25:36,2021-07-01 01:01:31,1.0,116,265,2.0,13.69,43.30,2.0
2,2.0,2021-07-01 00:05:58,2021-07-01 00:12:00,1.0,97,33,1.0,0.95,10.14,1.0
3,2.0,2021-07-01 00:41:40,2021-07-01 00:47:23,1.0,74,42,1.0,1.24,7.80,2.0
4,2.0,2021-07-01 00:51:32,2021-07-01 00:58:46,1.0,42,244,1.0,1.10,8.30,2.0
...,...,...,...,...,...,...,...,...,...,...
83686,,2021-07-02 07:59:00,2021-07-02 08:33:00,,218,169,,18.04,59.84,
83687,,2021-07-02 07:02:00,2021-07-02 07:18:00,,74,137,,5.56,25.87,
83688,,2021-07-02 07:53:00,2021-07-02 08:15:00,,69,75,,5.13,22.75,
83689,,2021-07-02 07:58:00,2021-07-02 08:30:00,,117,82,,12.58,54.12,


In [7]:
# Add a new column `trip_distance_km` with the distance in kilometres,
# derived from `trip_distance_miles` (helper from src.py).
miles_to_km(df_taxi,'trip_distance_km','trip_distance_miles')

Unnamed: 0,vendorID,pickup_datetime,dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance_miles,total_amount,payment_type,trip_distance_km
0,1.0,2021-07-01 00:30:52,2021-07-01 00:35:36,1.0,74,168,1.0,1.20,7.30,2.0,1.931
1,2.0,2021-07-01 00:25:36,2021-07-01 01:01:31,1.0,116,265,2.0,13.69,43.30,2.0,22.032
2,2.0,2021-07-01 00:05:58,2021-07-01 00:12:00,1.0,97,33,1.0,0.95,10.14,1.0,1.529
3,2.0,2021-07-01 00:41:40,2021-07-01 00:47:23,1.0,74,42,1.0,1.24,7.80,2.0,1.996
4,2.0,2021-07-01 00:51:32,2021-07-01 00:58:46,1.0,42,244,1.0,1.10,8.30,2.0,1.770
...,...,...,...,...,...,...,...,...,...,...,...
83686,,2021-07-02 07:59:00,2021-07-02 08:33:00,,218,169,,18.04,59.84,,29.032
83687,,2021-07-02 07:02:00,2021-07-02 07:18:00,,74,137,,5.56,25.87,,8.948
83688,,2021-07-02 07:53:00,2021-07-02 08:15:00,,69,75,,5.13,22.75,,8.256
83689,,2021-07-02 07:58:00,2021-07-02 08:30:00,,117,82,,12.58,54.12,,20.245


In [8]:
# Convert the pickup/dropoff timestamp columns to dtype datetime64[ns]
# (helper from src.py; the cell output is the resulting DataFrame.info()).
lista_datetime=['pickup_datetime','dropoff_datetime']
dtype_datetime64(df_taxi,lista_datetime)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83691 entries, 0 to 83690
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   vendorID             51173 non-null  float64       
 1   pickup_datetime      83691 non-null  datetime64[ns]
 2   dropoff_datetime     83691 non-null  datetime64[ns]
 3   RatecodeID           51173 non-null  float64       
 4   PULocationID         83691 non-null  int64         
 5   DOLocationID         83691 non-null  int64         
 6   passenger_count      51173 non-null  float64       
 7   trip_distance_miles  83691 non-null  float64       
 8   total_amount         83691 non-null  float64       
 9   payment_type         51173 non-null  float64       
 10  trip_distance_km     83691 non-null  float64       
dtypes: datetime64[ns](2), float64(7), int64(2)
memory usage: 7.0 MB


In [9]:
# Drop rows that contain null values (helper from src.py).
# The helper prints a before/after report; check it, because the number of
# rows removed by this step is large.
eliminar_nulos(df_taxi)

¡Hay nulos en tu DataSet! Inicialmente esta compuesto por:
83691 filas y 11 columnas
Se han encontrado un total de 130072 nulos
 
 Eliminando... 
 
Tras eliminar los nulos hay un total de 0 nulos. 

El DataSet sin nullos ahora esta compuesto por:
51173 filas y 11 columnas


In [10]:
# Remove duplicated rows, if any (helper from src.py; it prints a message
# reporting whether duplicates were found).
eliminar_duplicados(df_taxi)

¡Enhorabuena! No hay duplicados en el DataFrame


In [11]:
# Derive calendar columns from the pickup timestamp. As the displayed result
# shows, four columns are added: `year`, `month`, `day` and `hour`.
# `day` holds a weekday number, not the day of the month (2021-07-01, a
# Thursday, is shown as 3).
columns_year_month_weekday(df_taxi,'pickup_datetime')

Unnamed: 0,vendorID,pickup_datetime,dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance_miles,total_amount,payment_type,trip_distance_km,year,month,day,hour
0,1.0,2021-07-01 00:30:52,2021-07-01 00:35:36,1.0,74,168,1.0,1.2,7.3,2.0,1.931,2021,7,3,0
1,2.0,2021-07-01 00:25:36,2021-07-01 01:01:31,1.0,116,265,2.0,13.69,43.3,2.0,22.032,2021,7,3,0
2,2.0,2021-07-01 00:05:58,2021-07-01 00:12:00,1.0,97,33,1.0,0.95,10.14,1.0,1.529,2021,7,3,0
3,2.0,2021-07-01 00:41:40,2021-07-01 00:47:23,1.0,74,42,1.0,1.24,7.8,2.0,1.996,2021,7,3,0
4,2.0,2021-07-01 00:51:32,2021-07-01 00:58:46,1.0,42,244,1.0,1.1,8.3,2.0,1.77,2021,7,3,0


In [13]:
# Keep only trips from year 2021 and month 7 (July). df_filter also applies
# some additional conditions — see df_filter() in src.py.
df_taxi=df_filter(df_taxi,2021,7)
df_taxi

Unnamed: 0,vendorID,pickup_datetime,dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance_miles,total_amount,payment_type,trip_distance_km,year,month,day,hour
0,1.0,2021-07-01 00:30:52,2021-07-01 00:35:36,1.0,74,168,1.0,1.20,7.30,2.0,1.931,2021,7,3,0
1,2.0,2021-07-01 00:25:36,2021-07-01 01:01:31,1.0,116,265,2.0,13.69,43.30,2.0,22.032,2021,7,3,0
2,2.0,2021-07-01 00:05:58,2021-07-01 00:12:00,1.0,97,33,1.0,0.95,10.14,1.0,1.529,2021,7,3,0
3,2.0,2021-07-01 00:41:40,2021-07-01 00:47:23,1.0,74,42,1.0,1.24,7.80,2.0,1.996,2021,7,3,0
4,2.0,2021-07-01 00:51:32,2021-07-01 00:58:46,1.0,42,244,1.0,1.10,8.30,2.0,1.770,2021,7,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51168,2.0,2021-07-31 22:58:03,2021-07-31 23:16:59,1.0,41,151,1.0,1.00,8.16,1.0,1.609,2021,7,5,22
51169,2.0,2021-07-31 23:24:30,2021-07-31 23:36:26,1.0,74,151,1.0,2.28,14.75,1.0,3.669,2021,7,5,23
51170,2.0,2021-07-31 23:42:19,2021-07-31 23:43:40,1.0,166,24,1.0,0.21,4.30,2.0,0.338,2021,7,5,23
51171,2.0,2021-07-31 23:02:05,2021-07-31 23:20:49,1.0,223,112,1.0,4.09,21.62,1.0,6.582,2021,7,5,23


In [14]:
# Reset the index after the filtering step (helper from src.py).
# NOTE(review): the meaning of the 'index' argument is not visible here —
# presumably the leftover column to discard after the reset; confirm in src.py.
df_taxi=reset_index(df_taxi,'index')

In [15]:
# Create column `trip_time_h`: the trip duration in hours between pickup and
# dropoff.
# NOTE(review): the helper receives the target column name AND its return value
# is assigned to that same column — verify in src.py that it returns the
# duration values (not the whole DataFrame or None).
df_taxi['trip_time_h']=trip_time_h(df_taxi,'trip_time_h','dropoff_datetime','pickup_datetime')

In [16]:
# Create column `average_speed_kmh`: the average speed of the trip, computed
# by the src.py helper from `trip_distance_km` and `trip_time_h`.
average_speed_kmh(df_taxi,'average_speed_kmh','trip_distance_km','trip_time_h')

Unnamed: 0,vendorID,pickup_datetime,dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance_miles,total_amount,payment_type,trip_distance_km,year,month,day,hour,trip_time_h,average_speed_kmh
0,1.0,2021-07-01 00:30:52,2021-07-01 00:35:36,1.0,74,168,1.0,1.2,7.3,2.0,1.931,2021,7,3,0,0.079,24.4
1,2.0,2021-07-01 00:25:36,2021-07-01 01:01:31,1.0,116,265,2.0,13.69,43.3,2.0,22.032,2021,7,3,0,0.599,36.8
2,2.0,2021-07-01 00:05:58,2021-07-01 00:12:00,1.0,97,33,1.0,0.95,10.14,1.0,1.529,2021,7,3,0,0.101,15.1
3,2.0,2021-07-01 00:41:40,2021-07-01 00:47:23,1.0,74,42,1.0,1.24,7.8,2.0,1.996,2021,7,3,0,0.095,21.0
4,2.0,2021-07-01 00:51:32,2021-07-01 00:58:46,1.0,42,244,1.0,1.1,8.3,2.0,1.77,2021,7,3,0,0.121,14.6


In [17]:
# Filter out records that are not physically plausible:
# minimum trip time 30 s (= 0.0083 h) and average speed between 10 and 200 km/h.
# NOTE(review): the argument order is assumed to be (min speed, max speed,
# min trip time) — confirm against df_filter2() in src.py.
df_taxi=df_filter2(df_taxi,10,200,0.0083)
df_taxi

Unnamed: 0,vendorID,pickup_datetime,dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance_miles,total_amount,payment_type,trip_distance_km,year,month,day,hour,trip_time_h,average_speed_kmh
0,1.0,2021-07-01 00:30:52,2021-07-01 00:35:36,1.0,74,168,1.0,1.20,7.30,2.0,1.931,2021,7,3,0,0.079,24.4
1,2.0,2021-07-01 00:25:36,2021-07-01 01:01:31,1.0,116,265,2.0,13.69,43.30,2.0,22.032,2021,7,3,0,0.599,36.8
2,2.0,2021-07-01 00:05:58,2021-07-01 00:12:00,1.0,97,33,1.0,0.95,10.14,1.0,1.529,2021,7,3,0,0.101,15.1
3,2.0,2021-07-01 00:41:40,2021-07-01 00:47:23,1.0,74,42,1.0,1.24,7.80,2.0,1.996,2021,7,3,0,0.095,21.0
4,2.0,2021-07-01 00:51:32,2021-07-01 00:58:46,1.0,42,244,1.0,1.10,8.30,2.0,1.770,2021,7,3,0,0.121,14.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45570,2.0,2021-07-31 22:58:03,2021-07-31 23:06:43,1.0,41,151,1.0,1.48,11.16,1.0,2.382,2021,7,5,22,0.144,16.5
45572,2.0,2021-07-31 23:24:30,2021-07-31 23:36:26,1.0,74,151,1.0,2.28,14.75,1.0,3.669,2021,7,5,23,0.199,18.4
45573,2.0,2021-07-31 23:42:19,2021-07-31 23:43:40,1.0,166,24,1.0,0.21,4.30,2.0,0.338,2021,7,5,23,0.022,15.4
45574,2.0,2021-07-31 23:02:05,2021-07-31 23:20:49,1.0,223,112,1.0,4.09,21.62,1.0,6.582,2021,7,5,23,0.312,21.1


In [18]:
# Reset the index again after the second filter so that it is contiguous
# (helper from src.py; see reset_index() there for the 'index' argument).
df_taxi=reset_index(df_taxi,'index')

In [19]:
# Turn the weekday number stored in `day` into its English name
# (0 = Monday ... 6 = Sunday).
dict_day=dict(enumerate(['Monday','Tuesday','Wednesday','Thursday',
                         'Friday','Saturday','Sunday']))

day_replace(df_taxi,'day',dict_day)

0        Thursday
1        Thursday
2        Thursday
3        Thursday
4        Thursday
           ...   
42780    Saturday
42781    Saturday
42782    Saturday
42783    Saturday
42784    Saturday
Name: day, Length: 42785, dtype: object

In [20]:
# Turn the month number stored in `month` into its English name (1 = January
# ... 12 = December). The label for month 10 was misspelled ('Octuber'); it is
# now 'October'.
dict_month={1:'January',
            2:'February',
            3:'March',
            4:'April',
            5:'May',
            6:'June',
            7:'July',
            8:'August',
            9:'September',
            10:'October',
            11:'November',
            12:'December'}

month_replace(df_taxi,'month',dict_month)

0        July
1        July
2        July
3        July
4        July
         ... 
42780    July
42781    July
42782    July
42783    July
42784    July
Name: month, Length: 42785, dtype: object

In [21]:
# Cast the code columns (loaded as float64) to integer dtype
# (helper from src.py; the cell output is the resulting DataFrame.info()).
int_list=['vendorID','RatecodeID',
          'passenger_count',
          'payment_type']
dtype_int(df_taxi,int_list)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42785 entries, 0 to 42784
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   vendorID             42785 non-null  int64         
 1   pickup_datetime      42785 non-null  datetime64[ns]
 2   dropoff_datetime     42785 non-null  datetime64[ns]
 3   RatecodeID           42785 non-null  int64         
 4   PULocationID         42785 non-null  int64         
 5   DOLocationID         42785 non-null  int64         
 6   passenger_count      42785 non-null  int64         
 7   trip_distance_miles  42785 non-null  float64       
 8   total_amount         42785 non-null  float64       
 9   payment_type         42785 non-null  int64         
 10  trip_distance_km     42785 non-null  float64       
 11  year                 42785 non-null  int64         
 12  month                42785 non-null  object        
 13  day                  42785 non-

In [22]:
# Replace the numeric payment_type code with a readable label, following the
# TLC trip-record data dictionary (1 = credit card, 2 = cash, 3 = no charge,
# 4 = dispute, 5 = unknown, 6 = voided trip).
# Code 3 was previously labelled 'No_change'; the dictionary defines it as
# "No charge", so the label is now 'No_charge'.
dict_payment_type={1:'Credit_Card',
                   2:'Cash',
                   3:'No_charge',
                   4:'Dispute',
                   5:'Unknown',
                   6:'Voided_Trip'}
payment_type_replace(df_taxi,'payment_type',dict_payment_type)

0               Cash
1               Cash
2        Credit_Card
3               Cash
4               Cash
            ...     
42780    Credit_Card
42781    Credit_Card
42782           Cash
42783    Credit_Card
42784    Credit_Card
Name: payment_type, Length: 42785, dtype: object

In [23]:
# Replace the numeric vendor code with the name of the company
# (helper from src.py).
vendorID_type={1:'Creative Mobile Technologies',
               2:'VeriFone Inc'}
vendorID_replace(df_taxi,'vendorID',vendorID_type)

0        Creative Mobile Technologies
1                        VeriFone Inc
2                        VeriFone Inc
3                        VeriFone Inc
4                        VeriFone Inc
                     ...             
42780                    VeriFone Inc
42781                    VeriFone Inc
42782                    VeriFone Inc
42783                    VeriFone Inc
42784                    VeriFone Inc
Name: vendorID, Length: 42785, dtype: object

In [24]:
# Replace the numeric rate code with the name of the applied fare type
# (helper from src.py).
RateCodeID_type={1:'Standard_rate',
                 2:'JFK',
                 3:'Newark',
                 4:'Nassau_or_Westchester',
                 5:'Negotiated_fare',
                 6:'Group_ride'}
RatecodeID_replace(df_taxi,'RatecodeID',RateCodeID_type)

0          Standard_rate
1          Standard_rate
2          Standard_rate
3          Standard_rate
4          Standard_rate
              ...       
42780      Standard_rate
42781      Standard_rate
42782      Standard_rate
42783      Standard_rate
42784    Negotiated_fare
Name: RatecodeID, Length: 42785, dtype: object

In [25]:
# TLC Taxi Zones: LocationID values grouped by borough.
# Together the five lists cover every LocationID from 2 to 263 exactly once
# (the duplicated 114 that used to appear twice in TLC_Manhattan was removed).
# IDs that are in none of the lists (e.g. 265, which occurs in this dataset)
# are not assigned to any borough; per the TLC taxi zone lookup table, 1 is
# Newark Airport (EWR) and 264/265 are the unknown / N/A codes.
TLC_Bronx=[200,240,259,254,81,51,184,46,183,58,
           208,213,250,182,3,185,242,32,31,199,
           126,168,147,159,167,247,69,119,235,
           169,47,59,60,212,248,20,94,136,18,241,
           174,220,78]

TLC_Brooklyn=[55,29,150,154,210,108,123,149,21,11,
              22,67,14,228,227,26,178,165,155,91,
              89,133,111,39,222,76,63,77,35,72,71,
              85,188,62,190,257,181,195,106,40,54,
              52,25,33,65,66,34,49,97,189,61,177,225,
              37,36,80,112,255,256,217,17]

TLC_Manhattan=[153,128,127,243,120,244,116,42,152,166,
               41,74,194,24,151,238,75,262,263,236,43,
               239,143,142,237,141,140,202,50,48,163,
               230,161,162,229,233,170,164,100,246,68,
               186,90,234,107,137,224,4,79,113,114,249,
               158,125,211,144,148,232,231,45,209,
               87,13,261,12,88,104,103,105]

TLC_Queens=[27,201,117,86,2,30,132,124,180,216,10,218,
            219,203,139,205,38,258,197,130,215,122,191,
            19,101,64,175,98,131,28,134,96,102,198,160,
            157,226,145,193,146,7,179,8,223,207,260,83,
            82,196,95,93,135,121,192,73,9,16,15,171,129,
            138,70,173,56,57,253,92,53,252]

TLC_Staten_Island=[44,204,84,5,99,23,118,109,110,176,172,
                   214,6,221,115,245,251,187,156,206]


In [26]:
# Create `PULocation`: the borough name assigned to each pickup location ID
# (`PULocationID`), using the five borough ID lists (helper from src.py).
TLC_Zone(df_taxi,'PULocationID','PULocation',TLC_Bronx,TLC_Brooklyn,TLC_Manhattan,TLC_Queens,TLC_Staten_Island)

0        Manhattan
1        Manhattan
2         Brooklyn
3        Manhattan
4        Manhattan
           ...    
42780    Manhattan
42781    Manhattan
42782    Manhattan
42783       Queens
42784       Queens
Name: PULocation, Length: 42785, dtype: object

In [27]:
# Create `DOLocation`: the borough name assigned to each dropoff location ID
# (`DOLocationID`), using the five borough ID lists (helper from src.py).
# As the displayed result shows, IDs found in none of the lists are labelled
# 'Unknown'.
TLC_Zone(df_taxi,'DOLocationID','DOLocation',TLC_Bronx,TLC_Brooklyn,TLC_Manhattan,TLC_Queens,TLC_Staten_Island)

0            Bronx
1          Unknown
2         Brooklyn
3        Manhattan
4        Manhattan
           ...    
42780    Manhattan
42781    Manhattan
42782    Manhattan
42783     Brooklyn
42784       Queens
Name: DOLocation, Length: 42785, dtype: object

In [28]:
# Create `trip_description` with the route of the trip, formatted as
# '<PULocation>-<DOLocation>' (helper from src.py).
sum_strings(df_taxi,'trip_description','PULocation','DOLocation')

0            Manhattan-Bronx
1          Manhattan-Unknown
2          Brooklyn-Brooklyn
3        Manhattan-Manhattan
4        Manhattan-Manhattan
                ...         
42780    Manhattan-Manhattan
42781    Manhattan-Manhattan
42782    Manhattan-Manhattan
42783        Queens-Brooklyn
42784          Queens-Queens
Name: trip_description, Length: 42785, dtype: object

In [29]:
# Final visual check of the wrangled DataFrame before saving it.
df_taxi

Unnamed: 0,vendorID,pickup_datetime,dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance_miles,total_amount,payment_type,trip_distance_km,year,month,day,hour,trip_time_h,average_speed_kmh,PULocation,DOLocation,trip_description
0,Creative Mobile Technologies,2021-07-01 00:30:52,2021-07-01 00:35:36,Standard_rate,74,168,1,1.20,7.30,Cash,1.931,2021,July,Thursday,0,0.079,24.4,Manhattan,Bronx,Manhattan-Bronx
1,VeriFone Inc,2021-07-01 00:25:36,2021-07-01 01:01:31,Standard_rate,116,265,2,13.69,43.30,Cash,22.032,2021,July,Thursday,0,0.599,36.8,Manhattan,Unknown,Manhattan-Unknown
2,VeriFone Inc,2021-07-01 00:05:58,2021-07-01 00:12:00,Standard_rate,97,33,1,0.95,10.14,Credit_Card,1.529,2021,July,Thursday,0,0.101,15.1,Brooklyn,Brooklyn,Brooklyn-Brooklyn
3,VeriFone Inc,2021-07-01 00:41:40,2021-07-01 00:47:23,Standard_rate,74,42,1,1.24,7.80,Cash,1.996,2021,July,Thursday,0,0.095,21.0,Manhattan,Manhattan,Manhattan-Manhattan
4,VeriFone Inc,2021-07-01 00:51:32,2021-07-01 00:58:46,Standard_rate,42,244,1,1.10,8.30,Cash,1.770,2021,July,Thursday,0,0.121,14.6,Manhattan,Manhattan,Manhattan-Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42780,VeriFone Inc,2021-07-31 22:58:03,2021-07-31 23:06:43,Standard_rate,41,151,1,1.48,11.16,Credit_Card,2.382,2021,July,Saturday,22,0.144,16.5,Manhattan,Manhattan,Manhattan-Manhattan
42781,VeriFone Inc,2021-07-31 23:24:30,2021-07-31 23:36:26,Standard_rate,74,151,1,2.28,14.75,Credit_Card,3.669,2021,July,Saturday,23,0.199,18.4,Manhattan,Manhattan,Manhattan-Manhattan
42782,VeriFone Inc,2021-07-31 23:42:19,2021-07-31 23:43:40,Standard_rate,166,24,1,0.21,4.30,Cash,0.338,2021,July,Saturday,23,0.022,15.4,Manhattan,Manhattan,Manhattan-Manhattan
42783,VeriFone Inc,2021-07-31 23:02:05,2021-07-31 23:20:49,Standard_rate,223,112,1,4.09,21.62,Credit_Card,6.582,2021,July,Saturday,23,0.312,21.1,Queens,Brooklyn,Queens-Brooklyn


In [30]:
# Persist the wrangled dataset for the next stage of the analysis.
# NOTE(review): to_csv writes the DataFrame index by default, so the file gets
# an extra unnamed first column ('Unnamed: 0' when read back with read_csv).
# Pass index=False here if no downstream reader relies on that column.
df_taxi.to_csv('dataset/taxi_data_wrangling.csv')