# <center> NYC-Taxi-Trip Feature Engineering </center>

In [2]:
#load libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [3]:
#setup
df = sns.load_dataset("taxis").copy()

df['pickup'] = pd.to_datetime(df['pickup'])
df['dropoff'] = pd.to_datetime(df['dropoff'])
df = df.dropna(subset=['pickup','dropoff','fare','distance','passengers'])


### Time-Based Features

In [4]:
#Hour
df['hour'] = df['pickup'].dt.hour

#Impact: correlates with fare, tip, surge periods, customer type.

In [5]:
#Day of week
df['weekday'] = df['pickup'].dt.weekday  # 0=Mon, 6=Sun
df['weekday_name'] = df['pickup'].dt.day_name()

#Impact: weekdays behave very differently from weekends.

In [6]:
#Is weekend
df['is_weekend'] = df['weekday'].isin([5,6]).astype(int)

In [7]:
#Month / Day

df['month'] = df['pickup'].dt.month
df['day'] = df['pickup'].dt.day

In [8]:
#Trip hour bucket (categorical)
def hour_bucket(h):
    if 5 <= h <= 11:
        return 'morning'
    if 12 <= h <= 16:
        return 'afternoon'
    if 17 <= h <= 21:
        return 'evening'
    return 'night'

df['hour_bucket'] = df['hour'].apply(hour_bucket)


### Trip-Based Features

In [9]:
#Trip duration (minutes)
df['duration_min'] = (df['dropoff'] - df['pickup']).dt.total_seconds() / 60
df = df[df['duration_min'] > 0]  # remove corrupt rows

In [10]:
#Average speed (mph)
df['speed_mph'] = df['distance'] / (df['duration_min'] / 60)
df['speed_mph'].replace([np.inf, -np.inf], np.nan, inplace=True)
df = df.dropna(subset=['speed_mph'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['speed_mph'].replace([np.inf, -np.inf], np.nan, inplace=True)


In [12]:
#Price per mile
df['fare_per_mile'] = df['fare'] / df['distance']
df['fare_per_mile'].replace([np.inf, -np.inf], np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['fare_per_mile'].replace([np.inf, -np.inf], np.nan, inplace=True)


In [13]:
#Trip efficiency (normalized)
df['efficiency'] = df['distance'] / df['duration_min']

### Fare & Tip Features

In [14]:
#Tip percentage (important target proxy)
df['tip_pct'] = df['tip'] / df['fare']
df['tip_pct'] = df['tip_pct'].clip(0, 1)   # remove unrealistic >100% tips

In [15]:
#Total paid by customer
df['total_paid'] = df['fare'] + df['tip'] + df['tolls']

In [16]:
#Has tip (binary)
df['tipped'] = (df['tip'] > 0).astype(int)


### Categorical Encodings

In [18]:
#Payment type (cash/credit)
df = pd.get_dummies(df, columns=['payment'], prefix='pay', drop_first=True)


In [19]:
#Hour bucket
df = pd.get_dummies(df, columns=['hour_bucket'], drop_first=True)

In [21]:
#Pickup/Dropoff Zones
top_zones = df['pickup_zone'].value_counts().nlargest(15).index
df['pickup_zone_top'] = df['pickup_zone'].apply(lambda x: x if x in top_zones else 'Other')

top_drop = df['dropoff_zone'].value_counts().nlargest(15).index
df['dropoff_zone_top'] = df['dropoff_zone'].apply(lambda x: x if x in top_drop else 'Other')

df = pd.get_dummies(df, columns=['pickup_zone_top','dropoff_zone_top'], drop_first=True)


### Outlier Treatment

In [22]:
#Duration outliers
df = df[df['duration_min'] < 120]  # remove 2+ hour trips

In [23]:
#Speed sanity check
df = df[(df['speed_mph'] > 1) & (df['speed_mph'] < 80)]


In [24]:
#Fare sanity
df = df[(df['fare'] > 0) & (df['fare'] < 250)]


In [27]:
df.shape
df.head()


Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,tolls,total,color,pickup_zone,...,dropoff_zone_top_Midtown North,dropoff_zone_top_Murray Hill,dropoff_zone_top_Other,dropoff_zone_top_Penn Station/Madison Sq West,dropoff_zone_top_Times Sq/Theatre District,dropoff_zone_top_Union Sq,dropoff_zone_top_Upper East Side North,dropoff_zone_top_Upper East Side South,dropoff_zone_top_Upper West Side North,dropoff_zone_top_Upper West Side South
0,2019-03-23 20:21:09,2019-03-23 20:27:24,1,1.6,7.0,2.15,0.0,12.95,yellow,Lenox Hill West,...,False,False,True,False,False,False,False,False,False,False
1,2019-03-04 16:11:55,2019-03-04 16:19:00,1,0.79,5.0,0.0,0.0,9.3,yellow,Upper West Side South,...,False,False,False,False,False,False,False,False,False,True
2,2019-03-27 17:53:01,2019-03-27 18:00:25,1,1.37,7.5,2.36,0.0,14.16,yellow,Alphabet City,...,False,False,True,False,False,False,False,False,False,False
3,2019-03-10 01:23:59,2019-03-10 01:49:51,1,7.7,27.0,6.15,0.0,36.95,yellow,Hudson Sq,...,False,False,True,False,False,False,False,False,False,False
4,2019-03-30 13:27:42,2019-03-30 13:37:14,3,2.16,9.0,1.1,0.0,13.4,yellow,Midtown East,...,False,False,True,False,False,False,False,False,False,False


In [29]:
print("Rows, Columns:", df.shape)
print("\nNew Feature Columns Present:")
new_cols = ['hour', 'weekday', 'is_weekend', 'duration_min', 'speed_mph', 
            'fare_per_mile', 'tip_pct', 'tipped']
[x for x in new_cols if x in df.columns]


Rows, Columns: (6371, 90)

New Feature Columns Present:


['hour',
 'weekday',
 'is_weekend',
 'duration_min',
 'speed_mph',
 'fare_per_mile',
 'tip_pct',
 'tipped']

In [30]:
df[['hour','weekday','duration_min','speed_mph','fare_per_mile','tip_pct']].sample(5)


Unnamed: 0,hour,weekday,duration_min,speed_mph,fare_per_mile,tip_pct
3601,21,6,24.716667,5.656102,6.866953,0.000625
1645,17,3,32.883333,8.758236,4.479167,0.24
6400,17,4,44.616667,18.517744,3.159041,0.091954
2428,10,2,8.116667,8.870637,6.25,0.36
38,16,0,7.366667,9.20362,5.752212,0.332308
