In [2]:
import pandas as pd

df = pd.read_csv('taxis.csv', encoding='unicode_escape')

# Question 1a
print("Question 1a")

# Split the 'pickup' column into date and time
pickup_date = df['pickup'].str.split(' ').str[0]
pickup_time = df['pickup'].str.split(' ').str[1]

# Split the 'dropoff' column into date and time
dropoff_date = df['dropoff'].str.split(' ').str[0]
dropoff_time = df['dropoff'].str.split(' ').str[1]

# Insert these new columns to the leftmost side of the dataframe
df.insert(0, 'pickup_date', pickup_date)
df.insert(1, 'pickup_time', pickup_time)
df.insert(2, 'dropoff_date', dropoff_date)
df.insert(3, 'dropoff_time', dropoff_time)


# Print top 5 rows, dropping pickup and dropoff temporarily
print("After spliting the date and time for pick and dropoff, here's the resultant df:\n")
print(df.drop(columns=['pickup', 'dropoff']).head(5).to_string(header=True, index=False, col_space=10))

Question 1a
After spliting the date and time for pick and dropoff, here's the resultant df:

pickup_date pickup_time dropoff_date dropoff_time  passengers   distance       fare        tip      tolls      total      color     payment           pickup_zone          dropoff_zone pickup_borough dropoff_borough
 2019-03-23    20:21:09   2019-03-23     20:27:24           1       1.60        7.0       2.15        0.0      12.95     yellow credit card       Lenox Hill West   UN/Turtle Bay South      Manhattan       Manhattan
 2019-03-04    16:11:55   2019-03-04     16:19:00           1       0.79        5.0       0.00        0.0       9.30     yellow        cash Upper West Side South Upper West Side South      Manhattan       Manhattan
 2019-03-27    17:53:01   2019-03-27     18:00:25           1       1.37        7.5       2.36        0.0      14.16     yellow credit card         Alphabet City          West Village      Manhattan       Manhattan
 2019-03-10    01:23:59   2019-03-10     01:49:

In [15]:
# Question 1b - compute for the green and yellow
print("Question 1b")
# Convert pickup_time and dropoff_time to datetime
df['pickup_time'] = pd.to_datetime(df['pickup_time'], format='%H:%M:%S')
df['dropoff_time'] = pd.to_datetime(df['dropoff_time'], format='%H:%M:%S')

# Calculate the time of travel for each row and add it as a new column
df['travel_time'] = (df['dropoff_time'] - df['pickup_time']).dt.total_seconds() / 60  # This gives travel time in minutes

# convert back to time
df['pickup_time'] = pd.to_datetime(df['pickup_time'], format='%H:%M:%S').dt.time
df['dropoff_time'] = pd.to_datetime(df['dropoff_time'], format='%H:%M:%S').dt.time

# Group by car_type and compute the totals
color_agg_results = df.groupby('color').agg({
    'fare': 'sum',
    'passengers': 'sum',
    'distance': 'sum',
    'travel_time': 'sum'
})

# Convert the total travel time from minutes to "Days Hours:Minutes:Seconds"
color_agg_results['travel_time'] = pd.to_timedelta(color_agg_results['travel_time'], unit='s')
color_agg_results['travel_time'] = color_agg_results['travel_time'].astype(str)

print("The aggregated results of fare, passengers, distance and travel_time for each colour is:")
print(color_agg_results.to_string(header=True, index=False, col_space=10))


Question 1b
The aggregated results of fare, passengers, distance and travel_time for each colour is:
      fare  passengers   distance               travel_time
  13788.15        1226    3345.95 0 days 03:03:37.766666667
  70426.72        8676   16111.41 0 days 02:34:53.316666667


In [16]:
# Question 2
from datetime import datetime
# Filter for cash payments and specific pickup dates
cash_df = df[(df['payment'] == 'cash') & pd.to_datetime(df['pickup_date']).dt.day.isin([10, 15, 20, 25, 30])]
# print(cash_df)

# Function to get row with max distance for each group
def get_max_distance_trip(group):
    idx = group['distance'].idxmax() 
    if pd.notna(idx):  # checks if idx is not NaN
        return group.loc[idx]
    else:
        return pd.Series({'distance': 0})

# Group by car color and pickup date, then apply the function
max_distance_df = cash_df.groupby(['color', 'pickup_date']).apply(get_max_distance_trip)

# Filter and rearrange columns
GY_cash = max_distance_df[['color', 'distance', 'pickup', 'pickup_date', 'pickup_time', 'dropoff', 'dropoff_time', 'fare']]
GY_cash.columns = ['color', 'distance', 'pickup', 'pickup_date', 'pickup_time', 'dropoff', 'dropoff_time', 'fare']

GY_cash = GY_cash.reset_index(drop=True)

print(GY_cash.to_string(header=True, index=False, col_space=10))

     color   distance              pickup pickup_date pickup_time             dropoff dropoff_time       fare
     green      16.01 2019-03-10 00:59:49  2019-03-10    00:59:49 2019-03-10 01:36:25     01:36:25       47.0
     green       4.71 2019-03-15 06:42:17  2019-03-15    06:42:17 2019-03-15 06:54:38     06:54:38       15.0
     green       6.66 2019-03-20 12:04:33  2019-03-20    12:04:33 2019-03-20 12:28:07     12:28:07       20.0
     green       8.80 2019-03-25 10:10:11  2019-03-25    10:10:11 2019-03-25 10:46:25     10:46:25       31.0
     green      10.32 2019-03-30 09:25:32  2019-03-30    09:25:32 2019-03-30 09:54:33     09:54:33       31.0
    yellow      11.52 2019-03-10 00:13:12  2019-03-10    00:13:12 2019-03-10 00:33:09     00:33:09       32.5
    yellow      15.64 2019-03-15 16:45:55  2019-03-15    16:45:55 2019-03-15 17:46:47     17:46:47       50.0
    yellow      17.01 2019-03-20 17:22:33  2019-03-20    17:22:33 2019-03-20 18:07:19     18:07:19       52.0
    yellow

In [17]:
# Question 3
print("Question 3")

# Group by 'pickup_date' and find the index with the maximum 'distance'
idx = GY_cash.groupby('pickup_date')['distance'].idxmax()

# Create a new dataframe with only the rows with the maximum 'distance' for each 'pickup_date'
GY_maxDist = GY_cash.loc[idx]

# Display the new DataFrame
print(GY_maxDist.to_string(header=True, index=False, col_space=10))

Question 3
     color   distance              pickup pickup_date pickup_time             dropoff dropoff_time       fare
     green      16.01 2019-03-10 00:59:49  2019-03-10    00:59:49 2019-03-10 01:36:25     01:36:25       47.0
    yellow      15.64 2019-03-15 16:45:55  2019-03-15    16:45:55 2019-03-15 17:46:47     17:46:47       50.0
    yellow      17.01 2019-03-20 17:22:33  2019-03-20    17:22:33 2019-03-20 18:07:19     18:07:19       52.0
    yellow      12.12 2019-03-25 05:33:43  2019-03-25    05:33:43 2019-03-25 05:58:38     05:58:38       35.5
    yellow      17.14 2019-03-30 14:56:08  2019-03-30    14:56:08 2019-03-30 15:49:32     15:49:32       52.0
