In [1]:
!pip install pandas numpy dask matplotlib seaborn scikit-learn joblib




In [2]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score
import joblib


In [3]:
# Location of the processed NYC taxi Parquet data read by this notebook.
PARQUET_PATH = "C:/Users/vyshn/Documents/bigdata_taxi/processed_nyc_taxi_data.parquet"

# Open the dataset as a Dask dataframe.
df = dd.read_parquet(PARQUET_PATH)

# Preview the leading rows only, instead of pulling the whole dataset into memory.
df.head()


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,trip_duration
0,2,2015-01-15 19:05:39,2015-01-15 19:23:42,1,1.59,-73.993896,40.750111,1,N,-73.974785,40.750618,1,12.0,1.0,0.5,3.25,0.0,0.3,17.05,18.05
1,1,2015-01-10 20:33:38,2015-01-10 20:53:28,1,3.3,-74.001648,40.724243,1,N,-73.994415,40.759109,1,14.5,0.5,0.5,2.0,0.0,0.3,17.8,19.833333
2,1,2015-01-10 20:33:38,2015-01-10 20:43:41,1,1.8,-73.963341,40.802788,1,N,-73.95182,40.824413,2,9.5,0.5,0.5,0.0,0.0,0.3,10.8,10.05
3,1,2015-01-10 20:33:39,2015-01-10 20:35:31,1,0.5,-74.009087,40.713818,1,N,-74.004326,40.719986,2,3.5,0.5,0.5,0.0,0.0,0.3,4.8,1.866667
4,1,2015-01-10 20:33:39,2015-01-10 20:52:58,1,3.0,-73.971176,40.762428,1,N,-74.004181,40.742653,2,15.0,0.5,0.5,0.0,0.0,0.3,16.3,19.316667


In [4]:
# Print the column labels of the loaded dataframe to see which fields are available.
print(df.columns)


Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'pickup_longitude',
       'pickup_latitude', 'RateCodeID', 'store_and_fwd_flag',
       'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount',
       'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount', 'trip_duration'],
      dtype='object')


In [5]:
# Parse the pickup timestamp once; both calendar features are derived from it.
pickup_ts = dd.to_datetime(df['tpep_pickup_datetime'])

# Attach the pickup hour and the weekday number (Monday = 0), then discard the two
# raw timestamp columns, which no later cell reads.
df = (
    df.assign(
        hour=pickup_ts.dt.hour,
        day_of_week=pickup_ts.dt.dayofweek,
    )
    .drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])
)

df.head()


Unnamed: 0,VendorID,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,trip_duration,hour,day_of_week
0,2,1,1.59,-73.993896,40.750111,1,N,-73.974785,40.750618,1,12.0,1.0,0.5,3.25,0.0,0.3,17.05,18.05,19,3
1,1,1,3.3,-74.001648,40.724243,1,N,-73.994415,40.759109,1,14.5,0.5,0.5,2.0,0.0,0.3,17.8,19.833333,20,5
2,1,1,1.8,-73.963341,40.802788,1,N,-73.95182,40.824413,2,9.5,0.5,0.5,0.0,0.0,0.3,10.8,10.05,20,5
3,1,1,0.5,-74.009087,40.713818,1,N,-74.004326,40.719986,2,3.5,0.5,0.5,0.0,0.0,0.3,4.8,1.866667,20,5
4,1,1,3.0,-73.971176,40.762428,1,N,-74.004181,40.742653,2,15.0,0.5,0.5,0.0,0.0,0.3,16.3,19.316667,20,5


In [6]:
# Materialise only 1% of the rows as a pandas DataFrame to avoid MemoryErrors.
# The fixed random_state makes the drawn sample, and therefore every metric computed
# from it below, reproducible across kernel restarts; without it each run trains and
# evaluates on different rows.
df_sample = df.sample(frac=0.01, random_state=42).compute()


In [7]:
# Input columns used by the models: passenger count, trip distance, pickup-time
# features and pickup/drop-off coordinates.
selected_features = [
    'passenger_count', 'trip_distance',
    'hour', 'day_of_week',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
]

# Target columns
target_fare = 'fare_amount'   # regression target
target_tip = 'tip_amount'     # source column for the classification label

# Keep only rows with a strictly positive fare and a non-negative tip.
has_positive_fare = df_sample[target_fare] > 0
has_valid_tip = df_sample[target_tip] >= 0
df_sample = df_sample[has_positive_fare & has_valid_tip]


In [8]:
# Feature matrix and fare target for the regression task.
X, y = df_sample[selected_features], df_sample[target_fare]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
fare_split = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = fare_split

# Report the size of each partition.
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Train shape: (377756, 8) Test shape: (94440, 8)


In [10]:
# Print the installed scikit-learn version so the run's environment is recorded.
import sklearn
print(sklearn.__version__)


1.6.1


In [12]:
# Sanity check: print the repr of the imported mean_squared_error function object.
print(mean_squared_error)


<function mean_squared_error at 0x000002D62350F9C0>


In [13]:
# Train the fare regressor and evaluate it on the held-out rows.
# NOTE(review): no other cell in this notebook defines `reg_model` or `y_pred`, although
# this cell and the model-saving cell use them, so a fresh "Restart & Run All" stopped
# here with a NameError. LinearRegression is presumably the model that was used
# originally (it is the only regressor imported) -- confirm.
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)

# Predicted fares for the test partition.
y_pred = reg_model.predict(X_test)

# RMSE is the square root of the mean squared error.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"Root Mean Squared Error: {rmse}")


Root Mean Squared Error: 10.458447066687684


In [14]:
# Tips strictly above this amount are labelled high (1); everything else is low (0).
HIGH_TIP_THRESHOLD = 5

# Add the binary label and remove the raw tip amount in one chained step.
df_sample = (
    df_sample
    .assign(high_tip=(df_sample['tip_amount'] > HIGH_TIP_THRESHOLD).astype(int))
    .drop(columns=['tip_amount'])
)

df_sample.head()


Unnamed: 0,VendorID,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tolls_amount,improvement_surcharge,total_amount,trip_duration,hour,day_of_week,high_tip
282504,1,2,2.4,-73.9823,40.750992,1,N,-73.996254,40.721142,1,12.0,0.0,0.5,0.0,0.3,14.3,14.983333,13,5,0
57276,2,1,1.49,-73.973099,40.743649,1,N,-73.987892,40.73465,1,6.5,0.0,0.5,0.0,0.3,8.3,5.366667,17,5,0
230929,2,6,6.72,-73.977486,40.757938,1,N,-73.988556,40.696091,2,20.5,0.0,0.5,0.0,0.3,21.3,14.95,12,3,0
1496,1,2,2.9,-74.011353,40.709686,1,N,-73.982918,40.739452,2,16.0,0.0,0.5,0.0,0.3,16.8,25.066667,13,0,0
28677,1,2,0.8,-73.993813,40.729156,1,N,-74.001778,40.719288,2,6.0,0.0,0.5,0.0,0.3,6.8,6.6,13,1,0


In [15]:
# Train and evaluate the high-tip classifier.
# RandomForestClassifier and accuracy_score come from the imports cell at the top of the
# notebook, so they are not re-imported here.

# Features and binary target. Tip-specific names are used so that the regression split
# created earlier (X_train, X_test, y_train, y_test, y_pred) is not silently overwritten;
# re-running the regression cells after this one therefore still uses the fare data.
X_tip = df_sample[selected_features]
y_tip = df_sample['high_tip']

# Split data into training (80%) and testing (20%)
X_tip_train, X_tip_test, y_tip_train, y_tip_test = train_test_split(
    X_tip, y_tip, test_size=0.2, random_state=42
)

# Initialize and train model. n_jobs=-1 builds the trees on all available cores; the
# fixed random_state keeps the forest reproducible.
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf.fit(X_tip_train, y_tip_train)

# Predict on test data
y_tip_pred = clf.predict(X_tip_test)

# Evaluate the model
# NOTE(review): if high tips are rare, accuracy alone can look good for a model that
# almost always predicts 0 -- compare against the majority-class rate before trusting it.
accuracy = accuracy_score(y_tip_test, y_tip_pred)
print(f"Tip Prediction Model Accuracy: {accuracy * 100:.2f}%")


Tip Prediction Model Accuracy: 95.34%


In [16]:
# Persist both trained estimators to the current working directory.
# joblib is already imported in the imports cell at the top of the notebook, so it is
# not imported again here.
# NOTE(review): `reg_model` (fare regressor) and `clf` (tip classifier) must have been
# created by earlier cells; this cell raises NameError if either is missing.
joblib.dump(reg_model, "fare_prediction_model.pkl")
joblib.dump(clf, "tip_prediction_model.pkl")

print("Models saved successfully!")


Models saved successfully!
