# Access Data from [Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)

### Newyork Yellow Taxi Data Analysis (01/2024)

!['Photo'](taxi.jpg)

| **#** | **Column**               | **Dtype**         | **Description**                                      |
|-------|--------------------------|-------------------|------------------------------------------------------|
| 0     | `VendorID`               | `int32`           | Taxi service provider ID.                            |
| 1     | `tpep_pickup_datetime`   | `datetime64[ns]`  | Pickup date and time.                                |
| 2     | `tpep_dropoff_datetime`  | `datetime64[ns]`  | Dropoff date and time.                               |
| 3     | `passenger_count`        | `float64`         | Number of passengers.                                |
| 4     | `trip_distance`          | `float64`         | Distance of the trip.                                |
| 5     | `RatecodeID`             | `float64`         | Rate code ID.                                        |
| 6     | `store_and_fwd_flag`     | `object`          | Flag for storing trip data before sending.           |
| 7     | `PULocationID`           | `int32`           | Pickup location ID.                                  |
| 8     | `DOLocationID`           | `int32`           | Dropoff location ID.                                 |
| 9     | `payment_type`           | `int64`           | Payment method ID.                                   |
| 10    | `fare_amount`            | `float64`         | Fare amount.                                         |
| 11    | `extra`                  | `float64`         | Additional charges.                                  |
| 12    | `mta_tax`                | `float64`         | MTA tax amount.                                      |
| 13    | `tip_amount`             | `float64`         | Tip amount.                                          |
| 14    | `tolls_amount`           | `float64`         | Tolls amount.                                        |
| 15    | `improvement_surcharge`  | `float64`         | Improvement surcharge.                               |
| 16    | `total_amount`           | `float64`         | Total amount charged.                                |
| 17    | `congestion_surcharge`   | `float64`         | Congestion surcharge.                                |
| 18    | `Airport_fee`            | `float64`         | Airport fee.                                         |


-------------------------------------------

# Import Important Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import RobustScaler , OneHotEncoder
from category_encoders import BinaryEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from tabulate import tabulate

-------------------------------------------

# Load the data


In [3]:
df = pd.read_csv('cleaned_data.csv', engine='python', delimiter=',', on_bad_lines='skip')

-------------------------------------------

# Overview of the data

In [24]:
df.columns

Index(['vendorid', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'pulocationid', 'dolocationid',
       'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount',
       'tolls_amount', 'improvement_surcharge', 'congestion_surcharge',
       'airport_fee', 'day', 'pickup_time', 'dropoff_time', 'duration_seconds',
       'duration_minutes', 'total_amount'],
      dtype='object')

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 449610 entries, 0 to 449609
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   vendorid               449610 non-null  int64  
 1   tpep_pickup_datetime   449610 non-null  object 
 2   tpep_dropoff_datetime  449610 non-null  object 
 3   passenger_count        449610 non-null  float64
 4   trip_distance          449610 non-null  float64
 5   pulocationid           449610 non-null  int64  
 6   dolocationid           449610 non-null  int64  
 7   payment_type           449610 non-null  int64  
 8   fare_amount            449610 non-null  float64
 9   extra                  449610 non-null  float64
 10  mta_tax                449610 non-null  float64
 11  tip_amount             449609 non-null  float64
 12  tolls_amount           449609 non-null  float64
 13  improvement_surcharge  449609 non-null  float64
 14  congestion_surcharge   449609 non-nu

In [25]:
df.isna().sum()

Unnamed: 0,0
vendorid,0
tpep_pickup_datetime,0
tpep_dropoff_datetime,0
passenger_count,0
trip_distance,0
pulocationid,0
dolocationid,0
payment_type,0
fare_amount,0
extra,0


In [15]:
df.dropna(inplace=True)

In [26]:
df.duplicated().sum()

0

In [17]:
df.sample().T

Unnamed: 0,398848
vendorid,2
tpep_pickup_datetime,2024-01-06 12:13:49
tpep_dropoff_datetime,2024-01-06 12:19:08
passenger_count,1.0
trip_distance,0.92
pulocationid,140
dolocationid,140
payment_type,1
fare_amount,7.2
extra,0.0


In [None]:
df.columns

Index(['vendorid', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'pulocationid', 'dolocationid',
       'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount',
       'tolls_amount', 'improvement_surcharge', 'congestion_surcharge',
       'airport_fee', 'day', 'pickup_time', 'dropoff_time', 'duration_seconds',
       'duration_minutes', 'total_amount'],
      dtype='object')

-------------------------------------------

# Start Preprocessing

In [4]:
df.pulocationid = df.pulocationid.astype('O')
df.dolocationid = df.dolocationid.astype('O')
df.day = df.day.astype('O')
df.vendorid = df.vendorid.astype('O')
df.payment_type = df.payment_type.astype('O')

# Preprocessing
numeric_features = ['passenger_count','trip_distance','duration_seconds']
categorical_features = ['pulocationid','dolocationid','day']
one_hot_features = ['vendorid','payment_type']

In [5]:
X = df[['vendorid','passenger_count','trip_distance','pulocationid', 'dolocationid','payment_type','day','duration_seconds']]
y = df['fare_amount']

-------------------------------------------

# Train test split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [7]:
preprocessor = ColumnTransformer([
    ('num', RobustScaler(), numeric_features),
    ('cat', BinaryEncoder(), categorical_features),
    ('ohe', OneHotEncoder(), one_hot_features)
])

X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)
X_test = preprocessor.transform(X_test)

In [17]:
pd.DataFrame(X_train,columns=preprocessor.get_feature_names_out()).sample().T

Unnamed: 0,1302974
num__passenger_count,0.0
num__trip_distance,-0.660819
num__duration_seconds,-0.760797
cat__pulocationid_0,0.0
cat__pulocationid_1,0.0
cat__pulocationid_2,1.0
cat__pulocationid_3,0.0
cat__pulocationid_4,0.0
cat__pulocationid_5,1.0
cat__pulocationid_6,1.0


-------------------------------

# Linear Regression

In [None]:
pipelr = Pipeline([
    ('model', LinearRegression())
])

pipelr.fit(X_train, y_train)

y_pred = pipelr.predict(X_val)

scores = cross_val_score(pipelr, X_train, y_train, cv=5)

# Calculate metrics
mse = mean_squared_error(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
mean_cv_score = scores.mean() * 100
r2_train = pipelr.score(X_train, y_train) * 100
r2_val = pipelr.score(X_val, y_val) * 100

# Create data for the table
table_data = [
    ['Mean Squared Error', f"{mse:.4f}"],
    ['Mean Absolute Error', f"{mae:.4f}"],
    ['Mean Cross-Validation Score', f"{mean_cv_score:.4f}%"],
    ['R-squared Train', f"{r2_train:.4f}%"],
    ['R-squared Val', f"{r2_val:.4f}%"]
]

# Create and print the table with borders
table = tabulate(table_data, headers=['Metric', 'Value'], tablefmt='grid')

# Print title and table
print("Linear Regression")
print("-" * 70)
print(table)


Linear Regression
----------------------------------------------------------------------
+-----------------------------+----------+
| Metric                      | Value    |
| Mean Squared Error          | 4.9697   |
+-----------------------------+----------+
| Mean Absolute Error         | 0.9464   |
+-----------------------------+----------+
| Mean Cross-Validation Score | 94.2309% |
+-----------------------------+----------+
| R-squared Train             | 94.0472% |
+-----------------------------+----------+
| R-squared Val               | 95.5961% |
+-----------------------------+----------+


-----------------------------

# Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Create and fit the pipeline
pipedtr = Pipeline([
    ('model', DecisionTreeRegressor(max_depth=5))
])

pipedtr.fit(X_train, y_train)

# Make predictions
y_pred = pipedtr.predict(X_val)

# Calculate metrics
mse = mean_squared_error(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
scores = cross_val_score(pipedtr, X_train, y_train, cv=5)
mean_cv_score = scores.mean() * 100
r2_train = pipedtr.score(X_train, y_train) * 100
r2_val = pipedtr.score(X_val, y_val) * 100

# Create data for the table
table_data = [
    ['Mean Squared Error', f"{mse:.4f}"],
    ['Mean Absolute Error', f"{mae:.4f}"],
    ['Mean Cross-Validation Score', f"{mean_cv_score:.4f}%"],
    ['R-squared Train', f"{r2_train:.4f}%"],
    ['R-squared Val', f"{r2_val:.4f}%"]
]

# Create and print the table with borders
table = tabulate(table_data, headers=['Metric', 'Value'], tablefmt='grid')

# Print title and table
print("Decision Tree Regression")
print("-" * 70)
print(table)

Decision Tree Regression
----------------------------------------------------------------------
+-----------------------------+----------+
| Metric                      | Value    |
| Mean Squared Error          | 3.3168   |
+-----------------------------+----------+
| Mean Absolute Error         | 1.0436   |
+-----------------------------+----------+
| Mean Cross-Validation Score | 95.2645% |
+-----------------------------+----------+
| R-squared Train             | 95.7474% |
+-----------------------------+----------+
| R-squared Val               | 97.0608% |
+-----------------------------+----------+


----------------

# Support Vector Machine

In [None]:
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tabulate import tabulate

# Create and fit the pipeline with SVR
pipe_svr = Pipeline([
    ('model', SVR())
])

pipe_svr.fit(X_train, y_train)

# Make predictions
y_pred = pipe_svr.predict(X_val)

# Calculate metrics
mse = mean_squared_error(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
scores = cross_val_score(pipe_svr, X_train, y_train, cv=5)
mean_cv_score = scores.mean() * 100
r2_train = pipe_svr.score(X_train, y_train) * 100
r2_val = pipe_svr.score(X_val, y_val) * 100

# Create data for the table
table_data = [
    ['Mean Squared Error', f"{mse:.4f}"],
    ['Mean Absolute Error', f"{mae:.4f}"],
    ['Mean Cross-Validation Score', f"{mean_cv_score:.4f}%"],
    ['R-squared Train', f"{r2_train:.4f}%"],
    ['R-squared Val', f"{r2_val:.4f}%"]
]

# Create and print the table with borders
table = tabulate(table_data, headers=['Metric', 'Value'], tablefmt='grid')

# Print title and table
print("SVR Model Performance")
print("-" * 70)
print(table)


------------------------------------------------------

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tabulate import tabulate

# Create and fit the pipeline with RandomForestRegressor
pipe_rf = Pipeline([
    ('model', RandomForestRegressor())
])

pipe_rf.fit(X_train, y_train)

# Make predictions
y_pred = pipe_rf.predict(X_val)

# Calculate metrics
mse = mean_squared_error(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
# scores = cross_val_score(pipe_rf, X_train, y_train, cv=5)
# mean_cv_score = scores.mean() * 100
r2_train = pipe_rf.score(X_train, y_train) * 100
r2_val = pipe_rf.score(X_val, y_val) * 100

# Create data for the table
table_data = [
    ['Mean Squared Error', f"{mse:.4f}"],
    ['Mean Absolute Error', f"{mae:.4f}"],
    # ['Mean Cross-Validation Score', f"{mean_cv_score:.4f}%"],
    ['R-squared Train', f"{r2_train:.4f}%"],
    ['R-squared Val', f"{r2_val:.4f}%"]
]

# Create and print the table with borders
table = tabulate(table_data, headers=['Metric', 'Value'], tablefmt='grid')

# Print title and table
print("Random Forest Regressor Model Performance")
print("-" * 70)
print(table)


---

# XGBoost

In [8]:
from xgboost import XGBRegressor
# Create and fit the pipeline with XGBRegressor
pipexgb = Pipeline([
    ('model', XGBRegressor())
])

pipexgb.fit(X_train, y_train)

# Make predictions
y_pred = pipexgb.predict(X_val)

# Calculate metrics
mse = mean_squared_error(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
scores = cross_val_score(pipexgb, X_train, y_train, cv=5)
mean_cv_score = scores.mean() * 100
r2_train = pipexgb.score(X_train, y_train) * 100
r2_val = pipexgb.score(X_val, y_val) * 100

# Create data for the table
table_data = [
    ['Mean Squared Error', f"{mse:.4f}"],
    ['Mean Absolute Error', f"{mae:.4f}"],
    ['Mean Cross-Validation Score', f"{mean_cv_score:.4f}%"],
    ['R-squared Train', f"{r2_train:.4f}%"],
    ['R-squared Val', f"{r2_val:.4f}%"]
]

# Create and print the table with borders
table = tabulate(table_data, headers=['Metric', 'Value'], tablefmt='grid')

# Print title and table
print("XGBRegressor Model Performance")
print("-" * 70)
print(table)

XGBRegressor Model Performance
----------------------------------------------------------------------
+-----------------------------+----------+
| Metric                      | Value    |
| Mean Squared Error          | 2.4320   |
+-----------------------------+----------+
| Mean Absolute Error         | 0.4898   |
+-----------------------------+----------+
| Mean Cross-Validation Score | 95.0946% |
+-----------------------------+----------+
| R-squared Train             | 99.1728% |
+-----------------------------+----------+
| R-squared Val               | 97.8449% |
+-----------------------------+----------+


In [9]:
# prompt: grid search cv with xgboost

# Define the parameter grid for GridSearchCV
param_grid = {
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0]
}

# Create the pipeline with XGBRegressor
pipexgb = Pipeline([
    ('model', XGBRegressor())
])

# Create GridSearchCV object
grid_search = GridSearchCV(
    estimator=pipexgb,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2
)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best negative mean squared error found: ", grid_search.best_score_)



Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters found:  {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__subsample': 1.0}
Best negative mean squared error found:  -4.275125249656144


# Best Model


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tabulate import tabulate

# Create and fit the pipeline with RandomForestRegressor
pipe_rf = Pipeline([
    ('model', RandomForestRegressor(learning_rate=0.1, max_depth=3, n_estimators=100,colsample_bytree= 0.8,subsample = 1.0))
])

pipe_rf.fit(X_train, y_train)

# Make predictions
y_pred = pipe_rf.predict(X_val)

# Calculate metrics
mse = mean_squared_error(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
scores = cross_val_score(pipe_rf, X_train, y_train, cv=5)
mean_cv_score = scores.mean() * 100
r2_train = pipe_rf.score(X_train, y_train) * 100
r2_val = pipe_rf.score(X_val, y_val) * 100

# Create data for the table
table_data = [
    ['Mean Squared Error', f"{mse:.4f}"],
    ['Mean Absolute Error', f"{mae:.4f}"],
    ['Mean Cross-Validation Score', f"{mean_cv_score:.4f}%"],
    ['R-squared Train', f"{r2_train:.4f}%"],
    ['R-squared Val', f"{r2_val:.4f}%"]
]

# Create and print the table with borders
table = tabulate(table_data, headers=['Metric', 'Value'], tablefmt='grid')

# Print title and table
print("Random Forest Regressor Model Performance")
print("-" * 70)
print(table)


In [None]:
# save the model to disk
import pickle
filename = 'finalized_model.sav'
pickle.dump(pipe_rf, open(filename, 'wb'))


In [None]:
# load the model from disk
loaded_model = pickle.load(open('finalized_model.sav', 'rb'))
result = loaded_model.score(X_val, y_val)
print(result)

0.9829003849242023


In [None]:
df.columns

Index(['vendorid', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'pulocationid', 'dolocationid',
       'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount',
       'tolls_amount', 'improvement_surcharge', 'congestion_surcharge',
       'airport_fee', 'day', 'pickup_time', 'dropoff_time', 'duration_seconds',
       'duration_minutes', 'total_amount'],
      dtype='object')

In [None]:
new_data = df[['vendorid','passenger_count','trip_distance','pulocationid', 'dolocationid','payment_type','day','duration_seconds']].sample(1, random_state=42)
new_data

Unnamed: 0,vendorid,passenger_count,trip_distance,pulocationid,dolocationid,payment_type,day,duration_seconds
520900,1,1.0,1.5,143,246,1,8,426


In [None]:
# new_data = preprocessor.transform(new_data)
loaded_model.predict(new_data)[0]

9.531134

In [None]:
New_data = pd.DataFrame([[1,1.0,1.5,143,246,1,8,426]],dtype='object', columns=['vendorid','passenger_count','trip_distance','pulocationid', 'dolocationid','payment_type','day','duration_seconds'])

loaded_model.predict(New_data)[0]

9.531134