Final Models

Final model: Rate (predicts total_amount)

In [None]:
import pandas as pd

# Path to the Parquet file
# NOTE(review): absolute, machine-specific path - the notebook only runs where
# this exact location exists; consider a path relative to a configurable data dir.
parquet_file_path = 'C:\\Users\\jaime\\Documents\\GitHub\\taxi-price-predictor\\trans_data.parquet'

# Parse the Parquet file from disk once; df feeds the rate model cells below
df = pd.read_parquet(parquet_file_path)

# Independent in-memory copy for the time model, instead of reading and
# decoding the same file a second time
df_time = df.copy()

# Display the first few rows of the DataFrame
print(df.head())


In [None]:
# Mean and standard deviation of the fare, used to express fares as z-scores
fare = df['fare_amount']
mean_fare = fare.mean()
std_fare = fare.std()

# Distance of every fare from the mean, measured in standard deviations.
# Kept as a standalone Series so no temporary column has to be added and dropped.
fare_sigmas = (fare - mean_fare) / std_fare

# Keep rows whose fare is strictly within 3 standard deviations of the mean
# and strictly positive (rows with a missing fare fail both tests and are dropped)
df = df[(fare_sigmas.abs() < 3) & (fare > 0)]

# Display the first few rows of the DataFrame to verify the changes
print(df.head())

In [None]:
# Parse both timestamp columns as datetimes (no-op if they already are)
pickup_ts = pd.to_datetime(df['tpep_pickup_datetime'])
dropoff_ts = pd.to_datetime(df['tpep_dropoff_datetime'])

# Append the trip duration and the pickup calendar components as new columns,
# then drop the two raw timestamp columns they were derived from
df = (
    df.assign(
        duration_in_minutes=(dropoff_ts - pickup_ts).dt.total_seconds() / 60,
        pickup_year=pickup_ts.dt.year,
        pickup_day=pickup_ts.dt.day,
        pickup_day_of_week=pickup_ts.dt.dayofweek,
        pickup_hour=pickup_ts.dt.hour,
        pickup_minute=pickup_ts.dt.minute,
    )
    .drop(columns=["tpep_pickup_datetime", "tpep_dropoff_datetime"])
)

# Display the first few rows of the DataFrame to verify the changes
print(df.head())

In [None]:
import numpy as np


# Fallback: derive 'pickup_year' from the pickup timestamp if it is not a column yet
if 'pickup_year' not in df:
    df['pickup_year'] = pd.to_datetime(df['tpep_pickup_datetime']).dt.year

# NOTE: the steps below blank out individual values (set them to NaN);
# the rows themselves are kept in the DataFrame.

# Pickup years other than 2022 become missing
df.loc[df["pickup_year"].ne(2022), "pickup_year"] = np.nan

# Values above these upper limits become missing:
#   trip_distance > 60    -> treated as outliers
#   RatecodeID > 6        -> according to docs RatecodeID can only go from 1 to 6
#   PU/DOLocationID > 263 -> they don't add value to the model
for column_name, upper_limit in (
    ("trip_distance", 60),
    ("RatecodeID", 6),
    ("PULocationID", 263),
    ("DOLocationID", 263),
):
    df.loc[df[column_name].gt(upper_limit), column_name] = np.nan

# This one does remove rows: keep only trips whose total_amount is below 400
df = df.loc[df['total_amount'].lt(400)]

# Display the first few rows of the DataFrame to verify the changes
print(df.head())

In [None]:
# Replace None values in the 'store_and_fwd_flag' column with 'N'.
# The replaced column is assigned back to the frame instead of calling
# replace(..., inplace=True) on df["store_and_fwd_flag"]: an in-place method on
# a column selection is chained assignment, which pandas warns about and which
# leaves df unchanged when Copy-on-Write is enabled.
df = df.assign(store_and_fwd_flag=df["store_and_fwd_flag"].replace({None: "N"}))

# Display the first few rows of the DataFrame to verify the changes
print(df.head())


In [None]:
# Split the cleaned rows into disjoint training and test frames.
# Previously both frames were copies of the same DataFrame, so the model was
# scored on the very rows it was trained on; 20% is now held out, using the
# same test_size / random_state as the time model further down.
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

# Columns that are prediction targets and must not remain among the features
target_columns = ["total_amount", "duration_in_minutes"]

# Assign target columns to y_train
y_train_total_amount = X_train["total_amount"]
y_train_duration_in_minutes = X_train["duration_in_minutes"]

# Assign target columns to y_test
y_test_total_amount = X_test["total_amount"]
y_test_duration_in_minutes = X_test["duration_in_minutes"]

# Drop target columns from both feature frames (drop() returns new frames,
# so later in-place edits do not touch df)
X_train = X_train.drop(columns=target_columns)
X_test = X_test.drop(columns=target_columns)

# Output the results
print(X_train.head())
print(y_train_total_amount.head())
print(y_train_duration_in_minutes.head())
print(X_test.head())
print(y_test_total_amount.head())
print(y_test_duration_in_minutes.head())


In [None]:
import pandas as pd

# Function to remove specified columns from several DataFrames at once
def remove_columns_in_place(dfs, columns_to_remove):
    """Drop ``columns_to_remove`` from every DataFrame in ``dfs``, in place.

    Column names that a given frame does not have are skipped for that frame.
    Each frame is mutated directly; nothing is returned.
    """
    for frame in dfs:
        present = []
        for name in columns_to_remove:
            if name in frame.columns:
                present.append(name)
        frame.drop(columns=present, inplace=True)

# Columns excluded from the rate model's feature set
columns_to_remove = [
    'improvement_surcharge',
    'congestion_surcharge',
    'VendorID',
    'passenger_count',
    'RatecodeID',
    'store_and_fwd_flag',
    'extra',
    'tip_amount',
    'fare_amount',
]

# Both feature frames get the same columns removed
df_list = [X_train, X_test]

# Drop the listed columns from every frame in place
remove_columns_in_place(dfs=df_list, columns_to_remove=columns_to_remove)

# Show the remaining training feature names to verify the changes
print(X_train.columns)

In [None]:


# Example LightGBM training code
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score
import pickle
# Wrap the feature frames and total_amount targets as LightGBM datasets;
# the validation set is tied to the training set through `reference`
train_data = lgb.Dataset(data=X_train, label=y_train_total_amount)
test_data = lgb.Dataset(data=X_test, label=y_test_total_amount, reference=train_data)

# Booster configuration: gradient-boosted regression trees tracked with RMSE
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
)

# Fit for a fixed number of boosting rounds, reporting the metric on test_data
num_round = 100
bst = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=num_round,
    valid_sets=[test_data],
)

# Persist the trained booster as a pickle file
with open('lgb_model.pickle', 'wb') as model_file:
    pickle.dump(bst, model_file)

# Predict total_amount for the test features.
# No early-stopping callback is passed to lgb.train above, so best_iteration is
# whatever the booster reports without one - presumably "use all rounds"; TODO confirm.
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)

# Score the predictions against the true totals
mse = mean_squared_error(y_test_total_amount, y_pred)
r2 = r2_score(y_test_total_amount, y_pred)
print(f'Mean Squared Error: {mse}')
print(f"R-squared: {r2:.2f}")



Final model: Time (predicts trip_duration, in minutes)

In [None]:
# Re-read the Parquet file so the time model starts from the file contents
# rather than from the frame filtered in the rate-model cells
df_time = pd.read_parquet(path=parquet_file_path)

# Preview the first five rows
df_time.head(n=5)

In [None]:
# Function to remove specified columns from a DataFrame
def remove_columns(df_time, columns_to_remove):
    """Drop the requested columns from ``df_time`` in place and return it.

    Names in ``columns_to_remove`` that are not columns of ``df_time`` are
    ignored. The frame passed in is mutated, and that same object is returned.
    """
    present = []
    for name in columns_to_remove:
        if name in df_time.columns:
            present.append(name)
    df_time.drop(columns=present, inplace=True)
    return df_time

# Columns excluded from the time model's data
columns_to_remove = [
    'improvement_surcharge',
    'congestion_surcharge',
    'VendorID',
    'passenger_count',
    'RatecodeID',
    'store_and_fwd_flag',
    'extra',
    'tip_amount',
    'fare_amount',
    'payment_type',
]

# Work on a copy so the frame loaded above is not mutated by the in-place drop
df_time = remove_columns(df_time=df_time.copy(), columns_to_remove=columns_to_remove)

# Show the remaining column names to verify the changes
print(df_time.columns)

In [None]:
def add_average_speed(df_time, min_trip_duration=1):
    """Add trip duration and average speed columns to ``df_time`` in place.

    Parameters
    ----------
    df_time : DataFrame with 'tpep_pickup_datetime', 'tpep_dropoff_datetime'
        and 'trip_distance' columns. The two timestamp columns are overwritten
        with their ``pd.to_datetime`` conversion.
    min_trip_duration : value (in minutes) substituted for durations that are
        exactly 0, so the speed division never divides by zero.

    Returns
    -------
    The same ``df_time`` object, with two columns added:
    'trip_duration' - dropoff minus pickup in minutes, rounded to 2 decimals;
    'average_speed_mph' - trip_distance / duration in minutes, rounded to
    2 decimals. Despite the "_mph" suffix the stored value is distance per
    minute, not per hour.
    """
    # Parse both timestamp columns
    for stamp_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
        df_time[stamp_col] = pd.to_datetime(df_time[stamp_col])

    # Elapsed time expressed as a float number of minutes
    elapsed = df_time['tpep_dropoff_datetime'] - df_time['tpep_pickup_datetime']
    minutes = elapsed / pd.Timedelta(minutes=1)

    # Zero-length trips fall back to the minimum duration
    minutes = minutes.mask(minutes == 0, min_trip_duration)

    # Speed uses the unrounded duration; both stored columns are rounded afterwards
    speed = df_time['trip_distance'] / minutes
    df_time['trip_duration'] = minutes.round(2)
    df_time['average_speed_mph'] = speed.round(2)  # distance per minute

    return df_time

In [None]:
# Compute 'trip_duration' and 'average_speed_mph' on a copy and rebind df_time to it
df_time = add_average_speed(df_time=df_time.copy())

# df_time now carries the two new columns
print(df_time.columns)

In [None]:
def remove_columns(df_time, columns_to_remove):
    """Drop every listed column that ``df_time`` actually has, then return it.

    The drop happens in place on the frame that was passed in; names missing
    from the frame are silently skipped.
    """
    existing = [label for label in columns_to_remove if label in df_time.columns]
    df_time.drop(columns=existing, inplace=True)
    return df_time

# The dropoff timestamp is dropped at this point
columns_to_remove = ['tpep_dropoff_datetime']

# Drop it on a copy and rebind df_time to the result
df_time = remove_columns(df_time=df_time.copy(), columns_to_remove=columns_to_remove)

# Show the remaining column names to verify the changes
print(df_time.columns)

In [None]:
# Split the rows into disjoint training and test frames.
# Previously both frames were copies of the same DataFrame, i.e. the "test"
# rows were exactly the training rows; 20% is now held out instead.
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(df_time, test_size=0.2, random_state=42)

# Assign target column to y_train / y_test
y_train_duration = X_train["trip_duration"]
y_test_duration = X_test["trip_duration"]

# Drop target column from both feature frames
X_train = X_train.drop(columns=["trip_duration"])
X_test = X_test.drop(columns=["trip_duration"])

# Output the results
print(X_train.head())
print(y_train_duration.head())
print(X_test.head())
print(y_test_duration.head())

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error



# pickle is used below to persist the models; imported here so this cell does
# not depend on an earlier cell having imported it
import pickle

# Separate features from the target; the raw pickup timestamp is not used as a feature
# NOTE(review): 'average_speed_mph' was computed as trip_distance / trip_duration,
# so it is derived from the target and lets the model reconstruct it
# (distance / speed). Confirm this feature is available at prediction time.
X = df_time.drop(columns=['trip_duration', 'tpep_pickup_datetime'])
y = df_time['trip_duration']

# Convert categorical columns to numeric using one-hot encoding
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fill NaN values with per-column medians learned from the TRAINING rows only.
# Computing the medians on the full frame before splitting would let test-set
# statistics leak into the training data.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# LightGBM Regressor
lgbm = lgb.LGBMRegressor(random_state=42)
lgbm.fit(X_train, y_train)
y_pred_lgbm = lgbm.predict(X_test)
r2_lgbm = r2_score(y_test, y_pred_lgbm)
mse_lgbm = mean_squared_error(y_test, y_pred_lgbm)
# Save the model
with open('lightgbm_regressor_model.pkl', 'wb') as file:
    pickle.dump(lgbm, file)

# XGBoost Regressor
xgbr = xgb.XGBRegressor(random_state=42)
xgbr.fit(X_train, y_train)
y_pred_xgbr = xgbr.predict(X_test)
r2_xgbr = r2_score(y_test, y_pred_xgbr)
mse_xgbr = mean_squared_error(y_test, y_pred_xgbr)
# Save the model
with open('xgboost_regressor_model.pkl', 'wb') as file:
    pickle.dump(xgbr, file)


# Print the results
print(f"LightGBM Regressor - R2: {r2_lgbm:.4f}, MSE: {mse_lgbm:.4f}")
print(f"XGBoost Regressor - R2: {r2_xgbr:.4f}, MSE: {mse_xgbr:.4f}")




In [None]:
def _load_pickled_model(path):
    """Unpickle and return the object stored at ``path``.

    pickle.load can execute arbitrary code, so only use this on files this
    notebook wrote itself.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Reload the LightGBM regressor and re-score it on the test split
loaded_lgbm = _load_pickled_model('lightgbm_regressor_model.pkl')
loaded_lgbm_predictions = loaded_lgbm.predict(X_test)
loaded_lgbm_r2 = r2_score(y_test, loaded_lgbm_predictions)
loaded_lgbm_mse = mean_squared_error(y_test, loaded_lgbm_predictions)
print(f"Loaded LightGBM Regressor - R2: {loaded_lgbm_r2:.4f}, MSE: {loaded_lgbm_mse:.4f}")

# Reload the XGBoost regressor and re-score it on the test split
loaded_xgbr = _load_pickled_model('xgboost_regressor_model.pkl')
loaded_xgbr_predictions = loaded_xgbr.predict(X_test)
loaded_xgbr_r2 = r2_score(y_test, loaded_xgbr_predictions)
loaded_xgbr_mse = mean_squared_error(y_test, loaded_xgbr_predictions)
print(f"Loaded XGBoost Regressor - R2: {loaded_xgbr_r2:.4f}, MSE: {loaded_xgbr_mse:.4f}")