# In [None]:  (notebook cell marker left over from export)
# ride_fare_prediction.py
# Complete Ride Fare Prediction Project

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from geopy.distance import geodesic
import warnings
warnings.filterwarnings('ignore')

# ---------------------------
# 1. Load Dataset
# ---------------------------
# Input is the NYC Taxi fare dataset, or a smaller sample CSV of it.
# Example: https://www.kaggle.com/c/nyc-taxi-fare-prediction/data
DATA_PATH = 'train.csv'  # Replace with your dataset path
df = pd.read_csv(DATA_PATH)

# Preview the first rows to sanity-check the load
print("Data Sample:", df.head(), sep="\n")

# ---------------------------
# 2. Data Cleaning
# ---------------------------
# Remove missing values
df = df.dropna()

# Build the row filters as boolean masks so the frame is filtered only once.
# Fares must be positive and below 500; passenger counts must be 1 to 6.
valid_fare = (df['fare_amount'] > 0) & (df['fare_amount'] < 500)
valid_passengers = (df['passenger_count'] > 0) & (df['passenger_count'] <= 6)

# Drop rows whose coordinates are not valid latitude/longitude values.
# geopy's geodesic() raises ValueError for latitudes outside [-90, 90], so a
# single corrupt row would otherwise abort the distance computation that
# follows in the feature-engineering step.
valid_coords = (
    df['pickup_latitude'].between(-90, 90)
    & df['dropoff_latitude'].between(-90, 90)
    & df['pickup_longitude'].between(-180, 180)
    & df['dropoff_longitude'].between(-180, 180)
)

df = df[valid_fare & valid_passengers & valid_coords]

# ---------------------------
# 3. Feature Engineering
# ---------------------------
# Parse the pickup timestamps once, store the parsed values back on the
# frame, then derive the calendar features from the parsed series.
pickup_ts = pd.to_datetime(df['pickup_datetime'])
df['pickup_datetime'] = pickup_ts
df['hour'] = pickup_ts.dt.hour
df['day_of_week'] = pickup_ts.dt.dayofweek

# Compute trip distance using geodesic
def compute_distance(row):
    """Return the geodesic distance in kilometres between pickup and dropoff.

    ``row`` must support key lookup for ``pickup_latitude``,
    ``pickup_longitude``, ``dropoff_latitude`` and ``dropoff_longitude``.
    Each point is handed to geopy as a (latitude, longitude) pair.
    """
    lat_from, lon_from = row['pickup_latitude'], row['pickup_longitude']
    lat_to, lon_to = row['dropoff_latitude'], row['dropoff_longitude']
    return geodesic((lat_from, lon_from), (lat_to, lon_to)).km

# One distance per ride: axis='columns' passes each row to compute_distance.
df['distance_km'] = df.apply(compute_distance, axis='columns')

# ---------------------------
# 4. Prepare Features and Target
# ---------------------------
# Model inputs (X) are the engineered columns; the target (y) is the fare.
features = ['passenger_count', 'hour', 'day_of_week', 'distance_km']
X, y = df[features], df['fare_amount']

# ---------------------------
# 5. Train/Test Split
# ---------------------------
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ---------------------------
# 6. Train Model
# ---------------------------
# fit() returns the estimator itself, so construction and training chain.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# ---------------------------
# 7. Evaluate Model
# ---------------------------
# Score the held-out rows: RMSE is the square root of the mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

# ---------------------------
# 8. Predict New Ride Fare
# ---------------------------
# Example ride details, written as a single record whose keys match the
# feature columns the model was trained on.
example_ride = {
    'passenger_count': 2,
    'hour': 15,
    'day_of_week': 2,
    'distance_km': 5.4,  # distance in km
}
new_ride = pd.DataFrame([example_ride])

predicted_fare = model.predict(new_ride)
print(f"Predicted Fare for new ride: ${predicted_fare[0]:.2f}")

# ---------------------------
# 9. Save Model (Optional)
# ---------------------------
# Persist the fitted estimator to disk so it can be reloaded later.
import joblib

joblib.dump(value=model, filename='ride_fare_model.pkl')
print("Model saved as ride_fare_model.pkl")