In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the ride records from disk and print a plain-text preview of the first five rows
df = pd.read_csv(filepath_or_buffer="uber.csv")
print(df.head(n=5))

In [None]:
# Print pandas' structural summary of the freshly loaded frame
# (columns, non-null counts, dtypes) before any cleaning is applied.
df.info()

In [None]:
# Count NaN entries per column; the bare name on the last line is the cell output
missing_per_column = df.isna().sum()
missing_per_column


In [None]:
# Drop every row that contains at least one missing value.
# The result is rebound instead of using ``inplace=True``: the step stays
# chainable and the frame object shown by earlier cells is not mutated.
df = df.dropna()
print("After dropping NaNs:", df.shape)

In [None]:
# Keep only rides with a strictly positive fare (zero and negative fares are dropped)
positive_fare = df["fare_amount"].gt(0)
df = df.loc[positive_fare]

In [None]:
# Keep rides whose passenger count lies in the half-open range (0, 6]
passengers = df["passenger_count"]
df = df.loc[passengers.gt(0) & passengers.le(6)]

In [None]:
# Fare distribution prior to outlier filtering, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(6, 3))
sns.boxplot(x=df["fare_amount"], color="lightcoral", ax=ax)
ax.set_title("Fare Amount Distribution (Before Outlier Removal)")
plt.show()

In [None]:
# --- IQR-based outlier removal on the fare column ---
fare = df["fare_amount"]

# First and third quartiles in a single call, then the inter-quartile range
Q1, Q3 = fare.quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences at 1.5 * IQR beyond each quartile; both bounds are inclusive
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
df = df[fare.between(lower_fence, upper_fence)]

In [None]:
# Fare distribution once the IQR filter has been applied, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(6, 3))
sns.boxplot(x=df["fare_amount"], color="skyblue", ax=ax)
ax.set_title("Fare Amount Distribution (After Outlier Removal)")
plt.show()

In [None]:
def haversine(lon1, lat1, lon2, lat2):
    """Great-circle distance in kilometres between two points on Earth.

    Coordinates are given in decimal degrees. Scalars, NumPy arrays and pandas
    Series are accepted and combined element-wise.

    Parameters
    ----------
    lon1, lat1 : longitude and latitude of the first point(s), in degrees.
    lon2, lat2 : longitude and latitude of the second point(s), in degrees.

    Returns
    -------
    Distance(s) in kilometres on a sphere of radius 6371 km.
    """
    R = 6371  # Earth radius (km)
    lon1, lat1, lon2, lat2 = map(np.radians, (lon1, lat1, lon2, lat2))
    dlon, dlat = lon2 - lon1, lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Mathematically 0 <= a <= 1, but floating-point rounding can push it a hair
    # outside that range (e.g. for near-antipodal points), which would turn the
    # square roots below into NaN. Clamp before taking them.
    a = np.minimum(1.0, np.maximum(0.0, a))
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

# Discard rows whose coordinates cannot be real locations: latitude must lie in
# [-90, 90] and longitude in [-180, 180]. Out-of-range values would otherwise
# yield meaningless distances.
valid_coords = (
    df["pickup_latitude"].between(-90, 90)
    & df["dropoff_latitude"].between(-90, 90)
    & df["pickup_longitude"].between(-180, 180)
    & df["dropoff_longitude"].between(-180, 180)
)
df = df[valid_coords]

# Compute distance and add as new column. ``assign`` returns a new frame, so the
# column is never written into a filtered slice (which would raise
# SettingWithCopyWarning) and re-running the cell gives the same result.
df = df.assign(
    distance_km=haversine(
        df["pickup_longitude"],
        df["pickup_latitude"],
        df["dropoff_longitude"],
        df["dropoff_latitude"],
    )
)

# Remove entries with zero or very small distance
df = df[df["distance_km"] > 0.1]

In [None]:
# Preview the first rows of the cleaned frame, which now carries the derived
# distance_km column.
df.head()

In [None]:
# Pairwise correlations between the fare and the two candidate predictors
corr_cols = ["fare_amount", "distance_km", "passenger_count"]
corr_matrix = df[corr_cols].corr()

# Annotated heatmap drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Predictors and response
feature_cols = ["distance_km", "passenger_count"]
X = df[feature_cols]
y = df["fare_amount"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split


In [None]:
# ---- Linear Regression ----
# ``fit`` returns the estimator itself, so construction and training are chained
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [None]:
# ---- Random Forest Regression ----
# Hyper-parameters are gathered in one place; the fixed seed keeps the forest repeatable
rf_params = {"n_estimators": 100, "random_state": 42}
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [None]:
def evaluate_model(y_true, y_pred, model_name):
    """Score a set of predictions against the true targets.

    Parameters
    ----------
    y_true : observed target values.
    y_pred : predicted target values, aligned with ``y_true``.
    model_name : label for the model; accepted by the signature but not used
        in the computation.

    Returns
    -------
    tuple ``(rmse, r2)``: root of the mean squared error, and the R² score.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse), r2_score(y_true, y_pred)

# Score both sets of predictions against the same held-out targets
rmse_lr, r2_lr = evaluate_model(
    y_true=y_test, y_pred=y_pred_lr, model_name="Linear Regression"
)
rmse_rf, r2_rf = evaluate_model(
    y_true=y_test, y_pred=y_pred_rf, model_name="Random Forest"
)

In [None]:
# Side-by-side metrics table for the two models.
# The R² column label uses the real superscript-two character rather than the
# mis-decoded byte sequence "RÂ²".
comparison = pd.DataFrame(
    {
        "Model": ["Linear Regression", "Random Forest"],
        "RMSE": [rmse_lr, rmse_rf],
        "R² Score": [r2_lr, r2_rf],
    }
)
print("\nModel Comparison:")
print(comparison)