In [23]:
import pandas as pd
import numpy as np

# import the dataframe
df = pd.read_csv('./datasets/uber.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [25]:
df.describe()

Unnamed: 0.1,Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,200000.0,200000.0,200000.0,200000.0,199999.0,199999.0,200000.0
mean,27712500.0,11.359955,-72.527638,39.935885,-72.525292,39.92389,1.684535
std,16013820.0,9.901776,11.437787,7.720539,13.117408,6.794829,1.385997
min,1.0,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,13825350.0,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,27745500.0,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,41555300.0,12.5,-73.967154,40.767158,-73.963658,40.768001,2.0
max,55423570.0,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


In [28]:
df.isnull().sum()

Unnamed: 0           0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [30]:
df.dropna(inplace=True)

In [32]:
df.isnull().sum()

fare_amount          0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
distance_km          0
dtype: int64

In [34]:
upper_limit = df['fare_amount'].quantile(0.99)
lower_limit = df['fare_amount'].quantile(0.01)
df['fare_amount'] = df['fare_amount'].clip(lower_limit, upper_limit)
df['fare_amount'].describe()

count    199987.000000
mean         11.251634
std           9.050967
min           3.300000
25%           6.000000
50%           8.500000
75%          12.500000
max          53.300000
Name: fare_amount, dtype: float64

In [44]:
df['passenger_count'] = df['passenger_count'].clip(1,6)
df['passenger_count'].describe() 

count    199987.000000
mean          1.687075
std           1.303782
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max           6.000000
Name: passenger_count, dtype: float64

In [45]:
df = df.drop(['Unnamed: 0', 'key', 'pickup_datetime'], axis=1, errors='ignore')
corr_matrix = df.corr()
corr_matrix

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance_km
fare_amount,1.0,0.008121,-0.008337,0.008564,-0.009237,0.01272,0.026869
pickup_longitude,0.008121,1.0,-0.979048,0.949698,-0.936642,-0.000539,0.163551
pickup_latitude,-0.008337,-0.979048,1.0,-0.93669,0.958143,-0.000801,-0.142183
dropoff_longitude,0.008564,0.949698,-0.93669,1.0,-0.979692,-0.000155,0.140684
dropoff_latitude,-0.009237,-0.936642,0.958143,-0.979692,1.0,-0.001697,-0.124631
passenger_count,0.01272,-0.000539,-0.000801,-0.000155,-0.001697,1.0,-0.001371
distance_km,0.026869,0.163551,-0.142183,0.140684,-0.124631,-0.001371,1.0


In [46]:
from geopy.distance import geodesic

# Keep only valid lat/lon rows
df = df[
    (df['pickup_latitude'].between(-90, 90)) &
    (df['pickup_longitude'].between(-180, 180)) &
    (df['dropoff_latitude'].between(-90, 90)) &
    (df['dropoff_longitude'].between(-180, 180))
].copy()

# Now geodesic will work safely
def calculate_distance(row):
    pickup = (row['pickup_latitude'], row['pickup_longitude'])
    dropoff = (row['dropoff_latitude'], row['dropoff_longitude'])
    return geodesic(pickup, dropoff).km

df['distance_km'] = df.apply(calculate_distance, axis=1)


In [47]:
X = df[['distance_km', 'passenger_count']]  # features
y = df['fare_amount']                        # target


In [48]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [49]:
from sklearn.linear_model import LinearRegression

# Initialize the model
model = LinearRegression()

# Train (fit) the model on the training data
model.fit(X_train, y_train)


In [50]:
# Predict fare amounts for the test set
y_pred = model.predict(X_test)


In [51]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.2f}")


MAE: 5.85
RMSE: 8.96
R²: 0.00


In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

print("RF R²:", r2_score(y_test, rf_pred))

