In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset from "uber.csv" in the working directory and preview
# the first rows.
df = pd.read_csv("uber.csv")
df.head()


Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [2]:
# Impute missing values.
# Numeric columns: fill each gap with that column's median.
num_cols = df.select_dtypes(include=np.number).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Non-numeric columns: fill each gap with that column's most frequent value.
cat_cols = df.select_dtypes(exclude=np.number).columns
for col in cat_cols:
    modes = df[col].mode()
    # mode() returns an empty Series when the column is entirely missing;
    # there is nothing to impute with, so leave such a column untouched
    # instead of failing on the `[0]` lookup.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])


In [3]:

# Names of the columns stored with the generic `object` dtype.
cat_cols = df.select_dtypes(include=["object"]).columns
cat_cols


Index(['key', 'pickup_datetime'], dtype='object')

In [4]:

# Collect the object columns with at most 10 distinct values; only these are
# one-hot encoded.
low_cardinality_cols = []
for col in cat_cols:
    if df[col].nunique() <= 10:
        low_cardinality_cols.append(col)

print("Encoding these columns:", low_cardinality_cols)

# drop_first=True omits the first level of each encoded column.
df = pd.get_dummies(data=df, columns=low_cardinality_cols, drop_first=True)


Encoding these columns: []


In [5]:

# Drop object columns that have more than 50 distinct values.
#
# `cat_cols` was computed before the one-hot encoding step, so any column
# encoded there no longer exists in `df`. Checking membership first avoids a
# KeyError for those columns and also makes this cell safe to re-run after the
# drop has already happened.
# Columns with 11-50 distinct values are neither encoded nor dropped here.
high_cardinality_cols = [
    col for col in cat_cols
    if col in df.columns and df[col].nunique() > 50
]

print("Dropping columns:", high_cardinality_cols)

# Rebind instead of mutating in place so the cell is idempotent.
df = df.drop(columns=high_cardinality_cols)


Dropping columns: ['key', 'pickup_datetime']


In [6]:
# Positional split: every column except the last is a feature, and the last
# column is the target.
# NOTE(review): the last column at this point is `passenger_count` (see the
# correlation output further down), while a later cell models `fare_amount`
# -- confirm which target is intended for this section.
X, y = df.iloc[:, :-1], df.iloc[:, -1]


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.20
)


In [8]:


# Keep only numeric feature columns in both splits, then report what is left.
X_train = X_train.select_dtypes(include="number")
X_test = X_test.select_dtypes(include="number")

print("Remaining dtypes in X_train:", X_train.dtypes, sep="\n")


Remaining dtypes in X_train:
Unnamed: 0             int64
fare_amount          float64
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
dtype: object


In [9]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [10]:
# Correlation of every column with the last column of `df`, strongest first.
corr_with_target = (
    df.corr()
    .loc[:, df.columns[-1]]
    .sort_values(ascending=False)
)
corr_with_target


passenger_count      1.000000
fare_amount          0.010150
Unnamed: 0           0.002257
dropoff_longitude    0.000034
pickup_longitude    -0.000414
dropoff_latitude    -0.000660
pickup_latitude     -0.001560
Name: passenger_count, dtype: float64

In [11]:
# Write the scaled splits to CSV. The scaled arrays are wrapped back into
# DataFrames with the original column names, and the row index is omitted
# from the files.
pd.DataFrame(data=X_train_scaled, columns=X_train.columns).to_csv(
    path_or_buf="X_train_scaled.csv", index=False
)

pd.DataFrame(data=X_test_scaled, columns=X_test.columns).to_csv(
    path_or_buf="X_test_scaled.csv", index=False
)


    COMPONENT - 2
    

In [12]:

# Restrict both splits to numeric columns and show the resulting shapes.
X_train, X_test = (
    split.select_dtypes(include=[np.number]) for split in (X_train, X_test)
)

X_train.shape, X_test.shape


((160000, 6), (40000, 6))

In [13]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split and reuse it, unchanged, on the test
# split. The tuple is evaluated left to right, so fitting happens first.
scaler = StandardScaler()

X_train_scaled, X_test_scaled = (
    scaler.fit_transform(X_train),
    scaler.transform(X_test),
)


In [14]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score)


In [15]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features passed to ``model.fit``.
        X_test: Test features passed to ``model.predict``.
        y_train: Training targets passed to ``model.fit``.
        y_test: True test targets the predictions are compared against.

    Returns:
        A ``(mae, rmse, r2)`` tuple: mean absolute error, root of the mean
        squared error, and R^2 score of the test predictions.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return (
        mean_absolute_error(y_test, y_pred),
        np.sqrt(mean_squared_error(y_test, y_pred)),
        r2_score(y_test, y_pred),
    )


In [16]:
# Inspect the column names currently present in `df`.
df.columns


Index(['Unnamed: 0', 'fare_amount', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [18]:

# Remove the "Unnamed: 0" column so it is not used as a model feature.
# errors="ignore" plus rebinding (instead of inplace=True) keeps this cell
# re-runnable: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")


In [19]:
# Select the regression target by name; every other column is a feature.
target_col = "fare_amount"

y = df[target_col]
X = df.drop(columns=target_col)


In [20]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [21]:
import numpy as np

# Keep only numeric feature columns, then count how many columns of each
# dtype remain in the training split.
X_train = X_train.select_dtypes(include="number")
X_test = X_test.select_dtypes(include="number")

X_train.dtypes.value_counts()


float64    4
int64      1
Name: count, dtype: int64

In [22]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then transform both splits with
# the fitted scaler.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)


In [24]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate_model(model):
    """Fit ``model`` on the scaled training data and score it on the test data.

    This redefines the earlier five-argument ``evaluate_model``: instead of
    taking the data as parameters it reads the notebook-level variables
    ``X_train_scaled``, ``X_test_scaled``, ``y_train`` and ``y_test``.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        A ``(mae, rmse, r2)`` tuple: mean absolute error, root of the mean
        squared error, and R^2 score of the test predictions.
    """
    model.fit(X_train_scaled, y_train)
    test_predictions = model.predict(X_test_scaled)

    return (
        mean_absolute_error(y_test, test_predictions),
        np.sqrt(mean_squared_error(y_test, test_predictions)),
        r2_score(y_test, test_predictions),
    )


# Candidate regressors to compare, keyed by the label shown in the results.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.01),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(
        n_estimators=150, random_state=42, n_jobs=-1
    ),
}

# Fit and score every candidate, collecting one (name, mae, rmse, r2) row each.
results = []
for name, model in models.items():
    mae, rmse, r2 = evaluate_model(model)
    results.append((name, mae, rmse, r2))

results_df = pd.DataFrame(results, columns=["Model", "MAE", "RMSE", "R2 Score"])

# Display the table with the highest R^2 first; `results_df` itself keeps the
# original model order.
results_df.sort_values("R2 Score", ascending=False)


Unnamed: 0,Model,MAE,RMSE,R2 Score
4,Random Forest,2.29129,5.65885,0.698812
3,Decision Tree,3.072788,7.083113,0.528123
2,Lasso Regression,6.103557,10.309387,0.000353
1,Ridge Regression,6.103717,10.309518,0.000328
0,Linear Regression,6.103717,10.309518,0.000328


In [27]:

# Show the metrics table in the order the models were evaluated.
results_df


Unnamed: 0,Model,MAE,RMSE,R2 Score
0,Linear Regression,6.103717,10.309518,0.000328
1,Ridge Regression,6.103717,10.309518,0.000328
2,Lasso Regression,6.103557,10.309387,0.000353
3,Decision Tree,3.072788,7.083113,0.528123
4,Random Forest,2.29129,5.65885,0.698812


In [25]:
# Train a random forest on the scaled training data.
# NOTE(review): this model uses n_estimators=200, whereas the random forest
# in the comparison table was built with n_estimators=150, so the reported
# metrics do not describe this exact model -- confirm which setting is intended.
best_model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)

best_model.fit(X_train_scaled, y_train)


In [28]:
# Render the metrics table with rounded values: two decimals for the error
# metrics and three for the R^2 score.
results_df.style.format(
    formatter={"MAE": "{:.2f}", "RMSE": "{:.2f}", "R2 Score": "{:.3f}"}
)


Unnamed: 0,Model,MAE,RMSE,R2 Score
0,Linear Regression,6.1,10.31,0.0
1,Ridge Regression,6.1,10.31,0.0
2,Lasso Regression,6.1,10.31,0.0
3,Decision Tree,3.07,7.08,0.528
4,Random Forest,2.29,5.66,0.699
