# Fare Prediction Pipeline: Distance + Transport Type


In [None]:
# Setup & Imports
import os, warnings
warnings.filterwarnings("ignore")
import pandas as pd, numpy as np
np.random.seed(42)
from typing import List, Tuple, Dict, Any

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
import joblib
from sklearn.pipeline import Pipeline as SKPipeline
from sklearn.compose import ColumnTransformer as SKColumnTransformer

# Helper Functions: Metrics + Rounding

In [None]:
def metrics_dict(y_true, y_pred, n_features=None):
    """Score predictions with R2, adjusted R2 and MSE.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.
        n_features: Number of predictors used by the model. When given and
            smaller than ``len(y_true) - 1``, adjusted R2 is computed;
            otherwise ``Adj_R2`` stays ``None``.

    Returns:
        Dict with keys ``"R2"``, ``"Adj_R2"`` and ``"MSE"``.
    """
    scores = {
        "R2": r2_score(y_true, y_pred),
        "Adj_R2": None,
        "MSE": mean_squared_error(y_true, y_pred),
    }
    if n_features is None:
        return scores

    n = len(y_true)
    # Adjusted R2 needs a positive denominator: n - n_features - 1 > 0.
    if n_features < n - 1:
        scores["Adj_R2"] = 1 - (1 - scores["R2"]) * (n - 1) / (n - n_features - 1)
    return scores

def print_metrics(name, m):
    """Print one summary line for a metrics dict produced by ``metrics_dict``.

    Args:
        name: Label for the line; left-aligned and padded to 35 characters.
        m: Mapping with keys ``"R2"``, ``"Adj_R2"`` (may be ``None``) and ``"MSE"``.
    """
    # The adjusted-R2 segment is omitted entirely when no value was computed.
    if m['Adj_R2'] is None:
        adj = ""
    else:
        adj = f" | Adj_R2={m['Adj_R2']:.4f}"
    print(f"{name:35s} | R2={m['R2']:.4f}{adj} | MSE={m['MSE']:.3f}")

def print_metrics_r2_adj_mse(name, m):
    """Alias of ``print_metrics``: forwards both arguments and returns its result."""
    outcome = print_metrics(name, m)
    return outcome


In [None]:
def round_bus_style(vals):
    """Round fares onto the bus-style price grid.

    The fractional part of each value selects the rounded fare:

    * ``[0, 0.125)``     -> whole unit (``x.00``)
    * ``[0.125, 0.375)`` -> ``x.25``
    * ``[0.375, 0.75)``  -> ``x.50``
    * ``[0.75, 1)``      -> next whole unit

    NOTE(review): there is no ``x.75`` step, so e.g. 2.70 rounds down to 2.50.
    This mirrors the existing thresholds; presumably intentional - confirm.

    Args:
        vals: A scalar or an array-like of numeric fares.

    Returns:
        A Python ``float`` for scalar (or 0-d) input, otherwise a float
        ``numpy.ndarray`` with the same shape as the input. NaN inputs are
        returned as NaN instead of raising.
    """
    # 0-d arrays behave like scalars; the loop-based version could not iterate them.
    scalar = np.isscalar(vals) or np.ndim(vals) == 0
    arr = np.atleast_1d(np.asarray(vals, dtype=float))

    pounds = np.floor(arr)
    dec = arr - pounds
    # NaN fails every comparison and falls to the default step; NaN + 1.0 is
    # still NaN, so missing values propagate rather than crashing int().
    step = np.select(
        [dec < 0.125, dec < 0.375, dec < 0.75],
        [0.0, 0.25, 0.5],
        default=1.0,
    )
    out = np.round(pounds + step, 2)
    return float(out[0]) if scalar else out

In [None]:
class DistanceFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer producing distance features.

    Output always has a numeric ``distance_km`` column; when ``use_generated``
    is true it also has ``distance_log = log1p(distance_km)``.
    """

    def __init__(self, use_generated=True):
        # use_generated: whether transform() appends the log1p column.
        self.use_generated = use_generated

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a DataFrame of distance features built from ``X``.

        ``X`` may be a DataFrame containing ``distance_km`` or any input that
        ``pd.DataFrame(X, columns=['distance_km'])`` accepts. Values that
        cannot be parsed as numbers become NaN.
        """
        if isinstance(X, pd.DataFrame):
            frame = X
        else:
            frame = pd.DataFrame(X, columns=['distance_km'])

        km = pd.to_numeric(frame['distance_km'], errors='coerce')
        feats = pd.DataFrame({'distance_km': km})
        if not self.use_generated:
            return feats

        feats['distance_log'] = np.log1p(feats['distance_km'])
        return feats

In [None]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the configured columns."""

    def __init__(self, columns):
        # columns: label(s) used to index the incoming frame in transform().
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]`` wrapped in a DataFrame.

        Non-DataFrame input is first converted with ``pd.DataFrame(X)``.
        """
        if isinstance(X, pd.DataFrame):
            frame = X
        else:
            frame = pd.DataFrame(X)
        picked = frame[self.columns]
        return pd.DataFrame(picked)

In [None]:
class DummyEncoder(BaseEstimator, TransformerMixin):
    """Drop-first dummy encoding of one column with categories fixed at fit time.

    ``fit`` records the categories seen in the training data; ``transform``
    encodes against exactly those categories, so every transformed frame has
    the same dummy columns in the same order and the same dropped baseline
    (the first fitted category), regardless of which values it contains.
    """

    def __init__(self, column):
        # column: name of the column to encode.
        self.column = column
        # Filled by fit(); None means "not fitted yet".
        self.categories_ = None

    def fit(self, X, y=None):
        """Record the categories present in ``X[self.column]`` and return ``self``."""
        s = X[self.column].astype('category')
        self.categories_ = s.cat.categories.tolist()
        return self

    def transform(self, X):
        """Return drop-first dummy columns named ``<column>_<category>``.

        Values not seen during ``fit`` (and missing values) produce an all-zero
        row, i.e. they are indistinguishable from the dropped baseline category.
        If the encoder has not been fitted, categories are derived from ``X``
        itself, as the encoder did before categories were enforced.
        """
        s = X[self.column]
        if self.categories_ is None:
            s = s.astype('category')
        else:
            # Blank out unseen values first, then pin the fitted category list
            # so the output columns do not depend on the values in this frame.
            fitted_dtype = pd.CategoricalDtype(categories=self.categories_)
            s = s.where(s.isin(self.categories_)).astype(fitted_dtype)
        dummies = pd.get_dummies(s, drop_first=True)
        dummies.columns = [f"{self.column}_{c}" for c in dummies.columns]
        return dummies

In [None]:
class TargetEncoder(BaseEstimator, TransformerMixin):
    """Encode one categorical column by a smoothed per-category target mean.

    Each category's mean is blended with the global mean using the weight
    ``1 / (1 + exp(-(count - smoothing)))``, so categories with few samples
    stay close to the global mean.
    """

    def __init__(self, column, smoothing=10.0):
        # column: name of the categorical column to encode.
        self.column = column
        # smoothing: sample count at which the blend weight equals 0.5.
        self.smoothing = smoothing
        # Both are set by fit().
        self.global_mean_ = None
        self.category_mean_ = None

    def fit(self, X, y):
        """Learn the global mean and the blended mean of ``y`` per category."""
        paired = pd.DataFrame({self.column: X[self.column], 'y': y})
        self.global_mean_ = paired['y'].mean()

        per_category = paired.groupby(self.column)['y']
        cat_mean = per_category.mean()
        cat_count = per_category.count()

        weight = 1 / (1 + np.exp(-(cat_count - self.smoothing)))
        self.category_mean_ = self.global_mean_ * (1 - weight) + cat_mean * weight
        return self

    def transform(self, X):
        """Return a one-column DataFrame ``<column>_te`` of encoded values.

        Categories not seen in ``fit`` receive the global mean.
        """
        encoded = X[self.column].map(self.category_mean_)
        encoded = encoded.fillna(self.global_mean_)
        return encoded.to_frame(name=f"{self.column}_te")

In [None]:
# Load both raw tables with every column read as text; the numeric columns
# are parsed explicitly in a later step.
prices = pd.read_csv(filepath_or_buffer='prices.csv', dtype=str)
dists = pd.read_csv(filepath_or_buffer='trip_distances.csv', dtype=str)

In [None]:
# Parse the numeric columns; anything unparseable becomes NaN.
# 'passangers' is the column name used throughout this notebook
# (presumably it matches the CSV header - verify against the data).
prices = prices.assign(
    Price=pd.to_numeric(prices.get('Price'), errors='coerce'),
    passangers=pd.to_numeric(prices.get('passangers'), errors='coerce'),
)
dists = dists.assign(
    distance_km=pd.to_numeric(dists.get('distance_km'), errors='coerce'),
)

In [None]:
# Attach distances to the price rows, keep the first row per trip_id,
# then discard trips that have no usable distance.
df = (
    prices.merge(dists, on='trip_id', how='left')
    .drop_duplicates(subset=['trip_id'])
    .reset_index(drop=True)
)
df = df[df['distance_km'].notna()].reset_index(drop=True)

In [None]:
# Rows with a price are used for modelling; rows without one get predicted later.
known = df.loc[df['Price'].notna()].reset_index(drop=True)
unknown = df.loc[df['Price'].isna()].reset_index(drop=True)
print(f"Known prices: {len(known)} | Unknown to predict: {len(unknown)}")

Known prices: 61 | Unknown to predict: 102


In [None]:
# 80/20 split over row positions of `known`, with a fixed seed for reproducibility.
train_idx, test_idx = train_test_split(
    np.arange(known.shape[0]), test_size=0.20, random_state=42
)
train_df, test_df = (
    known.iloc[positions].reset_index(drop=True)
    for positions in (train_idx, test_idx)
)

In [None]:
# Raw input columns: numeric predictors first, then categorical predictors.
num_cols_raw, cat_cols = ['distance_km'], ['agency_id']

In [None]:
# Build pipeline variants
def build_pipeline(encoding: str, scale: bool, model_name: str, use_generated=True, ridge_alphas=None):
    """Build the unfitted pieces of one experiment variant.

    Args:
        encoding: How ``agency_id`` is encoded: ``'onehot'``, ``'dummy'`` or ``'target'``.
        scale: If true, the distance features end with a ``StandardScaler``;
            otherwise with a median ``SimpleImputer``.
        model_name: ``'lr'`` for ``LinearRegression`` or ``'ridge'`` for ``Ridge``.
        use_generated: Forwarded to ``DistanceFeatures``; when true the
            distance branch also emits ``distance_log``.
        ridge_alphas: Candidate ``alpha`` values for Ridge; a default grid is
            used when omitted or empty.

    Returns:
        ``((num_distance, cat_pipe), assemble_features, est, param_grid)`` where
        the first element holds the two unfitted transformers,
        ``assemble_features(X, fitted)`` builds the feature frame from fitted
        transformers, ``est`` is the unfitted estimator and ``param_grid`` is
        the Ridge search grid (``None`` for ``'lr'``).

    Raises:
        ValueError: If ``encoding`` or ``model_name`` is not a supported value.
    """
    # NOTE(review): the step named "identity" is a median imputer, so NaN
    # handling differs between scale=True (NaN passes through the scaler) and
    # scale=False (NaN is imputed) - confirm this asymmetry is intended.
    last_step = ("scaler", StandardScaler()) if scale else ("identity", SimpleImputer(strategy='median'))
    num_distance = Pipeline([
        ("dist_feats", DistanceFeatures(use_generated=use_generated)),
        last_step,
    ])

    # Agency encoders
    if encoding == 'onehot':
        cat_transform = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        cat_pipe = ColumnTransformer([("oh", cat_transform, cat_cols)], remainder='drop')
    elif encoding == 'dummy':
        cat_pipe = Pipeline([("dummy", DummyEncoder(column='agency_id'))])
    elif encoding == 'target':
        cat_pipe = Pipeline([("target", TargetEncoder(column='agency_id', smoothing=5.0))])
    else:
        raise ValueError("encoding must be one of ['onehot','dummy','target']")

    def assemble_features(X, fitted):
        """Combine distance and agency features for ``X`` into one DataFrame.

        ``fitted`` is the ``(num_distance, cat_pipe)`` pair after fitting.
        Every part is indexed by ``X.index`` so the column-wise concat lines
        rows up by label even when ``X`` does not have a default RangeIndex;
        otherwise misaligned rows would be silently zero-filled below.
        """
        nd, cp = fitted
        d_part = nd.transform(X[num_cols_raw])
        cat_part = cp.transform(X)

        d_cols = ['distance_km', 'distance_log'] if use_generated else ['distance_km']
        d_df = pd.DataFrame(d_part, columns=d_cols, index=X.index)

        if encoding == 'onehot':
            cat_names = list(cp.named_transformers_['oh'].get_feature_names_out(['agency_id']))
            cat_df = pd.DataFrame(cat_part, columns=cat_names, index=X.index)
        elif encoding == 'dummy':
            # DummyEncoder already returns a frame indexed like X.
            cat_df = pd.DataFrame(cat_part)
        else:
            # TargetEncoder already returns a frame indexed like X.
            cat_df = pd.DataFrame(cat_part, columns=['agency_id_te'])

        features = pd.concat([d_df, cat_df], axis=1).fillna(0)
        return features

    # Estimator
    if model_name == 'lr':
        est = LinearRegression()
        param_grid = None
    elif model_name == 'ridge':
        est = Ridge()
        param_grid = {"alpha": ridge_alphas or [0.1, 0.3, 1.0, 3.0, 10.0]}
    else:
        raise ValueError("model_name must be 'lr' or 'ridge'")

    return (num_distance, cat_pipe), assemble_features, est, param_grid

In [None]:
# Experiments
# Raw inputs and targets for the train/test splits (copies, so later
# processing cannot touch train_df / test_df).
X_train_raw = train_df[num_cols_raw + cat_cols].copy()
X_test_raw = test_df[num_cols_raw + cat_cols].copy()
y_train = train_df['Price'].to_numpy()
y_test = test_df['Price'].to_numpy()

# Grid of variants to compare.
encodings = ['onehot', 'dummy', 'target']
scaling_opts = [False, True]
models = ['lr', 'ridge']
ridge_alphas = [0.1, 0.3, 1.0, 3.0, 10.0]

# Accumulators filled by the experiment loop.
rows, experiments = [], []

In [None]:
# Experiments: define feature configs (distance vs agency)
# Each entry is (selector, tag): the selector picks columns from an assembled
# feature frame, the tag labels the configuration in experiment names.
feature_configs = [
    (lambda df: df[['distance_log']], 'distance_log'),
    (
        lambda df: df[
            ['distance_log']
            + [name for name in df.columns if name.startswith('agency_id_') or name == 'agency_id_te']
        ],
        'distance_log + agency',
    ),
]

In [None]:
# Experiments: fit transformers per setting
# Local import: each LinearRegression run needs its own unfitted copy (see below).
from sklearn.base import clone

for enc in encodings:
    for scale in scaling_opts:
        for model_name in models:
            (num_distance, cat_pipe), assemble, est, param_grid = build_pipeline(
                encoding=enc, scale=scale, model_name=model_name, use_generated=True,
                ridge_alphas=ridge_alphas
            )
            # Fit transformers only on train (to avoid leakage)
            num_distance.fit(X_train_raw[num_cols_raw])
            if enc == 'target':
                # Target encoding is the only encoder that needs the labels.
                cat_pipe.fit(X_train_raw, y_train)
            else:
                cat_pipe.fit(X_train_raw)
            fitted = (num_distance, cat_pipe)

            # Assemble features
            X_train_feats = assemble(pd.DataFrame({
                'distance_km': X_train_raw['distance_km'],
                'agency_id': X_train_raw['agency_id']
            }), fitted)
            X_test_feats = assemble(pd.DataFrame({
                'distance_km': X_test_raw['distance_km'],
                'agency_id': X_test_raw['agency_id']
            }), fitted)

            # Evaluate configs
            for make_feats, tag in feature_configs:
                # if tag needs agency columns, ensure they exist
                if 'agency' in tag:
                    agency_cols = [c for c in X_train_feats.columns if c.startswith('agency_id_') or c=='agency_id_te']
                    if len(agency_cols) == 0:
                        continue
                Xtr = make_feats(X_train_feats).values
                Xte = make_feats(X_test_feats).values
                exp_name = f"enc={enc}_scale={scale}_model={model_name}_feats={tag}"

                if model_name == 'ridge':
                    # GridSearchCV clones `est` internally; best_estimator_ is a
                    # separate object refit on the full training features.
                    gs = GridSearchCV(est, param_grid=param_grid, cv=3)
                    gs.fit(Xtr, y_train)
                    best_est = gs.best_estimator_
                    model_desc = f"Ridge(alpha={best_est.alpha})"
                else:
                    # Fit a clone: `est` is shared by every feature config of
                    # this setting, so fitting it in place would make earlier
                    # entries in `experiments` point to a model that was later
                    # refit on different features.
                    best_est = clone(est).fit(Xtr, y_train)
                    model_desc = "LinearRegression"

                y_pred = best_est.predict(Xte)
                m = metrics_dict(y_test, y_pred, n_features=Xtr.shape[1])
                experiments.append({"name": exp_name, "model": best_est, "features": tag, "metrics": m})
                print_metrics_r2_adj_mse(f"{exp_name} | {model_desc}", m)
                rows.append({
                    "experiment": exp_name,
                    "model": model_desc,
                    "features": tag,
                    "R2": m['R2'], "Adj_R2": m['Adj_R2'], "MSE": m['MSE']
                })

enc=onehot_scale=False_model=lr_feats=distance_log | LinearRegression | R2=0.7976 | Adj_R2=0.7792 | MSE=1.620
enc=onehot_scale=False_model=lr_feats=distance_log + agency | LinearRegression | R2=0.7898 | Adj_R2=0.7198 | MSE=1.682
enc=onehot_scale=False_model=ridge_feats=distance_log | Ridge(alpha=3.0) | R2=0.7714 | Adj_R2=0.7506 | MSE=1.830
enc=onehot_scale=False_model=ridge_feats=distance_log + agency | Ridge(alpha=0.3) | R2=0.7890 | Adj_R2=0.7187 | MSE=1.689
enc=onehot_scale=True_model=lr_feats=distance_log | LinearRegression | R2=0.7976 | Adj_R2=0.7792 | MSE=1.620
enc=onehot_scale=True_model=lr_feats=distance_log + agency | LinearRegression | R2=0.7898 | Adj_R2=0.7198 | MSE=1.682
enc=onehot_scale=True_model=ridge_feats=distance_log | Ridge(alpha=3.0) | R2=0.7798 | Adj_R2=0.7597 | MSE=1.763
enc=onehot_scale=True_model=ridge_feats=distance_log + agency | Ridge(alpha=0.3) | R2=0.7896 | Adj_R2=0.7195 | MSE=1.684
enc=dummy_scale=False_model=lr_feats=distance_log | LinearRegression | R2=0.

In [None]:
# Experiments: summary and best selection
# Rank by adjusted R2, then R2 (both descending), then MSE (ascending).
summary_df = pd.DataFrame(rows)
summary_df = summary_df.sort_values(
    by=["Adj_R2", "R2", "MSE"], ascending=[False, False, True]
)
summary_df = summary_df.reset_index(drop=True)
print("\n==== Comparison summary (distance/agency/passangers) ====")
display(summary_df)

# After sorting, the first row is the top-ranked experiment.
best_row = summary_df.iloc[0]
print("\nBest overall experiment & model:")
display(best_row)

best_exp_name, best_model_desc, best_features_tag = (
    best_row[key] for key in ("experiment", "model", "features")
)

# Retrieve model object
best_model_obj = None
for exp in experiments:
    if (exp["name"], exp['features']) != (best_exp_name, best_features_tag):
        continue
    best_model_obj = exp["model"]
    break

print(f"\nChosen best: {best_model_desc} from {best_exp_name} (features={best_features_tag})")


==== Comparison summary (distance/agency/passangers) ====


Unnamed: 0,experiment,model,features,R2,Adj_R2,MSE
0,enc=onehot_scale=True_model=lr_feats=distance_log,LinearRegression,distance_log,0.797625,0.779228,1.620046
1,enc=dummy_scale=True_model=lr_feats=distance_log,LinearRegression,distance_log,0.797625,0.779228,1.620046
2,enc=target_scale=True_model=lr_feats=distance_log,LinearRegression,distance_log,0.797625,0.779228,1.620046
3,enc=onehot_scale=False_model=lr_feats=distance...,LinearRegression,distance_log,0.797625,0.779228,1.620046
4,enc=dummy_scale=False_model=lr_feats=distance_log,LinearRegression,distance_log,0.797625,0.779228,1.620046
5,enc=target_scale=False_model=lr_feats=distance...,LinearRegression,distance_log,0.797625,0.779228,1.620046
6,enc=onehot_scale=True_model=ridge_feats=distan...,Ridge(alpha=3.0),distance_log,0.779759,0.759737,1.76307
7,enc=dummy_scale=True_model=ridge_feats=distanc...,Ridge(alpha=3.0),distance_log,0.779759,0.759737,1.76307
8,enc=target_scale=True_model=ridge_feats=distan...,Ridge(alpha=3.0),distance_log,0.779759,0.759737,1.76307
9,enc=onehot_scale=False_model=ridge_feats=dista...,Ridge(alpha=3.0),distance_log,0.771394,0.750611,1.830035



Best overall experiment & model:


Unnamed: 0,0
experiment,enc=onehot_scale=True_model=lr_feats=distance_log
model,LinearRegression
features,distance_log
R2,0.797625
Adj_R2,0.779228
MSE,1.620046



Chosen best: LinearRegression from enc=onehot_scale=True_model=lr_feats=distance_log (features=distance_log)


# Why scores drop when adding agency

- Why `distance_log + agency` can score lower than `distance_log` alone:
  - Small sample size: With limited data, adding many categorical dummy columns can overfit and reduce generalization; Adjusted R² reflects this.
  - Sparse categories: If some `agency_id` values appear rarely, their coefficients are poorly estimated, adding variance without enough gain.
  - Collinearity/noise: If agency effects are small compared to the dominant distance signal, the added features contribute mostly noise.

- Why we still prefer `distance_log + agency` with Ridge:
  - Regularization shrinks unstable coefficients and controls overfitting; it is the safer choice when you expect real agency-level pricing differences.

  - Future data: With more observations per agency, those effects will be estimated better, and the model can leverage them. Without `agency_id`, the model risks attributing all variation to distance, i.e. it will over-depend on distance.
  - Practical pricing: Agencies often have systematic markups/discounts. Keeping the feature lets the model learn them as data grows

- Target encoding vs One-Hot/Dummy for `agency_id`:
  - Target encoding (with smoothing) maps each category to an average target; it is efficient for high-cardinality features and can capture ordering-like effects. It looked like the more logical choice here, but it did not lead to the best results, and it can be biased when category counts are small.


  - We fit target encoding on the training set only, with smoothing, but if some agencies have very few samples, their encoded means still have high variance. Ridge then has to correct this noisy signal.

  - One-Hot/Dummy encoding performed better, likely because the categories are few: linear models with Ridge can estimate clean, separate offsets per agency without compressing the information into a single mean. This avoids the bias of target encoding when counts are small.



# Final Model

In [None]:
# Final pipeline: scaled distance_log + drop-first one-hot agency_id -> Ridge(alpha=0.1)
preprocess_best = SKColumnTransformer(
    transformers=[
        (
            # distance_km -> distance_log only -> standardised
            "distance",
            SKPipeline(steps=[
                ("dist", DistanceFeatures(use_generated=True)),
                ("select_log", ColumnSelector(columns=["distance_log"])),
                ("scaler", StandardScaler()),
            ]),
            num_cols_raw,
        ),
        (
            # Unknown agencies are ignored at predict time instead of raising.
            "agency",
            OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False),
            cat_cols,
        ),
    ],
    remainder='drop',
)

ridge_pipe = SKPipeline(steps=[
    ("preprocess", preprocess_best),
    ("model", Ridge(alpha=0.1)),
])

In [None]:
# Inputs and targets for fitting on the train split and scoring on the test split.
X_train = train_df[num_cols_raw + cat_cols]
X_test = test_df[num_cols_raw + cat_cols]
y_train = train_df['Price'].to_numpy()
y_test = test_df['Price'].to_numpy()

In [None]:
# Fit on the training split only, then score the held-out test split.
y_pred_rg = ridge_pipe.fit(X_train, y_train).predict(X_test)
print_metrics(
    "Pipeline Ridge (alpha=0.1, dummy, WITH-scaling)",
    metrics_dict(y_test, y_pred_rg, n_features=None),
)

Pipeline Ridge (alpha=0.1, dummy, WITH-scaling) | R2=0.7902 | MSE=1.680


In [None]:
# Inputs for the final refit on every priced trip and for predicting the
# unpriced ones (same pipeline as above: drop-first one-hot, scaling, alpha=0.1).
X_known_raw = known[num_cols_raw + cat_cols].copy()
y_known = known['Price'].to_numpy()
X_unknown_raw = unknown[num_cols_raw + cat_cols].copy()

In [None]:
# Refit the fixed estimator (preprocess + Ridge(alpha=0.1)) on all known prices.
# best_est_all is the same object as ridge_pipe, which is refit in place.
best_est_all = ridge_pipe
best_est_all.fit(X_known_raw, y_known)

In [None]:
# Persist the refit pipeline next to the notebook.
joblib.dump(best_est_all, filename='fare_model_sklearn.joblib')
print("Saved model: fare_model_sklearn.joblib")

Saved model: fare_model_sklearn.joblib


In [None]:
# Predict the missing prices and snap them onto the bus-style fare grid.
pred_unknown = best_est_all.predict(X_unknown_raw)
unknown = unknown.assign(Price_pred=round_bus_style(pred_unknown))

In [None]:
# Save files: predictions only, then known + predicted prices in one table.
unknown.loc[:, ['trip_id', 'Price_pred']].to_excel('predicted_unknown_prices.xlsx', index=False)
all_trips = pd.concat(
    [
        frame[['trip_id', price_col]].rename(columns={price_col: 'Price_final'})
        for frame, price_col in ((known, 'Price'), (unknown, 'Price_pred'))
    ],
    ignore_index=True,
)
all_trips.to_excel('all_trips_with_prices.xlsx', index=False)