In [2]:
# -------------------------------
# 📦 1. Imports
# -------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [3]:
# -------------------------------
# 📥 2. Load the data
# -------------------------------
# Restart point: after a kernel crash the cleaned data set can be read
# straight from disk here instead of being rebuilt by earlier cells.
data_no_outliers = pd.read_parquet(
    'data/final_data_cleaned_NIET_scaled.parquet'
)
data_no_outliers.head()


Unnamed: 0,passenger_count,trip_distance,RatecodeID_1.0,RatecodeID_2.0,RatecodeID_3.0,RatecodeID_4.0,RatecodeID_5.0,RatecodeID_6.0,RatecodeID_99.0,payment_type_1,...,Trip_Staten Island->Brooklyn,Trip_Staten Island->Manhattan,Trip_Staten Island->Queens,Trip_Staten Island->Staten Island,trip_duration,pickup_hour,pickup_day,peak_times,weekend,total_amount_no_tip
0,1.0,9.76,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,18.75,0.0,6.0,0,1,47.25
1,1.0,7.62,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,32.183334,23.0,5.0,0,1,42.299999
2,4.0,20.07,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,34.183334,0.0,6.0,0,1,82.690002
3,3.0,2.34,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,15.0,0.0,6.0,0,1,20.6
4,1.0,5.05,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,22.200001,0.0,6.0,0,1,31.799999


In [4]:
import statsmodels.api as sm

def train_regression_with_significance(data, target_column, not_hot_columns, test_size=0.2, random_state=42):
    """Fit an OLS regression on min-max-scaled features and report its fit.

    Only the columns named in ``not_hot_columns`` are used as regressors.
    Every other column of ``data`` (apart from the target) is ignored by the
    model, so one-hot encoded columns present in ``data`` do not influence
    the fit.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``target_column`` and every column listed in
        ``not_hot_columns``.
    target_column : str
        Name of the column to predict.
    not_hot_columns : sequence of str
        Names of the feature columns; they are min-max scaled (scaler fitted
        on the training split only) and used as the regressors.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(model, scaler, coef_table)``: the fitted statsmodels OLS results,
        the ``MinMaxScaler`` fitted on the training features, and
        ``model.summary2().tables[1]``.

    Raises
    ------
    ValueError
        If ``target_column`` is also listed in ``not_hot_columns``.
    KeyError
        If ``target_column`` or a feature column is missing from ``data``
        (raised by pandas).

    Side effects
    ------------
    Prints the test-split mean squared error and R-squared, followed by the
    full statsmodels summary.
    """
    feature_columns = list(not_hot_columns)
    if target_column in feature_columns:
        raise ValueError(
            f"target_column {target_column!r} must not be listed in not_hot_columns"
        )

    # Select the regressors *before* splitting. The split depends only on the
    # number of rows and random_state, so the train/test rows are unchanged,
    # but the unused columns are no longer copied along.
    y = data[target_column]
    X = data[feature_columns]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training rows only, then reuse it for the test rows.
    scaler = MinMaxScaler()
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train),
        columns=feature_columns,
        index=X_train.index,
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test),
        columns=feature_columns,
        index=X_test.index,
    )

    # has_constant='add' always inserts the intercept column. The default
    # ('skip') omits it when an existing column happens to be constant in the
    # split, which would leave train and test design matrices inconsistent.
    X_train_sm = sm.add_constant(X_train_scaled, has_constant='add')
    model = sm.OLS(y_train, X_train_sm).fit()

    # Evaluate on the held-out split.
    X_test_sm = sm.add_constant(X_test_scaled, has_constant='add')
    y_pred = model.predict(X_test_sm)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R-squared: {r2:.4f}\n")
    print("📊 Regression Coefficients with Significance:\n")
    print(model.summary())

    return model, scaler, model.summary2().tables[1]



In [5]:
# -------------------------------
# 🚀 4. Use the function
# -------------------------------
# Predictors handed to the regression; the one-hot encoded columns of the
# data set are deliberately not listed here.
not_hot_columns = [
    'passenger_count',
    'trip_distance',
    'trip_duration',
    'pickup_hour',
    'pickup_day',
    'peak_times',
    'weekend',
]

model, scaler, coef_table = train_regression_with_significance(
    data=data_no_outliers,
    target_column='total_amount_no_tip',
    not_hot_columns=not_hot_columns,
)


Mean Squared Error: 21.6905
R-squared: 0.9249

📊 Regression Coefficients with Significance:

                             OLS Regression Results                            
Dep. Variable:     total_amount_no_tip   R-squared:                       0.924
Model:                             OLS   Adj. R-squared:                  0.924
Method:                  Least Squares   F-statistic:                 4.300e+06
Date:                 Wed, 02 Apr 2025   Prob (F-statistic):               0.00
Time:                         10:42:30   Log-Likelihood:            -7.3328e+06
No. Observations:              2472729   AIC:                         1.467e+07
Df Residuals:                  2472721   BIC:                         1.467e+07
Df Model:                            7                                         
Covariance Type:             nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------

In [6]:
def predict_fare(model, scaler, not_hot_columns, input_dict):
    """Predict the fare for a single trip with a previously fitted model.

    Parameters
    ----------
    model
        Fitted regression results exposing ``params`` (indexed by column
        name, including the intercept column) and ``predict``.
    scaler
        Fitted scaler exposing ``transform``; applied to the features in the
        order given by ``not_hot_columns``.
    not_hot_columns : sequence of str
        Feature names, in the order the scaler was fitted with.
    input_dict : dict
        Maps every name in ``not_hot_columns`` to the raw (unscaled) value
        for the trip. Keys not listed in ``not_hot_columns`` are ignored.

    Returns
    -------
    The first (and only) predicted value returned by ``model.predict``.

    Raises
    ------
    ValueError
        If ``input_dict`` lacks one or more of the required features.
        Without this check the missing values would silently become NaN.
    """
    # Imported locally so the helper does not depend on earlier cells.
    import pandas as pd
    import statsmodels.api as sm

    feature_columns = list(not_hot_columns)

    missing = [name for name in feature_columns if name not in input_dict]
    if missing:
        raise ValueError(f"input_dict is missing required feature(s): {missing}")

    # One-row frame with the columns in the order the scaler expects.
    input_df = pd.DataFrame([input_dict], columns=feature_columns)

    # Apply the same scaling that was fitted on the training data.
    input_scaled = pd.DataFrame(scaler.transform(input_df), columns=feature_columns)

    # Force the intercept column: with the default setting it may be skipped
    # when an existing column looks constant, which a one-row frame can do.
    input_scaled_with_const = sm.add_constant(input_scaled, has_constant='add')

    # Put the columns in exactly the order of the fitted coefficients.
    input_scaled_with_const = input_scaled_with_const[model.params.index]

    predictions = model.predict(input_scaled_with_const)

    # Take the first element by position, not by index label.
    if hasattr(predictions, "iloc"):
        return predictions.iloc[0]
    return predictions[0]


In [13]:
# One hand-written trip, used to try out the fitted model.
example_input = dict(
    passenger_count=2,
    trip_distance=1,
    trip_duration=13.5,
    pickup_hour=18,
    pickup_day=2,
    peak_times=1,
    weekend=0,
)

fare = predict_fare(model, scaler, not_hot_columns, input_dict=example_input)
print(f"Predicted fare (excluding tip): ${fare:.2f}")


Predicted fare (excluding tip): $17.78
