In [17]:
import pandas as pd 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV


In [6]:
df = pd.read_csv("../cleaned_dataset.csv")

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1164996 entries, 0 to 1164995
Data columns (total 18 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   distance          1164996 non-null  float64
 1   cab_type          1164996 non-null  object 
 2   destination       1164996 non-null  object 
 3   source            1164996 non-null  object 
 4   price             1164996 non-null  float64
 5   surge_multiplier  1164996 non-null  float64
 6   name              1164996 non-null  object 
 7   date_time         1164996 non-null  object 
 8   temp              1164996 non-null  float64
 9   location          1164996 non-null  object 
 10  clouds            1164996 non-null  float64
 11  pressure          1164996 non-null  float64
 12  rain              1164996 non-null  float64
 13  humidity          1164996 non-null  float64
 14  wind              1164996 non-null  float64
 15  day               1164996 non-null  int64  
 16  

In [8]:
df.head()

Unnamed: 0,distance,cab_type,destination,source,price,surge_multiplier,name,date_time,temp,location,clouds,pressure,rain,humidity,wind,day,hour,month
0,0.44,Lyft,North Station,Haymarket Square,5.0,1.0,Shared,2018-12-16 09:30:07.890000105,38.46,Haymarket Square,0.29,1022.25,0.0,0.76,7.68,6,9,12
1,0.44,Lyft,North Station,Haymarket Square,11.0,1.0,Lux,2018-11-27 02:00:23.677000046,44.31,Haymarket Square,1.0,1003.17,0.1123,0.9,13.69,1,2,11
2,0.44,Lyft,North Station,Haymarket Square,11.0,1.0,Lux,2018-11-27 02:00:23.677000046,43.82,Haymarket Square,0.99,1002.59,0.0997,0.89,11.57,1,2,11
3,0.44,Lyft,North Station,Haymarket Square,26.0,1.0,Lux Black XL,2018-11-30 04:53:02.749000072,35.08,Haymarket Square,0.0,1013.71,0.0,0.7,5.25,4,4,11
4,0.44,Lyft,North Station,Haymarket Square,9.0,1.0,Lyft XL,2018-11-29 03:49:20.223000050,37.58,Haymarket Square,0.42,998.64,0.0,0.71,11.3,3,3,11


In [9]:
# Defining Target and features
y = df['price']

numerical_cols = ['distance','surge_multiplier','temp','clouds','pressure','rain','humidity','wind','hour','day','month']

catergorical_cols = ['cab_type','destination','name','source']

X = df[numerical_cols + catergorical_cols]


In [10]:
# Linear Regression without preprocessing

X1 = df[['distance', 'surge_multiplier', 'temp', 'clouds', 'pressure',
        'rain', 'humidity', 'wind', 'hour', 'day', 'month']]


y1 = df['price'] 


X_train, X_test, y_train, y_test = train_test_split(
    X1,y1, test_size=0.2, random_state=42)


model_regression = LinearRegression()
model_regression.fit(X_train, y_train)


In [11]:
# Performance Evaluation without pre processing


y_pred = model_regression.predict(X_test)

mae  = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)
r2   = r2_score(y_test, y_pred)

print(f"MAE : {mae}")
print(f"RMSE: {rmse}")
print(f"R²   : {r2}")

MAE : 6.949262804838439
RMSE: 71.86301369274967
R²   : 0.1720687561477816


In [12]:
# Pre processing pipeline for Linear Regression

numeric_pipeline = Pipeline([('scaler', StandardScaler())])

catergorical_pipeline = Pipeline([('ohe', OneHotEncoder())])

preprocess = ColumnTransformer([
    ('numeric', numeric_pipeline, numerical_cols),
    ('categorical', catergorical_pipeline,catergorical_cols)
])




In [14]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

linear_reg = Pipeline([
    ('prep', preprocess),
    ('model', LinearRegression())
])

linear_reg.fit(X_train, y_train)

In [15]:
y_pred = linear_reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test,y_pred)
r2= r2_score(y_test,y_pred)

print(f"MAE : {mae}")
print(f"RMSE : {rmse}")
print(f"R² : {r2}")

MAE : 1.7533634996476883
RMSE : 6.176122948934114
R² : 0.9288451055342931


In [26]:
# Lasso Regression
from sklearn.linear_model import Lasso

lasso_pipeline = Pipeline([
    ('prep', preprocess),
    ('model', Lasso(max_iter=5000))
])

param_grid = {
    'model__alpha': [0.001, 0.01, 0.1, 1.0, 10]
}


In [27]:
# Grid Search to tune

grid_linear = GridSearchCV(lasso_pipeline, param_grid, scoring='neg_mean_squared_error', cv=3)
grid_linear.fit(X_train, y_train)

print("Best alpha:", grid_linear.best_params_)
print("Best RMSE:", (-grid_linear.best_score_) ** 0.5)


Best alpha: {'model__alpha': 0.001}
Best RMSE: 2.491342768081312


In [28]:
y_predic = grid_linear.predict(X_test)

print("R²:", r2_score(y_test, y_pred))
print("Test RMSE:", mean_squared_error(y_test, y_predic))

R²: 0.9288451055342931
Test RMSE: 6.176370051367104


In [None]:
# XGBoost without Preprocessing

X2 = df[['distance', 'surge_multiplier', 'temp', 'clouds', 'pressure',
        'rain', 'humidity', 'wind', 'hour', 'day', 'month']]
y2 = df['price'] 


X_train, X_test, y_train, y_test = train_test_split(
    X2, y2, test_size=0.2, random_state=42)

model_xgboost = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    random_state=42
)
model_xgboost.fit(X_train, y_train)

In [None]:
# Performance Evaluation
y_pred = model_xgboost.predict(X_test)

mae  = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)
r2   = r2_score(y_test, y_pred)

print(f"MAE : {mae}")
print(f"RMSE: {rmse}")
print(f"R²   : {r2}")


MAE : 6.920541810908338
RMSE: 71.06324065134078
R²   : 0.1812829131243342


In [None]:
# XGBoost Regressor with Preprocessing

xgb_pipeline = Pipeline([
    ('prep', preprocess),            
    ('model', XGBRegressor(
        objective='reg:squarederror', 
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        random_state=42
    ))
])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
xgb_pipeline.fit(X_train, y_train)


In [None]:
# Performance Evaluation for XGBoost

y_pred = xgb_pipeline.predict(X_test)

mae  = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)
r2   = r2_score(y_test, y_pred)

print(f"MAE : {mae}")
print(f"RMSE: {rmse}")
print(f"R²   : {r2}")


MAE : 1.1089455346481996
RMSE: 2.803806845791862
R²   : 0.967697440309381


In [None]:
# Hyperparamter tuning using Grid Search 
from sklearn.model_selection import GridSearchCV

param_grid = {
    'model__n_estimators': [100,150,200],
    'model__learning_rate': [0.05,0.1,0.2],
    'model__max_depth': [4, 6,8]
}

grid = GridSearchCV(
    xgb_pipeline,
    param_grid,
    scoring='root_mean_squared_error',
    cv=3,
    n_jobs=1,
    verbose=1
)

grid.fit(X_train, y_train)



Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [None]:
print("Best Parameters:", grid.best_params_)
print("Best CV RMSE:", grid.best_score_)

Best Parameters: {'model__learning_rate': 0.2, 'model__max_depth': 8, 'model__n_estimators': 200}
Best CV RMSE: -1.5788302122942326


In [None]:
grid.fit(X_train, y_train)

y_pred_new = grid.predict(X_test)

new_mae = mean_absolute_error(y_test, y_pred_new)
new_rmse = mean_squared_error(y_test, y_pred_new)
new_r2 = r2_score(y_test, y_pred_new)

print(f"MAE  : {new_mae}")
print(f"RMSE : {new_rmse}")
print(f"R²   : {new_r2}")


Fitting 3 folds for each of 27 candidates, totalling 81 fits
MAE  : 1.0176097445154395
RMSE : 2.3972958175655346
R²   : 0.972380839514959
