# Support vector Machine

In [1]:
from sklearn.svm import SVR 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

In [2]:
data=pd.read_csv('../dataset/merged_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,...,pressure,rain,time_stamp_w,humidity,wind,date_time_w,merge_date_w,day,hour,month
0,0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,...,1022.25,0.0,1544954000.0,0.76,7.68,2018-12-16 09:45:01,Haymarket Square - 2018-12-16 - 9,6,9,12
1,1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,...,1003.17,0.1123,1543285000.0,0.9,13.69,2018-11-27 02:15:20,Haymarket Square - 2018-11-27 - 2,1,2,11
2,1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,...,1002.59,0.0997,1543287000.0,0.89,11.57,2018-11-27 02:45:20,Haymarket Square - 2018-11-27 - 2,1,2,11
3,3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,...,1013.71,0.0,1543554000.0,0.7,5.25,2018-11-30 04:52:54,Haymarket Square - 2018-11-30 - 4,4,4,11
4,4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,...,998.64,0.0,1543462000.0,0.71,11.3,2018-11-29 03:32:09,Haymarket Square - 2018-11-29 - 3,3,3,11


In [3]:
# Defining Target and features
y = data['price']

numerical_columns = ['distance','surge_multiplier','temp','clouds','pressure','rain','humidity','wind','hour','day','month']

categorical_columns = ['cab_type','destination','name','source']

X = data[numerical_columns + categorical_columns]
# Random Forest without preprocessing
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.2, random_state=42)
xtrain.shape, xtest.shape

((931996, 15), (233000, 15))

In [4]:
numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_preprocessor, numerical_columns),
    ('cat', categorical_preprocessor, categorical_columns)
])

# --- Create Pipeline with SVR ---
svr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', SVR())
])
svr_pipeline

In [5]:
svr_param_grid = {
    'model__C': [0.1, 1, 10],
    'model__epsilon': [0.1, 0.2, 0.5],
    'model__kernel': ['rbf', 'linear']
}


svr_grid_search = GridSearchCV(
    svr_pipeline,
    svr_param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1
)


svr_grid_search.fit(xtrain, ytrain)

KeyboardInterrupt: 

In [None]:
best_svr_model = svr_grid_search.best_estimator_
svr_preds = best_svr_model.predict(xtest)

rmse = np.sqrt(mean_squared_error(ytest, svr_preds))
mae = mean_absolute_error(ytest, svr_preds)
r2 = r2_score(ytest, svr_preds)

print("🔹 Tuned SVR Model")
print("Best Params:", svr_grid_search.best_params_)
print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")
print(f"R²: {r2:.3f}")