In [6]:
import pandas as pd
df_processed = pd.read_csv('proccessed_bookings.csv')

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR



# Select target variable and features
target = "discounted_price_per_night"
features = [
    "original_price_per_night", "review_score", "number_of_reviews", "star_rating", 
    "kms_from_centre", "location_score", "time_to_travel", "length_of_stay",
    "review_title_rank", "bed_type_rank"
] + [col for col in df_processed.columns if col in [
    "breakfast_included", "free_cancellation", "prepayment_needed"
]]

# Drop rows with missing values in selected features
df_model = df_processed[features + [target]].dropna()

# Split data into train and test sets (70-30 split)
X = df_model[features]
y = df_model[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(max_depth=10, random_state=42),
    "Gaussian Process Regressor": GaussianProcessRegressor(kernel=C(1.0) * RBF(length_scale=1.0), random_state=42),
    # Additional models=
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    "Support Vector Regressor": SVR(kernel='rbf', C=100, gamma=0.1)
}

# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    results[name] = {
        "R² (Train)": r2_score(y_train, y_train_pred),
        "R² (Test)": r2_score(y_test, y_test_pred),
        "MAE": mean_absolute_error(y_test, y_test_pred),
        "MSE": mean_squared_error(y_test, y_test_pred),
        "RMSE": np.sqrt(mean_squared_error(y_test, y_test_pred))
    }

# Convert results to a DataFrame and display
results_df = pd.DataFrame(results).T


In [8]:
results_df

Unnamed: 0,R² (Train),R² (Test),MAE,MSE,RMSE
Linear Regression,0.93696,0.937068,13.36474,362.042457,19.027413
Decision Tree Regressor,0.988053,0.968189,6.598156,183.006853,13.528003
Gaussian Process Regressor,1.0,-8.071038,207.495728,52184.941272,228.440236
Random Forest Regressor,0.991416,0.977943,6.112611,126.894266,11.264735
Gradient Boosting Regressor,0.990615,0.984702,5.500641,88.009944,9.381362
Support Vector Regressor,0.969142,0.520377,35.212891,2759.229964,52.528373
