In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Metrics
from sklearn.metrics import mean_squared_error, r2_score

# Set plot style
sns.set(style="whitegrid")

def load_and_clean_data(filepath='listings.csv.gz'):
    """Load a listings CSV and return a cleaned, outlier-filtered DataFrame.

    The gzip file at ``filepath`` is tried first; if it does not exist, the
    uncompressed ``listings.csv`` in the working directory is tried instead.

    Cleaning steps:
      * ``price`` has dollar signs and thousands separators removed and is
        converted to float; rows whose price cannot be parsed are dropped.
      * ``bathrooms`` is taken from the first number found in
        ``bathrooms_text``, or from an existing ``bathrooms`` column.
      * Rows priced outside the 5th-95th percentile range are removed.
      * ``neighbourhood_grouped`` keeps the ten most frequent values of
        ``neighbourhood_cleansed`` and labels everything else ``'Other'``.

    Args:
        filepath: Path to the gzip-compressed listings CSV.

    Returns:
        The cleaned DataFrame, or ``None`` (after printing an error) when no
        input file is found or a required column is missing.
    """
    print(f"Step 1: Loading and cleaning data from {filepath}...")
    try:
        df = pd.read_csv(filepath, compression='gzip')
    except FileNotFoundError:
        try:
            df = pd.read_csv('listings.csv')
        except FileNotFoundError:
            print(f"Error: {filepath} or listings.csv not found.")
            return None

    # --- Validate required columns before touching the data ---
    if 'price' not in df.columns:
        print("Error: 'price' column not found.")
        return None
    if 'bathrooms_text' not in df.columns and 'bathrooms' not in df.columns:
        print("Error: neither 'bathrooms_text' nor 'bathrooms' column found.")
        return None
    if 'neighbourhood_cleansed' not in df.columns:
        print("Error: 'neighbourhood_cleansed' column not found.")
        return None

    # --- Clean 'price' ---
    # Raw-string pattern avoids the invalid-escape warning. Unparseable
    # prices become NaN and are removed by the dropna below instead of
    # aborting the whole load.
    price_text = df['price'].replace({r'\$': '', ',': ''}, regex=True)
    df['price'] = pd.to_numeric(price_text, errors='coerce')
    df.dropna(subset=['price'], inplace=True)

    # --- Clean 'bathrooms_text' ---
    if 'bathrooms_text' in df.columns:
        # First integer/decimal in the text; entries without a number -> NaN.
        df['bathrooms'] = (
            df['bathrooms_text']
            .str.extract(r'(\d+\.?\d*)', expand=False)
            .astype(float)
        )
    else:
        # Fallback if the column is already named 'bathrooms'
        df['bathrooms'] = pd.to_numeric(df['bathrooms'], errors='coerce')

    # --- Filter outliers ---
    # Keep only rows whose price lies within the 5th-95th percentile range.
    p_05 = df['price'].quantile(0.05)
    p_95 = df['price'].quantile(0.95)
    df_filtered = df[df['price'].between(p_05, p_95)].copy()
    print(f"Filtered prices between £{p_05:.2f} (5th) and £{p_95:.2f} (95th).")

    # --- Feature engineering: neighbourhoods ---
    # One-hot encoding every neighbourhood would create too many columns, so
    # only the ten most frequent are kept and the rest are grouped together.
    neighbourhoods = df_filtered['neighbourhood_cleansed']
    top_10_neighbourhoods = neighbourhoods.value_counts().head(10).index
    df_filtered['neighbourhood_grouped'] = neighbourhoods.where(
        neighbourhoods.isin(top_10_neighbourhoods), 'Other'
    )

    print(f"Loaded and cleaned data. Shape: {df_filtered.shape}")
    return df_filtered

def define_and_preprocess(df):
    """Select model features, split train/test, and fit the preprocessing.

    Numerical columns are median-imputed and standardised; categorical
    columns are imputed with the most frequent value and one-hot encoded.
    The preprocessor is fitted on the training split only and then applied
    to the test split.

    Args:
        df: DataFrame containing the feature columns listed below and 'price'.

    Returns:
        A tuple ``(X_train_processed, X_test_processed, y_train, y_test,
        preprocessor, all_feature_names)``.
    """
    print("\nStep 2: Defining features and preprocessing...")

    # Columns fed to the models, grouped by how they are preprocessed.
    num_cols = [
        'accommodates', 'bedrooms', 'bathrooms', 'beds',
        'number_of_reviews', 'review_scores_rating',
        'latitude', 'longitude', 'minimum_nights'
    ]
    cat_cols = [
        'room_type',
        'host_is_superhost',
        'neighbourhood_grouped'  # engineered during data cleaning
    ]

    features = df[num_cols + cat_cols]
    target = df['price']

    # 80/20 split with a fixed seed so results are reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    print(f"Data split: {len(X_train)} train samples, {len(X_test)} test samples.")

    # Numerical branch: fill gaps with the median, then standardise.
    num_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    # Categorical branch: fill gaps with the mode, then one-hot encode.
    # Categories unseen during fitting are ignored at transform time.
    cat_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    preprocessor = ColumnTransformer(transformers=[
        ('num', Pipeline(steps=num_steps), num_cols),
        ('cat', Pipeline(steps=cat_steps), cat_cols),
    ])

    print("Fitting preprocessor and transforming data...")
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Recover readable column names for the encoded matrix: numerical names
    # first, followed by the expanded one-hot names.
    fitted_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
    encoded_names = fitted_encoder.get_feature_names_out(cat_cols)
    all_feature_names = [*num_cols, *encoded_names]

    return (
        X_train_processed,
        X_test_processed,
        y_train,
        y_test,
        preprocessor,
        all_feature_names,
    )

def run_and_evaluate_models(X_train, y_train, X_test, y_test):
    """
    Fit each candidate regressor on the training data and score it on the test data.

    Returns a dict keyed by the model's display name; each value is a dict
    holding the test-set 'r2', the test-set 'rmse', and the fitted 'model'.
    """
    print("\nStep 3: Training and evaluating models...")

    # Settings shared by both tree ensembles.
    ensemble_kwargs = {'n_estimators': 100, 'random_state': 42, 'n_jobs': -1}
    candidates = [
        ("Linear Regression", LinearRegression()),
        ("Random Forest", RandomForestRegressor(max_depth=10, **ensemble_kwargs)),
        ("XGBoost", XGBRegressor(max_depth=5, **ensemble_kwargs)),
    ]

    scores = {}
    for label, estimator in candidates:
        print(f"--- Training {label} ---")
        estimator.fit(X_train, y_train)

        print(f"--- Evaluating {label} ---")
        predictions = estimator.predict(X_test)

        # Test-set metrics: coefficient of determination and root MSE.
        r2 = r2_score(y_test, predictions)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        scores[label] = {'r2': r2, 'rmse': rmse, 'model': estimator}

        print(f"R2 Score: {r2:.4f}")
        print(f"RMSE: £{rmse:.2f}")
        # Separator sized to the model name, followed by a blank line.
        separator = "-" * (20 + len(label))
        print(separator + "\n")

    return scores

if __name__ == "__main__":
    # Run the full pipeline: load/clean -> preprocess -> train/evaluate.
    # load_and_clean_data() returns None after printing an error when the
    # input file or the 'price' column is missing.
    df = load_and_clean_data()

    if df is not None:
        # preprocessor and feature_names are not used below; they are kept
        # as names for later steps (e.g. feature analysis).
        X_train, X_test, y_train, y_test, preprocessor, feature_names = define_and_preprocess(df)

        # results maps each model name to its r2, rmse and fitted model.
        results = run_and_evaluate_models(X_train, y_train, X_test, y_test)



  df['price'] = df['price'].replace({'\$': '', ',': ''}, regex=True).astype(float)


Step 1: Loading and cleaning data from listings.csv.gz...
Filtered prices between £40.00 (5th) and £518.00 (95th).
Loaded and cleaned data. Shape: (56499, 80)

Step 2: Defining features and preprocessing...
Data split: 45199 train samples, 11300 test samples.
Fitting preprocessor and transforming data...

Step 3: Training and evaluating models...
--- Training Linear Regression ---
--- Evaluating Linear Regression ---
R2 Score: 0.5189
RMSE: £71.79
-------------------------------------

--- Training Random Forest ---
--- Evaluating Random Forest ---
R2 Score: 0.6350
RMSE: £62.53
---------------------------------

--- Training XGBoost ---
--- Evaluating XGBoost ---
R2 Score: 0.6611
RMSE: £60.25
---------------------------


ML Pipeline Complete. Initial results printed above.
Models and results are stored. Ready for feature analysis when you are.
