In [64]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import ast

In [65]:
def summarize_property_type(property_type):
        if "rental unit" in property_type:
            return "Rental Unit"
        elif "tiny" in property_type or "Tiny" in property_type:
            return "Tiny"
        elif "hotel" in property_type or "boutique hotel" in property_type:
            return "Hotel"
        elif "condo" in property_type:
            return "Condo"
        elif "townhouse" in property_type:
            return "Townhouse"
        elif "guest suite" in property_type or "guesthouse" in property_type:
            return "Guesthouse"
        elif "serviced apartment" in property_type or "aparthotel" in property_type:
            return "Serviced Apartment"
        elif "vacation home" in property_type or "bungalow" in property_type or "villa" in property_type or "cottage" in property_type:
            return "Vacation Home"
        elif "loft" in property_type:
            return "Loft"
        elif "hostel" in property_type:
            return "Hostel"
        elif "home" in property_type or "casa" in property_type:
            return "Home"
        else:
            return "Other"

In [66]:
class MissingValueTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.categorical_cols = None
        self.numerical_cols = None
        self.special_categorical_cols = ['description','host_since','first_review','last_review','bathrooms_text','amenities','reviews']

    def fit(self, X, y=None):
        # get the object columns
        self.numerical_cols = X.select_dtypes(include=[np.number]).columns
        self.categorical_cols = X.select_dtypes(include=[object, 'category']).columns.difference(self.special_categorical_cols)
        return self

    def transform(self, X):
        X = X.copy()
        # Fill missing values for numerical columns with mean
        X[self.numerical_cols] = X[self.numerical_cols].fillna(X[self.numerical_cols].mean())
        # Fill missing values for categorical columns with mode
        X[self.categorical_cols] = X[self.categorical_cols].fillna(X[self.categorical_cols].mode().iloc[0]).infer_objects()
        return X


class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=30, random_state=0):
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.kmeans_location = None

    def fit(self, X, y=None):
        # Fit KMeans for location clustering
        self.kmeans_location = KMeans(n_clusters=self.n_clusters, random_state=self.random_state).fit(X[['longitude','latitude']])
        return self

    def transform(self, X):
        # Copy the dataset to avoid changing the original data
        X = X.copy()

        # Simplify property type
        X['filtered_property_type'] = X['property_type'].apply(summarize_property_type)
        X = X.drop(columns=['property_type'])

        # 8. Cluster locations using fitted KMeans
        X['location'] = self.kmeans_location.predict(X[['longitude','latitude']])

        # Process date columns
        # for the datatime object, we will replace it with year + month/12, ignoring the day, etc.
        # eg. 2019-01-01 will be replaced with 2019 + 1/12 = 2019.0833
        date_cols = ['host_since', 'first_review', 'last_review']
        for col in date_cols:
            X[col] = X[col].dt.year + X[col].dt.month / 12
            X[col] = X[col].fillna(0)
        # Calculate host duration from 2025
        X['host_duration'] = 2025 - X['host_since']

        # Extract bathroom features
        # Assign 'bathrooms_shared' = 1 if 'shared' is in 'bathrooms_text' and 'bathrooms' > 0, else 0
        X['bathrooms_shared'] = X.apply(
            lambda row: 1 if ('shared' in str(row['bathrooms_text']).lower() and row['bathrooms'] > 0) else 0,
            axis=1)

        # Calculate accommodates ratios
        # calculate the bedroom, bed, and bathroom per accommodates can get
        X['beds_per_accommodates'] = X['beds'] / X['accommodates']
        X['bedrooms_per_accommodates'] = X['bedrooms'] / X['accommodates']
        X['bathrooms_per_accommodates'] = X['bathrooms'] / X['accommodates']

        # Count amenities
        X['amenities'] = X['amenities'].apply(ast.literal_eval)
        X['amenities_count'] = X['amenities'].apply(len)

        # Calculate availability ratios
        # Avoid division by zero using numpy where
        X['availability_30_ratio'] = np.where(X['availability_60'] == 0, 0, X['availability_30'] / X['availability_60'])
        X['availability_60_ratio'] = np.where(X['availability_90'] == 0, 0, X['availability_60'] / X['availability_90'])
        X['availability_90_ratio'] = np.where(X['availability_365'] == 0, 0, X['availability_90'] / X['availability_365'])

        # Calculate recent review ratios
        X['recent_month_review_ratio'] = X.apply(
            lambda row: row['number_of_reviews_l30d'] / row['number_of_reviews'] if row['number_of_reviews'] > 0 else 0,
            axis=1)
        X['recent_year_review_ratio'] = X.apply(
            lambda row: row['number_of_reviews_ltm'] / row['number_of_reviews'] if row['number_of_reviews'] > 0 else 0,
            axis=1)

        # Drop text columns
        return X.drop(columns=['name', 'description', 'bathrooms_text', 'amenities', 'reviews'])
    
class OrdinalEncoderPersonal(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        self.object_cols = None

    def fit(self, X, y=None):
        # get the object columns
        self.object_cols = X.select_dtypes(include=['object']).columns
        self.ordinal_encoder.fit(X[self.object_cols])
        return self

    def transform(self, X):
        X = X.copy()
        X[self.object_cols] = self.ordinal_encoder.transform(X[self.object_cols])
        return X

In [67]:
# Importing the dataset
train_data = pd.read_csv('./Data_Pure/train.csv',parse_dates=['host_since', 'first_review', 'last_review'])
train_target = train_data['price'] 
train_data = train_data.drop('price', axis = 1)

test_data = pd.read_csv('./Data_Pure/test.csv',parse_dates=['host_since', 'first_review', 'last_review'])
test_id = test_data['id']
test_data = test_data.drop('id', axis = 1)

# Concatenate the previously processed positive_review_ratio
train_review_pos_ratio = pd.read_csv('./Data_Preprocess/train_positive_ratio.csv')
test_review_pos_ratio = pd.read_csv('./Data_Preprocess/test_positive_ratio.csv')
train_data = pd.concat([train_data,train_review_pos_ratio], axis=1)
test_data = pd.concat([test_data, test_review_pos_ratio], axis=1)

In [68]:
# define the pipeline
feature_engineering_pipeline = Pipeline([
    # fill the missing values
    ('missing_values', MissingValueTransformer()),
    # Apply specific feature engineering
    ('feature_engineering', FeatureEngineeringTransformer()),
    # Encode object features
    ('ordinal_encoder', OrdinalEncoderPersonal())
])

# Perform the feature engineering
train_data = feature_engineering_pipeline.fit_transform(train_data)
test_data = feature_engineering_pipeline.transform(test_data)

  X[self.categorical_cols] = X[self.categorical_cols].fillna(X[self.categorical_cols].mode().iloc[0]).infer_objects()
  X[self.categorical_cols] = X[self.categorical_cols].fillna(X[self.categorical_cols].mode().iloc[0]).infer_objects()
