In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBRegressor
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

In [5]:
def drop_columns(df, columns_to_drop):
    """
    Drop specified columns from a DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame from which columns should be dropped.
    columns_to_drop (list): A list of column names to be dropped.

    Returns:
    pd.DataFrame: The DataFrame with specified columns dropped.
    """
    # Drop the specified columns
    df_dropped = df.drop(columns=columns_to_drop)
    return df_dropped

## Prep for pileine

In [6]:
data = pd.read_csv('/content/FE_RoomBasePrice.csv')

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 49 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   unit_type_name                 142 non-null    object 
 1   property_design                142 non-null    float64
 2   property_type                  142 non-null    float64
 3   number_of_bookings             142 non-null    int64  
 4   bedroom                        142 non-null    float64
 5   bathroom                       142 non-null    float64
 6   beds                           142 non-null    float64
 7   capacity                       142 non-null    float64
 8   ac                             142 non-null    float64
 9   balcony                        142 non-null    float64
 10  beachfront                     142 non-null    float64
 11  breakfast                      142 non-null    float64
 12  building_staff                 142 non-null    flo

In [8]:
columnsToDropBeforeTraining = ['Unnamed: 0.1','Unnamed: 0', 'room_id']
data = drop_columns(data, columnsToDropBeforeTraining)

In [10]:
average_baseline_price = data['average_baseline_price']
data = data.drop(columns='average_baseline_price')

In [11]:
X = data
y = average_baseline_price

In [15]:
# Example column names
numerical_features = ['bedroom','bathroom','beds','capacity','lat','lng','distance_to_coastline','area_distance_to_airport','total_fas','ratio_bedroom_bathroom','ratio_bedroom_cap',
                  'avg_price_distance_to_coast', 'avg_price_distance_to_airport', 'avg_price_bedroom','avg_price_beds','avg_price_bathroom','avg_price_total_fas']
categorical_features = ['unit_type_name', 'property_design', 'property_type','area_name']
unprocessed_features = ['ac', 'balcony', 'beachfront', 'breakfast', 'building_staff',
                             'cable_tv', 'essentials', 'garden', 'gym', 'hair_dryer',
                             'hanger', 'heating', 'hot_water', 'kitchen', 'linens',
                             'lock', 'luggage_drop_off', 'parking', 'pool',
                             'private_entrance', 'shampoo', 'tv', 'washer', 'wifi',
                             'workspace']

# Get indices of these columns
numerical_indices = [data.columns.get_loc(col) for col in numerical_features]
categorical_indices = [data.columns.get_loc(col) for col in categorical_features]
unprocessed_indices = [data.columns.get_loc(col) for col in unprocessed_features]

In [16]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features),
        ('passthrough', 'passthrough', unprocessed_features)
    ]
)

In [17]:
TEST_RATIO = 0.3
RANDOM_STATE = 123
params = {
    "n_estimators": 1000,
    "max_depth": 64,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "absolute_error",
}

In [19]:
model = RandomForestRegressor(n_estimators=58,max_depth=params["max_depth"],random_state=RANDOM_STATE)

In [31]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 48 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   unit_type_name                 142 non-null    object 
 1   property_design                142 non-null    float64
 2   property_type                  142 non-null    float64
 3   number_of_bookings             142 non-null    int64  
 4   bedroom                        142 non-null    float64
 5   bathroom                       142 non-null    float64
 6   beds                           142 non-null    float64
 7   capacity                       142 non-null    float64
 8   ac                             142 non-null    float64
 9   balcony                        142 non-null    float64
 10  beachfront                     142 non-null    float64
 11  breakfast                      142 non-null    float64
 12  building_staff                 142 non-null    flo

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_RATIO, random_state=42)

In [25]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)  # Replace with your model
])
pipeline.fit(X, y)

In [28]:
y_pred = pipeline.predict(X)
val_mae = mean_absolute_error(y, y_pred)

In [29]:
val_mae

39643.474937153165

In [30]:
joblib.dump(pipeline, 'pipeline.pkl')

['pipeline.pkl']