In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBRegressor
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

In [2]:
def drop_columns(df, columns_to_drop):
    """
    Return a new DataFrame that lacks the requested columns.

    The input DataFrame is not modified. pandas raises a KeyError when a
    requested column does not exist in ``df``.

    Parameters:
    df (pd.DataFrame): Source DataFrame.
    columns_to_drop (list): Names of the columns to remove.

    Returns:
    pd.DataFrame: Every column of ``df`` except those in ``columns_to_drop``.
    """
    # axis=1 targets column labels (equivalent to passing ``columns=``).
    return df.drop(columns_to_drop, axis=1)

In [6]:
# Absolute path to the input CSV; adjust when running outside the original
# environment (NOTE(review): '/content' looks like a Colab mount - confirm).
ROOM_PRICE_CSV = '/content/FE_RoomPrice.csv'
data = pd.read_csv(ROOM_PRICE_CSV)

In [7]:
# Columns removed from the frame before modelling.
columns_to_drop = [
    'Unnamed: 0.1',
    'Unnamed: 0',
    'room_id',
    'unit_id',
    'booking_id',
    'booking_check_in',
    'created_at',
    'booking_check_out',
    'earnings_in_idr',
    'total_earnings',
]
data = drop_columns(data, columns_to_drop)

# Store the holiday flag as float.
holiday_flag = data['contain_national_holiday']
data['contain_national_holiday'] = holiday_flag.astype(float)

## Preprocessing Setup

In [8]:
# The leading column of `data` is used as the regression target; every
# remaining column becomes a feature.
N_TARGET_COLS = 1
y = data.iloc[:, :N_TARGET_COLS]   # one-column DataFrame (2-D), not a Series
X = data.iloc[:, N_TARGET_COLS:]

In [9]:
# Summarize the feature frame: column positions, names, non-null counts, dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29766 entries, 0 to 29765
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   average_baseline_price    29766 non-null  float64
 1   rating                    29766 non-null  float64
 2   review_sentiment_score    29766 non-null  float64
 3   communication             29766 non-null  float64
 4   cleanliness               29766 non-null  float64
 5   accuracy                  29766 non-null  float64
 6   month                     29766 non-null  object 
 7   contain_national_holiday  29766 non-null  float64
 8   stay_duration             29766 non-null  int64  
 9   booking_day_of_week       29766 non-null  int64  
 10  booking_lead_time         29766 non-null  int64  
 11  price_fluctuation         29766 non-null  float64
 12  total_review_score        29766 non-null  float64
dtypes: float64(9), int64(3), object(1)
memory usage: 3.0+ MB


In [19]:
# Feature groups; each group is routed to a different preprocessing step.
numerical_features = ['rating', 'review_sentiment_score', 'communication', 'cleanliness', 'accuracy', 'stay_duration', 'booking_day_of_week', 'booking_lead_time', 'price_fluctuation', 'total_review_score']
categorical_features = ['month']
unprocessed_features = ['contain_national_holiday', 'average_baseline_price']

# A column of X that is not listed in any group would be silently discarded by
# a ColumnTransformer left at its default remainder='drop', so fail loudly.
feature_groups = numerical_features + categorical_features + unprocessed_features
unassigned = [col for col in X.columns if col not in feature_groups]
assert not unassigned, f"columns missing from the feature groups: {unassigned}"

# Positional indices of each group within X. Looking them up on X itself
# (rather than on `data` with a manual -1 offset for the target column) keeps
# the indices correct regardless of where the target sits in `data`.
numerical_indices = [X.columns.get_loc(col) for col in numerical_features]
categorical_indices = [X.columns.get_loc(col) for col in categorical_features]
unprocessed_indices = [X.columns.get_loc(col) for col in unprocessed_features]

In [20]:
# Display the positional index computed for the categorical column(s).
categorical_indices

[6]

In [21]:
# Column-wise preprocessing; columns are selected by position:
#   'num'         - standardize the numerical columns
#   'cat'         - one-hot encode the categorical column(s)
#   'passthrough' - forward the listed columns unchanged
# Columns not listed in any transformer are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_indices),
        # handle_unknown='ignore': a category that was not present when the
        # encoder was fitted is encoded as all zeros instead of raising, so
        # prediction does not crash on a value missing from the training split.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_indices),
        ('passthrough', 'passthrough', unprocessed_indices),
    ]
)

0    may
Name: month, dtype: object

In [12]:
# Random-forest regressor: 25 trees, depth capped at 64, fixed seed so that
# repeated runs build the same forest.
model = RandomForestRegressor(
    n_estimators=25,
    max_depth=64,
    random_state=123,
)

In [None]:
# Re-display the feature frame summary; column order matters here because the
# preprocessor selects columns by position.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29766 entries, 0 to 29765
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   average_baseline_price    29766 non-null  float64
 1   rating                    29766 non-null  float64
 2   review_sentiment_score    29766 non-null  float64
 3   communication             29766 non-null  float64
 4   cleanliness               29766 non-null  float64
 5   accuracy                  29766 non-null  float64
 6   month                     29766 non-null  object 
 7   contain_national_holiday  29766 non-null  float64
 8   stay_duration             29766 non-null  int64  
 9   booking_day_of_week       29766 non-null  int64  
 10  booking_lead_time         29766 non-null  int64  
 11  price_fluctuation         29766 non-null  float64
 12  total_review_score        29766 non-null  float64
dtypes: float64(9), int64(3), object(1)
memory usage: 3.0+ MB


In [22]:
# Chain preprocessing and the regressor so they are fitted as one estimator.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Features are passed as a plain array (the preprocessor selects columns by
# position). The target is flattened to 1-D: handing the regressor a
# single-column 2-D target makes scikit-learn emit a DataConversionWarning.
pipeline.fit(np.array(X_train), np.ravel(y_train))

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [23]:
# Predict from a plain array, matching the input type the pipeline was fitted
# on; passing the DataFrame instead makes scikit-learn warn about feature
# names that were absent at fit time.
y_pred = pipeline.predict(np.array(X_test))

# Mean absolute error on the held-out split.
val_mae = mean_absolute_error(y_test, y_pred)



In [24]:
# Show the mean absolute error measured on the held-out split.
val_mae

4810.054533017035

In [25]:
# Persist the fitted pipeline (preprocessing + model) as a single artifact.
# joblib files are pickle-based: only load artifacts from a trusted source.
MODEL_PATH = 'RoomPrice.pkl'
joblib.dump(pipeline, MODEL_PATH)

['RoomPrice.pkl']