In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [4]:
from xgboost import XGBRegressor
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn.datasets import make_regression
import warnings
warnings.filterwarnings("ignore")

In [6]:
calendar_df = pd.read_csv("C:/Users/ADMIN/Rakamin/Week 16/calendar.csv")
listings_df = pd.read_csv("C:/Users/ADMIN/Rakamin/Week 16/listings.csv")
reviews_df = pd.read_csv("C:/Users/ADMIN/Rakamin/Week 16/reviews.csv")

In [8]:
# Remove currency symbols and convert 'price' to numeric in listings_df
listings_df['price'] = listings_df['price'].replace('[\$,]', '', regex=True).astype(float)

# Remove currency symbols and convert 'price' to numeric in calendar_df
calendar_df['price'] = calendar_df['price'].replace('[\$,]', '', regex=True).astype(float)

# Fill missing values with median in calendar_df
calendar_df['price'].fillna(calendar_df['price'].median(), inplace=True)

# Fill missing values in listings_df
listings_df['reviews_per_month'].fillna(listings_df['reviews_per_month'].median(), inplace=True)

# Fill missing values in reviews_df
reviews_df['comments'].fillna('No Comments', inplace=True)

# Check missing values after handling
calendar_missing_values_after = calendar_df.isnull().sum()
listings_missing_values_after = listings_df.isnull().sum()
reviews_missing_values_after = reviews_df.isnull().sum()


# Convert 'date' columns to datetime
calendar_df['date'] = pd.to_datetime(calendar_df['date'], errors='coerce')
reviews_df['date'] = pd.to_datetime(reviews_df['date'], errors='coerce')

# Standardize categorical values in 'room_type'
listings_df['room_type'] = listings_df['room_type'].str.strip().str.lower()

# Remove extra spaces in 'reviewer_name'
reviews_df['reviewer_name'] = reviews_df['reviewer_name'].str.strip().str.title()


# Drop duplicates based on the correct column name
listings_df.drop_duplicates(subset=['id'], inplace=True)
reviews_df.drop_duplicates(subset=['reviewer_id'], inplace=True)



In [10]:
# Mengimpor library tambahan
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder

# Memilih fitur yang relevan untuk model
# Menggunakan fitur seperti 'bedrooms', 'beds', 'host_listings_count', dan 'zipcode'
X = listings_df[['bedrooms', 'beds', 'host_listings_count', 'zipcode']]
y = listings_df['price']

# Menangani nilai kategori dalam 'zipcode' menggunakan LabelEncoder
label_encoder = LabelEncoder()
X['zipcode'] = label_encoder.fit_transform(X['zipcode'])

# Mengisi missing values pada fitur yang masih memiliki nilai kosong
X.fillna(X.median(), inplace=True)
y.fillna(y.median(), inplace=True)

# Membagi dataset menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for GridSearchCV for XGBoost
param_grid_xgb = {
    'n_estimators': [50, 100, 150, 200],  # Number of boosting rounds
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Learning rate
    'max_depth': [3, 4, 5, 6],  # Depth of the trees
    'subsample': [0.7, 0.8, 0.9, 1.0],  # Fraction of samples used per boosting round
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]  # Fraction of features used per tree
}
# Generate dataset
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=1000, n_features=4, noise=0.1, random_state=42)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create XGBoost Regressor model with default parameters
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Predict with the trained model
y_xgb_pred = xgb_model.predict(X_test)

# Create GridSearchCV with XGBoost Regressor
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb,
                               cv=3, n_jobs=-1, verbose=2)

# Train the model with GridSearchCV
grid_search_xgb.fit(X_train, y_train)

# Display the best hyperparameters from GridSearchCV
best_params_grid_xgb = grid_search_xgb.best_params_
print("Best hyperparameters using GridSearchCV (XGBoost): ", best_params_grid_xgb)

# Predict with the best model
y_xgb_pred_grid = grid_search_xgb.predict(X_test)

# Calculate MAE and RMSE for the best model
xgb_mae_grid = mean_absolute_error(y_test, y_xgb_pred_grid)
xgb_rmse_grid = np.sqrt(mean_squared_error(y_test, y_xgb_pred_grid))

print(f"Mean Absolute Error (MAE) for XGBoost after GridSearchCV: {xgb_mae_grid}")
print(f"Root Mean Squared Error (RMSE) for XGBoost after GridSearchCV: {xgb_rmse_grid}")

# Compute MAPE for the best model
xgb_mape_grid = mean_absolute_percentage_error(y_test, y_xgb_pred_grid)
print(f"Mean Absolute Percentage Error (MAPE) for XGBoost after GridSearchCV: {xgb_mape_grid}")

# K-Fold Cross Validation
kf = KFold(n_splits=5,  shuffle=True, random_state=42)

cv_mae_grid_xgb = cross_val_score(grid_search_xgb.best_estimator_, X_train, y_train, cv=kf, scoring='neg_mean_absolute_error')
cv_rmse_grid_xgb = cross_val_score(grid_search_xgb.best_estimator_, X_train, y_train, cv=kf, scoring='neg_root_mean_squared_error')

# Evaluasi MAPE dengan k-fold cross-validation
cv_mape_grid_xgb = cross_val_score(grid_search_xgb.best_estimator_, X_train, y_train, cv=kf,
                                   scoring=lambda estimator, X, y: -mean_absolute_percentage_error(y, estimator.predict(X)))

# Display cross-validation results
print(f"K-Fold Cross-Validation MAE for XGBoost (GridSearchCV): {-cv_mae_grid_xgb.mean()}")
print(f"K-Fold Cross-Validation RMSE for XGBoost (GridSearchCV): {-cv_rmse_grid_xgb.mean()}")
print(f"K-Fold Cross-Validation MAPE for XGBoost (GridSearchCV): {-cv_mape_grid_xgb.mean()}")


Fitting 3 folds for each of 1024 candidates, totalling 3072 fits
Best hyperparameters using GridSearchCV (XGBoost):  {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.9}
Mean Absolute Error (MAE) for XGBoost after GridSearchCV: 8.121748533179867
Root Mean Squared Error (RMSE) for XGBoost after GridSearchCV: 10.698328929531794
Mean Absolute Percentage Error (MAPE) for XGBoost after GridSearchCV: 0.3681034410705682
K-Fold Cross-Validation MAE for XGBoost (GridSearchCV): 9.339806090043496
K-Fold Cross-Validation RMSE for XGBoost (GridSearchCV): 13.041327826239405
K-Fold Cross-Validation MAPE for XGBoost (GridSearchCV): 0.363613858791384


In [14]:
joblib.dump(grid_search_xgb, 'XGboost_GridSearchCV.jodlib')

['XGboost_GridSearchCV.jodlib']

In [15]:
grid_search_xgb.predict(X_test[:1])

array([31.089327], dtype=float32)

In [18]:
pip install streamlit

Collecting streamlitNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for streamlit from https://files.pythonhosted.org/packages/bf/14/a95ac354fe9ca59ee9d030dc738dac8ac04bd32f9d7c3f2c790eb6431ab1/streamlit-1.43.1-py2.py3-none-any.whl.metadata
  Downloading streamlit-1.43.1-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Obtaining dependency information for altair<6,>=4.0 from https://files.pythonhosted.org/packages/aa/f3/0b6ced594e51cc95d8c1fc1640d3623770d01e4969d29c0bd09945fafefa/altair-5.5.0-py3-none-any.whl.metadata
  Downloading altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.0.0 (from streamlit)
  Obtaining dependency information for blinker<2,>=1.0.0 from https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl.metadata
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecti

In [16]:
!jupyter nbconvert --to script XGBoost_GridSearchCV.ipynb


[NbConvertApp] Converting notebook XGBoost_GridSearchCV.ipynb to script
[NbConvertApp] Writing 6662 bytes to XGBoost_GridSearchCV.py
