In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

In [4]:
data = pd.read_csv('/content/melb_data.csv')

In [5]:
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [6]:
data['Bedroom2'] = data['Bedroom2'].astype(int)
data['Bathroom'] = data['Bathroom'].astype(int)

In [7]:
data.dropna(subset=['Car'], inplace=True)

In [8]:
nan_count_car = data['Car'].isna().sum()

In [9]:
print(f"Jumlah nilai NaN dalam kolom Car: {nan_count_car}")

Jumlah nilai NaN dalam kolom Car: 0


In [10]:
# Select subset of predictors
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']
X = data[cols_to_use]

In [11]:
# Select target
y = data.Price

# Separate data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y)

In [12]:
my_model = XGBRegressor()
my_model.fit(X_train, y_train)

In [13]:
my_model_0 = XGBRegressor(n_estimators=500)
my_model_0.fit(X_train, y_train)

In [14]:
my_model_1 = XGBRegressor(n_estimators=110)
my_model_1.fit(X_train, y_train)

In [15]:
my_model_1 = XGBRegressor(n_estimators=110)
my_model_1.fit(X_train, y_train)

predictions_1 = my_model_1.predict(X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions_1, y_valid)))

Mean Absolute Error: 240818.62889238165


In [16]:
from sklearn.model_selection import GridSearchCV

# Tentukan model XGBoost
model_grid = XGBRegressor()

# Tentukan parameter grid
param_grid = {
    'n_estimators': [150, 200, 250, 300, 1000]  # Ganti dengan rentang nilai yang ingin Anda coba
}

# Inisialisasi GridSearchCV
grid_search = GridSearchCV(estimator=model_grid, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error')

# Lakukan fitting menggunakan data training
grid_search.fit(X_train, y_train)

# Dapatkan parameter terbaik
best_n_estimators = grid_search.best_params_['n_estimators']

print("Nilai n_estimators terbaik:", best_n_estimators)

Nilai n_estimators terbaik: 150


In [17]:
# Tentukan range nilai n_estimators yang akan diuji
n_estimators_list = [100, 150, 200, 250, 300]

# Inisialisasi sebuah list untuk menyimpan hasil MAE
mae_results = []

# Loop melalui setiap nilai n_estimators
for n_estimators in n_estimators_list:
    # Inisialisasi model dengan n_estimators tertentu
    model_grid = XGBRegressor(n_estimators=n_estimators)

    # Fitting model menggunakan data training
    model_grid.fit(X_train, y_train)

    # Lakukan prediksi pada data validasi
    predictions = model_grid.predict(X_valid)

    # Hitung Mean Absolute Error
    mae = mean_absolute_error(predictions, y_valid)

    # Tampilkan nilai n_estimators dan MAE
    print(f"n_estimators: {n_estimators}, Mean Absolute Error: {mae}")

    # Tambahkan nilai MAE ke dalam list
    mae_results.append(mae)

n_estimators: 100, Mean Absolute Error: 240549.16647096892
n_estimators: 150, Mean Absolute Error: 240530.72294055103
n_estimators: 200, Mean Absolute Error: 241387.91935211723
n_estimators: 250, Mean Absolute Error: 242062.26505177515
n_estimators: 300, Mean Absolute Error: 243936.87501386835


In [18]:
my_model_2 = XGBRegressor(n_estimators=100)
my_model_2.fit(X_train, y_train,
               early_stopping_rounds=5,
               eval_set=[(X_valid, y_valid)],
               verbose=False)

predictions_2 = my_model_2.predict(X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions_2, y_valid)))

Mean Absolute Error: 242322.8468241494


