In [None]:
import pandas as pd
import numpy as np

# Load the raw training data and report its dimensions.
df = pd.read_csv("train.csv")
print(f"Original shape: {df.shape}")

# Drop every column whose missing-value count exceeds half of the row count.
threshold = len(df) * 0.5
missing_per_column = df.isnull().sum()
cols_to_drop = df.columns[missing_per_column > threshold]
print(f"Drop kolom >50% missing: {list(cols_to_drop)}")
df = df.drop(columns=cols_to_drop)

# LotFrontage is imputed in two passes: first with the median of the row's
# Neighborhood group, then with the overall median for anything still missing.
lot_frontage_by_neighborhood = df.groupby('Neighborhood')['LotFrontage']
df['LotFrontage'] = lot_frontage_by_neighborhood.transform(
    lambda group: group.fillna(group.median())
)
overall_lot_frontage_median = df['LotFrontage'].median()
df['LotFrontage'] = df['LotFrontage'].fillna(overall_lot_frontage_median)

# Remaining gaps in object (string) columns become the literal category "Missing".
cat_cols = df.select_dtypes(include='object').columns
filled_categoricals = df[cat_cols].fillna("Missing")
df[cat_cols] = filled_categoricals

# Remaining gaps in int64/float64 columns are replaced by each column's median.
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
numeric_medians = df[num_cols].median()
df[num_cols] = df[num_cols].fillna(numeric_medians)

# Report how many missing cells remain and the resulting shape.
remaining_missing = df.isnull().sum().sum()
print(f"\nFinal missing values: {remaining_missing}")
print(f"Final shape: {df.shape}")

# Persist the cleaned frame (without the index) for the modelling cell to read back.
df.to_csv("train_clean.csv", index=False)
print("train_clean.csv berhasil disimpan!")

Original shape: (1460, 81)
Drop kolom >50% missing: ['Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature']

Final missing values: 0
Final shape: (1460, 76)
train_clean.csv berhasil disimpan!


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb

# Load the cleaned training set (written by the cleaning cell) and the raw test set.
train = pd.read_csv("train_clean.csv")
test = pd.read_csv("test.csv")

# Remove the same sparse columns that were dropped from the training data.
# errors='ignore' already tolerates columns that are absent from the test set.
cols_to_drop = ['Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature']
test = test.drop(columns=cols_to_drop, errors='ignore')

# Fill categorical gaps with the same "Missing" sentinel the training data uses.
# The previous call, fillna({"Missing": "Missing"}), only targeted a column that
# is literally named "Missing", so it left every categorical NaN in place.
test_cat_cols = test.select_dtypes(include='object').columns
test[test_cat_cols] = test[test_cat_cols].fillna("Missing")

# Fill numeric gaps with the TRAINING median of the same column.
# Assign the result back instead of calling fillna(inplace=True) on test[col]:
# the chained in-place form raises a pandas warning and stops modifying `test`
# in pandas 3.0.
for col in test.select_dtypes(include=['float64', 'int64']).columns:
  if test[col].isnull().any():
    test[col] = test[col].fillna(train[col].median())

# Reference year used to convert build / remodel years into ages.
REFERENCE_YEAR = 2025

# Add the same engineered columns to both frames so train and test keep an
# identical feature schema. The loop variable is deliberately not named `df`,
# so it does not overwrite the `df` created by the data-cleaning cell.
for frame in (train, test):
  # Combined basement + first floor + second floor area.
  frame['TotalSF'] = frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF']
  # Bathroom count, with half baths weighted 0.5.
  frame['TotalBath'] = (
      frame['FullBath'] + 0.5 * frame['HalfBath']
      + frame['BsmtFullBath'] + 0.5 * frame['BsmtHalfBath']
  )
  frame['Age'] = REFERENCE_YEAR - frame['YearBuilt']
  frame['RemodAge'] = REFERENCE_YEAR - frame['YearRemodAdd']
  frame['TotalPorchSF'] = (
      frame['OpenPorchSF'] + frame['EnclosedPorch']
      + frame['3SsnPorch'] + frame['ScreenPorch']
  )
  # 0/1 presence flags derived from the corresponding area columns.
  frame['Has2ndFloor'] = (frame['2ndFlrSF'] > 0).astype(int)
  frame['HasGarage'] = (frame['GarageArea'] > 0).astype(int)
  frame['HasPool'] = (frame['PoolArea'] > 0).astype(int)
  # Flag houses sold in the year they were built. The previous condition,
  # YearBuilt > 0, is true for any real construction year and therefore
  # produced a constant column carrying no information.
  # NOTE(review): relies on a 'YrSold' column being present in both CSVs - confirm.
  frame['IsNew'] = (frame['YrSold'] == frame['YearBuilt']).astype(int)

# Target is log1p(SalePrice); features are every other column except the Id.
y = np.log1p(train['SalePrice'])
X = train.drop(columns=['Id', 'SalePrice'])
X_test = test.drop(columns='Id')

# Column groups are derived from the training features' dtypes.
cat_cols = X.select_dtypes(include='object').columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns

# One-hot encode the object columns (categories unseen during fit are ignored
# at transform time) and standardize the int64/float64 columns.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
numeric_scaler = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_encoder, cat_cols),
        ('num', numeric_scaler, num_cols),
    ]
)

# Fit on the training features only, then apply the fitted transform to the test features.
X_proc = preprocessor.fit_transform(X)
X_test_proc = preprocessor.transform(X_test)

# Hyper-parameters for the gradient-boosted regressor, collected in one place.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.02,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
model = xgb.XGBRegressor(**xgb_params)

# Train on the full preprocessed training set against the log-transformed target.
model.fit(X_proc, y, verbose=False)
print("XGBoost Selesai Dilatih!")

# Predictions are on the log1p scale; expm1 maps them back to prices.
pred_log = model.predict(X_test_proc)
pred = np.expm1(pred_log)

# Build the Id / SalePrice submission table and write it without the index.
submission = pd.DataFrame(
    {
        'Id': test['Id'],
        'SalePrice': pred,
    }
)
submission.to_csv("submission_xgboost.csv", index=False)
print("submission_xgboost.csv berhasil dibuat!")
print(submission.head())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(train[col].median(), inplace=True)


XGBoost Selesai Dilatih!
submission_xgboost.csv berhasil dibuat!
     Id      SalePrice
0  1461  126359.703125
1  1462  164204.031250
2  1463  184116.812500
3  1464  190951.406250
4  1465  188661.437500
