In [8]:
import numpy as np
import pandas as pd

from utils import get_train_data
from FeatureEngineering import _encode
# Categorical variables are encoded explicitly before training, rather than
# relying on XGBoost's built-in support for categorical features.

# from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

### __Setup__

In [9]:
# Load the training features and the target.
X, y = get_train_data()

# Encode the features. `_encode` also hands back a transformed target, bound
# here as `y_log` (presumably log-scaled; confirm in FeatureEngineering).
X_encoded, y_log = _encode(X, y)

# Build the regressor. Note this is a bare estimator, not a pipeline; the
# fixed random_state keeps repeated fits comparable.
model = XGBRegressor(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
)

### __Model prediction__

In [10]:
# Hold out 40% of the encoded data for validation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y_log, test_size=0.4, random_state=42
)

# Fit on the training portion and predict the held-out portion in one chain
# (`fit` returns the fitted estimator itself).
y_pred = model.fit(X_train, y_train).predict(X_test)

# Score on the transformed target scale: y_test is a slice of `y_log`, so this
# RMSE is not expressed in the original target units.
error = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'The RMSE is {error}.')

The RMSE is 0.13453803415959517.


### __Test Data Prediction__

In [18]:
# Read the held-out test set from disk.
test_data = pd.read_csv('data/test.csv')

# Encode the test features with the same helper used for training, then align
# the result to the training columns: columns missing from the test encoding
# are added and filled with 0, columns unseen during training are dropped.
test_data_encoded = (
    _encode(test_data)
    .reindex(columns=X_encoded.columns, fill_value=0)
)

# Predict on the transformed scale, then exponentiate to map back.
# NOTE(review): np.exp assumes the target was transformed with a plain natural
# log inside `_encode` (not log1p) - confirm against FeatureEngineering.
test_prediction = model.predict(test_data_encoded)
predictions = np.exp(test_prediction)

### __Output Extraction__

In [20]:
# Assemble the two-column submission table (test Ids alongside the predicted
# prices) and write it to CSV without the DataFrame index.
results = pd.DataFrame({"Id": test_data['Id'], "SalePrice": predictions})
results.to_csv("submission_XGB_v1.csv", index=False)