In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.base import clone
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, Normalizer
from xgboost import XGBRegressor

In [2]:
# Raw listings table, fetched straight from the project repository.
DATA_URL = "https://raw.githubusercontent.com/SamuelD005/challenge-regression/development/Data8.csv"
data = pd.read_csv(DATA_URL, sep=",")

# Peek at the first rows to confirm the columns that were loaded.
data.head()

Unnamed: 0.1,Unnamed: 0,Locality,Type of property,Price,Number of rooms,Area,Fully equipped kitchen,Furnished,Open fire,Terrace Area,Garden Area,Surface of the land,Number of facades,Swimming pool,State of the building,Province,Region,PriceperMeter
0,0,2970,apartment,764999.0,2.0,153.0,0.0,0.0,0.0,62.0,0.0,215.0,2.0,0.0,medium,Anvers,Flanders,4999.0
1,1,3200,apartment,294999.0,2.0,80.0,0.0,0.0,0.0,0.0,0.0,80.0,2.0,0.0,medium,Brabant Flamand,Flanders,3687.0
2,2,8211,apartment,233999.0,2.0,90.0,0.0,0.0,0.0,0.0,0.0,90.0,2.0,0.0,medium,Flandre Occidental,Flanders,2599.0
3,3,2630,apartment,329899.0,1.0,87.0,0.0,0.0,0.0,28.0,0.0,115.0,2.0,0.0,medium,Anvers,Flanders,3791.0
4,4,2630,apartment,359899.0,1.0,95.0,0.0,0.0,0.0,47.0,0.0,142.0,4.0,0.0,medium,Anvers,Flanders,3788.0


In [3]:
# Columns that must not reach the model:
#   - "Price" is the target itself,
#   - "PriceperMeter" is derived from the target (keeping it would leak the answer),
#   - "Unnamed: 0" / "Unnamed: 0.1" are index columns left over from earlier CSV
#     round-trips and carry no information about a property.
non_feature_columns = ["Price", "PriceperMeter", "Unnamed: 0", "Unnamed: 0.1"]

X = data.drop(columns=non_feature_columns)
y = data["Price"]

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=41,
)

In [5]:
# Columns routed through the numeric branch of the preprocessor.
numerical_features = [
    'Locality',
    'Number of rooms',
    'Area',
    'Terrace Area',
    'Garden Area',
    'Surface of the land',
]

# Columns routed through the categorical branch of the preprocessor.
categorial_features = [
    'Type of property',
    'Fully equipped kitchen',
    'Furnished',
    'Open fire',
    'Swimming pool',
    'State of the building',
    'Province',
    'Region',
    'Number of facades',
]

In [6]:
# Numeric branch: fill missing values (SimpleImputer's default strategy), then standardise.
numerical_pipeline = make_pipeline(SimpleImputer(), StandardScaler())

# Categorical branch: fill missing values with the most frequent level, then one-hot encode.
# handle_unknown="ignore" encodes a level that was not present during fit as an
# all-zero row instead of raising, so scoring the test split (or new listings fed
# to the saved model) cannot crash on a rare category.
categorial_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

In [7]:
# Pair each sub-pipeline with the columns it is responsible for, then combine
# both branches into a single column transformer.
numeric_branch = (numerical_pipeline, numerical_features)
categorical_branch = (categorial_pipeline, categorial_features)
preprocessor = make_column_transformer(numeric_branch, categorical_branch)

In [8]:
# Baseline: preprocessing followed by an XGBoost regressor with default settings.
xgb_regressor = XGBRegressor()
model = make_pipeline(preprocessor, xgb_regressor)

# fit() returns the pipeline itself, so XGBoost_model and model are the same object here.
model.fit(X_train, y_train)
XGBoost_model = model

# Score on the held-out split (displayed as the cell output).
XGBoost_model.score(X_test, y_test)

0.7545927011573226

In [9]:
# Bagging ensemble of 10 XGBoost regressors.
# The wrapped estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases; the
# positional form works on both.
# random_state pins the bootstrap resampling so the reported score is repeatable
# (same seed as the train/test split).
BG = BaggingRegressor(XGBRegressor(), n_estimators=10, random_state=41)

In [10]:
# make_pipeline stores the step objects it is given without copying them, so
# reusing `preprocessor` directly would make this pipeline share (and refit) the
# very same transformer instance that already lives inside XGBoost_model.
# clone() gives the bagging pipeline its own unfitted copy with identical settings.
model = make_pipeline(clone(preprocessor), BG)
BG_model = model.fit(X_train, y_train)

# Score on the held-out split (displayed as the cell output).
BG_model.score(X_test, y_test)

0.7747862035827116

In [11]:
# Predicted prices for the held-out listings; the bare last expression shows
# the (truncated) array as the cell output.
bg_test_predictions = BG_model.predict(X_test)
bg_test_predictions

array([228695.78, 288245.75, 195271.83, ..., 175918.55, 652174.25,
       332661.3 ], dtype=float32)

In [12]:
import pickle

# Destination files for the two fitted pipelines.
xg_file = "xgb_reg.pkl"
bg_file = "bg_reg.pkl"

# save — the context managers guarantee each file is flushed and closed,
# even if pickling fails part-way through.
with open(xg_file, "wb") as xg_handle:
    pickle.dump(XGBoost_model, xg_handle)
with open(bg_file, "wb") as bg_handle:
    pickle.dump(BG_model, bg_handle)

In [13]:
# load
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# files you produced yourself (xg_file is written by the save cell of this notebook).
with open(xg_file, "rb") as xg_handle:
    xgb_model_loaded = pickle.load(xg_handle)
print(xgb_model_loaded)

# test — the reloaded pipeline must reproduce the in-memory pipeline's
# predictions exactly on the held-out split.
np.testing.assert_array_equal(
    xgb_model_loaded.predict(X_test),
    XGBoost_model.predict(X_test),
)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Locality',
                                                   'Number of rooms', 'Area',
                                                   'Terrace Area',
                                                   'Garden Area',
                                                   'Surface of the land']),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                          