In [1]:
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

--2022-10-14 05:43:21--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘housing.csv’


2022-10-14 05:43:21 (156 MB/s) - ‘housing.csv’ saved [1423529/1423529]



In [4]:
import xgboost as xgb

In [5]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction import DictVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text

In [6]:
# Load the housing dataset from housing.csv in the working directory
# (the file fetched by the wget cell above).
df=pd.read_csv("housing.csv")

In [7]:
def prepare_datax(df, seedval, fillnavalue):
    """Split the housing data into train/validation/test sets and vectorise it.

    Parameters
    ----------
    df : DataFrame containing the housing columns selected below.
    seedval : seed handed to ``np.random.seed`` before the rows are shuffled
        (this reseeds NumPy's global random state).
    fillnavalue : value substituted for missing ``total_bedrooms`` entries.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_val, y_train, y_test, y_val, dv)``. Note that
        the test arrays come before the validation arrays. Each ``y_*`` is
        ``log1p`` of ``median_house_value``; ``dv`` is the ``DictVectorizer``
        fitted on the training rows only.
    """
    target = 'median_house_value'
    categorical = ["ocean_proximity"]
    numerical = ['latitude', 'longitude', 'housing_median_age', "total_rooms",
                 'total_bedrooms', 'population', 'households', 'median_income']

    # Private copy restricted to the columns used below, so the caller's
    # frame is never modified.
    data = df[['latitude', 'longitude', 'housing_median_age', 'total_rooms',
               'total_bedrooms', 'population', 'households', 'median_income',
               'median_house_value', 'ocean_proximity']].copy()
    data["total_bedrooms"] = data["total_bedrooms"].fillna(fillnavalue)

    # 20% validation, 20% test, and whatever remains (about 60%) for training.
    n_rows = len(data)
    val_size = int(n_rows * 0.2)
    test_size = int(n_rows * 0.2)
    train_size = n_rows - val_size - test_size

    # Seeded shuffle of the row positions.
    order = np.arange(n_rows)
    np.random.seed(seedval)
    np.random.shuffle(order)

    # Slice the shuffled positions into the three partitions, each with a
    # fresh 0..k-1 index.
    val_end = train_size + val_size
    parts = {
        'train': data.iloc[order[:train_size]].reset_index(drop=True),
        'val': data.iloc[order[train_size:val_end]].reset_index(drop=True),
        'test': data.iloc[order[val_end:]].reset_index(drop=True),
    }

    # Log-transformed targets.
    targets = {name: np.log1p(part[target].values) for name, part in parts.items()}

    # Only the feature columns are turned into records, so the target never
    # reaches the vectoriser.
    records = {
        name: part[categorical + numerical].to_dict(orient='records')
        for name, part in parts.items()
    }

    # The vectoriser learns its feature layout from the training rows alone.
    dv = DictVectorizer(sparse=False)
    dv.fit(records['train'])
    X_train = dv.transform(records['train'])
    X_test = dv.transform(records['test'])
    X_val = dv.transform(records['val'])

    return (X_train, X_test, X_val,
            targets['train'], targets['test'], targets['val'],
            dv)

In [8]:
# Build the feature matrices and targets with shuffle seed 1 and missing
# total_bedrooms filled with 0. prepare_datax returns the test arrays before
# the validation arrays, hence the unpacking order.
X_train,X_test,X_val,y_train,y_test,y_val,dv= prepare_datax(df,1,0)

In [10]:
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. The fitted `feature_names_` attribute holds the same list of
# names and is available across versions.
features = list(dv.feature_names_)


In [12]:
features

['households',
 'housing_median_age',
 'latitude',
 'longitude',
 'median_income',
 'ocean_proximity=<1H OCEAN',
 'ocean_proximity=INLAND',
 'ocean_proximity=ISLAND',
 'ocean_proximity=NEAR BAY',
 'ocean_proximity=NEAR OCEAN',
 'population',
 'total_bedrooms',
 'total_rooms']

In [14]:
import re

# Pattern matching any one of '[', ']' or '<'.
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

# Copy the feature names, replacing each of those characters with '_'
# (presumably because XGBoost does not accept them in feature names - confirm).
# Names that contain none of them are carried over unchanged.
features1 = [
    col if frozenset(('[', ']', '<')).isdisjoint(str(col)) else regex.sub("_", col)
    for col in features
]

In [15]:
features1

['households',
 'housing_median_age',
 'latitude',
 'longitude',
 'median_income',
 'ocean_proximity=_1H OCEAN',
 'ocean_proximity=INLAND',
 'ocean_proximity=ISLAND',
 'ocean_proximity=NEAR BAY',
 'ocean_proximity=NEAR OCEAN',
 'population',
 'total_bedrooms',
 'total_rooms']

In [16]:
# Booster settings for the first run (eta = 0.3).
xgb_params = dict(
    # tree-growth settings
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    # learning task and threading
    objective='reg:squarederror',
    nthread=8,
    # reproducibility and logging
    seed=1,
    verbosity=1,
)

In [17]:
# Wrap the training and validation splits in DMatrix objects, attaching the
# targets as labels and the sanitised names in `features1` as feature names.
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features1)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features1)

In [19]:
# Fit a booster on the training DMatrix with the current xgb_params for
# 100 boosting rounds.
model = xgb.train(xgb_params, dtrain, num_boost_round=100)

In [20]:
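# Disabled variant of the training call that would also report evaluation
# metrics (verbose_eval=5); `watchlist` is not defined in any cell above.
#model = xgb.train(xgb_params, dtrain, num_boost_round=100,verbose_eval=5, evals=watchlist)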

In [21]:
# Predict for the validation split. The model was fitted on log1p-transformed
# prices, so the predictions are on that same scale.
y_pred = model.predict(dval)

In [23]:
# Root-mean-squared error between the validation targets and the predictions
# (both on the log1p scale).
rmse_val = np.sqrt(mean_squared_error(y_val,y_pred))

In [24]:
rmse_val

0.22821795570185202

In [25]:
def train_XGboost(xgb_params, num_boost_round=100):
    """Train an XGBoost model and return its RMSE on the validation split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val``, ``y_val``
    and ``features1``; those must exist before this function is called.

    Parameters
    ----------
    xgb_params : dict of booster parameters passed straight to ``xgb.train``.
    num_boost_round : number of boosting rounds. Defaults to 100, the value
        that was previously hard-coded, so existing calls behave as before.
        Exposed so that runs with a small ``eta`` can be given more rounds.

    Returns
    -------
    float
        Root-mean-squared error between ``y_val`` and the model's predictions
        (both on the log1p scale produced by ``prepare_datax``).
    """
    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features1)
    dval = xgb.DMatrix(X_val, label=y_val, feature_names=features1)
    model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_round)
    y_pred = model.predict(dval)
    return np.sqrt(mean_squared_error(y_val, y_pred))


In [26]:
# Booster settings for the second run (eta = 0.1).
xgb_params = dict(
    # tree-growth settings
    eta=0.1,
    max_depth=6,
    min_child_weight=1,
    # learning task and threading
    objective='reg:squarederror',
    nthread=8,
    # reproducibility and logging
    seed=1,
    verbosity=1,
)

In [27]:
# Validation RMSE for the eta = 0.1 settings defined in the previous cell.
print( "RMSE", train_XGboost(xgb_params))

RMSE 0.2276690432902143


In [28]:
# Booster settings for the third run (eta = 0.01).
xgb_params = dict(
    # tree-growth settings
    eta=0.01,
    max_depth=6,
    min_child_weight=1,
    # learning task and threading
    objective='reg:squarederror',
    nthread=8,
    # reproducibility and logging
    seed=1,
    verbosity=1,
)

In [29]:
# Validation RMSE for the eta = 0.01 settings defined in the previous cell.
# NOTE(review): the recorded value (~4.26) is far above the ~0.23 obtained
# with eta = 0.3 and eta = 0.1; presumably 100 boosting rounds are too few to
# converge at this learning rate - confirm by training with more rounds.
print( "RMSE", train_XGboost(xgb_params))

RMSE 4.263073942463008
