In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from math import sqrt


# Load the datasets

In [2]:
# Locations of the two input CSVs, relative to the notebook's working directory.
TRAIN_CSV_PATH = '../data/train.csv'
TEST_CSV_PATH = '../data/test.csv'

# train_df carries the 'medv' target column; test_df has the same features without it.
train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)


In [3]:
# Preview the first rows of the training data (features plus the 'medv' target).
train_df.head()


Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
3,5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
4,7,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9


In [4]:
# Preview the first rows of the test data; it has no 'medv' column to predict from.
test_df.head()


Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
0,3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
1,6,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21
2,8,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15
3,9,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93
4,10,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1


# Split the training data into training and validation sets

In [5]:
# Separate the target ('medv') from the predictors, then hold out 20% of the
# rows for validation; the fixed random_state keeps the split reproducible.
# NOTE(review): only 'medv' is removed, so the 'ID' column shown in
# train_df.head() stays in X as a model feature — confirm this is intended.
y = train_df['medv']
X = train_df.drop(columns='medv')
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Row count of the training side of the split.
len(X_train)


266

In [7]:
# Row count of the held-out validation side of the split (test_size=0.2).
len(X_validation)


67

# Initialize and train the model

In [8]:
# Hyperparameters for the boosted-tree regressor, gathered in one place so they
# are easy to read and tune.
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 100,
    'learning_rate': 0.05,
    'max_depth': 3,
    'early_stopping_rounds': 10,
}
model = XGBRegressor(**xgb_params)

# The validation split is passed as eval_set, which is what early stopping
# monitors during fitting.
# NOTE(review): the same split is scored for the reported validation RMSE, so
# that figure may be optimistic compared with truly unseen data.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_validation, y_validation)],
    verbose=False,
)


# Predict on the validation set and compute the RMSE

In [9]:
# Score the held-out split: RMSE is the square root of the mean squared error
# between the true 'medv' values and the model's predictions.
y_pred = model.predict(X_validation)
validation_mse = mean_squared_error(y_validation, y_pred)
rmse = sqrt(validation_mse)
print(f'Validation RMSE: {rmse}')


Validation RMSE: 2.7156560273310455


# Predict on the test set

In [10]:
# Select exactly the columns the model was fitted on, in the same order, so the
# prediction does not depend on the column layout of the test CSV. A column
# missing from test_df raises a KeyError here, naming the missing feature.
test_features = test_df[X_train.columns]
test_predictions = model.predict(test_features)
test_predictions


array([35.56349  , 25.408875 , 16.928946 , 15.766086 , 17.419239 ,
       17.992695 , 20.071798 , 16.490725 , 16.528818 , 17.89488  ,
       18.632595 , 21.419909 , 14.91279  , 16.352617 , 21.519    ,
       21.957006 , 23.386515 , 29.009607 , 16.868143 , 24.300848 ,
       22.131157 , 22.702505 , 22.463701 , 21.492983 , 20.580076 ,
       22.8756   , 23.232796 , 23.108418 , 23.411106 , 26.408672 ,
       42.677773 , 42.175064 , 31.773092 , 19.997364 , 17.566303 ,
       20.261332 , 18.171719 , 17.565464 , 19.50556  , 20.4826   ,
       16.071987 , 19.28671  , 21.125513 , 17.18672  , 15.155695 ,
       14.665497 , 14.665497 , 18.186405 , 21.802341 , 21.752468 ,
       17.039682 , 32.79187  , 43.131256 , 21.446873 , 24.326988 ,
       24.0978   , 29.107061 , 46.195854 , 37.168003 , 33.29088  ,
       31.69041  , 42.39548  , 19.412949 , 19.247957 , 19.754988 ,
       21.689806 , 24.06799  , 25.234608 , 18.80327  , 22.234976 ,
       28.69479  , 41.634384 , 33.872704 , 36.386127 , 24.1952

In [11]:
# Pair each test-row ID with its predicted 'medv' value. Building the frame in
# one constructor call keeps the index of test_df['ID'], matching row for row.
submission_df = pd.DataFrame({
    'ID': test_df['ID'],
    'medv': test_predictions,
})
submission_df


Unnamed: 0,ID,medv
0,3,35.563492
1,6,25.408875
2,8,16.928946
3,9,15.766086
4,10,17.419239
...,...,...
168,496,18.352749
169,497,16.459480
170,499,19.890814
171,501,19.626846


In [12]:
# Write the submission file to a path relative to the working directory;
# index=False keeps the DataFrame index out of the CSV so it contains only the
# 'ID' and 'medv' columns.
submission_df.to_csv('submission_xgboost.csv', index=False)


# The model scored 3.43069 on Kaggle