In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from math import sqrt


# Load the datasets

In [2]:
# Keep the input locations in named variables so they are easy to spot and change.
train_csv = '../data/train.csv'
test_csv = '../data/test.csv'

# Read both CSV files into DataFrames.
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)


In [3]:
# Preview the first five rows of the training data.
train_df.head(n=5)


Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
3,5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
4,7,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9


In [4]:
# Preview the first five rows of the test data (it has no 'medv' column).
test_df.head(n=5)


Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
0,3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
1,6,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21
2,8,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15
3,9,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93
4,10,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1


# Split the training data into training and validation sets

In [5]:
# Separate the target column from the predictor columns.
target_col = 'medv'
y = train_df[target_col]
X = train_df.drop(columns=[target_col])
# NOTE(review): X still contains the 'ID' column, which looks like a row
# identifier rather than a real feature — confirm whether it should be used
# for training. Removing it here also requires removing it before predicting.

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Number of rows in the training split.
X_train.shape[0]


266

In [7]:
# Number of rows in the validation split.
X_validation.shape[0]


67

# Initialize and train the model

In [8]:
# Hyperparameters collected in one place so they are easy to review and tune.
lgbm_params = dict(
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=100,
    early_stopping_rounds=10,
    verbose=-1,
)
model = LGBMRegressor(**lgbm_params)

# NOTE(review): the validation split passed as eval_set is presumably what
# early_stopping_rounds monitors. The same split is scored in the next cell,
# so that RMSE may be optimistic — confirm.
validation_pair = (X_validation, y_validation)
model.fit(X_train, y_train, eval_set=[validation_pair])


# Predict on the validation set and report the RMSE

In [9]:
# Score the validation rows with the fitted model.
y_pred = model.predict(X_validation)

# RMSE is the square root of the mean squared error between targets and predictions.
validation_mse = mean_squared_error(y_validation, y_pred)
rmse = sqrt(validation_mse)
print(f'Validation RMSE: {rmse}')


Validation RMSE: 2.8550089013119884


# Predict on the test set

In [10]:
# Select the test columns by name, in the same order as the training features,
# instead of relying on test.csv matching the training layout by position.
# A column missing from test_df raises a KeyError here rather than producing
# predictions from misaligned features.
test_features = test_df[X_train.columns]

# Predict the target for every test row; the array is displayed as the cell output.
test_predictions = model.predict(test_features)
test_predictions


array([36.36302595, 24.06269934, 16.69024107, 15.6335339 , 18.23115111,
       18.61642148, 19.78300178, 15.58903005, 15.58294457, 16.69155256,
       17.04377827, 18.09973501, 15.15926543, 14.75969218, 22.27002747,
       22.38165503, 24.49831283, 28.64068855, 15.07064617, 22.65104846,
       23.38538122, 24.10640688, 20.9545313 , 22.03673544, 21.36777116,
       21.83567967, 22.9436849 , 24.50717857, 24.33197852, 26.64551282,
       41.33261813, 41.98705112, 36.18066752, 19.52724672, 17.82426305,
       22.04608218, 18.52389671, 17.35735081, 18.26613137, 19.74077011,
       16.4206413 , 17.97940651, 19.60390072, 17.33751054, 17.42645238,
       15.43172965, 15.69553911, 17.37319067, 19.88577326, 19.55537112,
       18.37881564, 34.26486126, 44.68469507, 19.99302013, 22.5617462 ,
       24.53605638, 33.66314348, 43.07283751, 42.5161844 , 37.52802996,
       33.91717267, 42.57471113, 21.18051208, 17.49297003, 21.14572321,
       22.7553366 , 27.82974157, 24.68947157, 20.40235885, 22.63

In [11]:
# Pair each test row's ID with its predicted 'medv' value.
submission_df = pd.DataFrame({
    'ID': test_df['ID'],
    'medv': test_predictions,
})
submission_df


Unnamed: 0,ID,medv
0,3,36.363026
1,6,24.062699
2,8,16.690241
3,9,15.633534
4,10,18.231151
...,...,...
168,496,18.778522
169,497,16.262040
170,499,20.725601
171,501,19.422011


In [12]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
submission_path = 'submission_lgbm.csv'
submission_df.to_csv(submission_path, index=False)


# The model achieved a score of 4.12283 on Kaggle