<a href="https://colab.research.google.com/github/tomdavid92/ML-models/blob/main/diamonds_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import necessary libraries**

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# **Import dataset**

In [3]:
from google.colab import drive

# Colab-specific: Google Drive has to be mounted before the CSV is reachable.
# The mount point and file location are named once here so they can be changed
# in a single place instead of being repeated inside the calls below.
DRIVE_MOUNT_POINT = '/content/drive'
DATA_PATH = f'{DRIVE_MOUNT_POINT}/MyDrive/diamonds.csv'

drive.mount(DRIVE_MOUNT_POINT)
df = pd.read_csv(DATA_PATH)

Mounted at /content/drive


# **Exploratory data analysis**

In [30]:
# Preview the first rows to see which columns exist and what typical values look like.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [31]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(53940, 10)

In [32]:
# Per-column dtype and non-null count: shows which columns are text (object)
# and whether any column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [33]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output shows a minimum of 0.0 for x, y and z —
# presumably invalid measurements; confirm whether those rows should be cleaned.
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


# **Data preparation**

## Data separation

In [34]:
# Predictors are every column except the target; the target is the price column.
X = df.drop(columns='price')
y = df['price']

In [36]:
# Feature dtypes before any conversion.
X.dtypes

Unnamed: 0,0
carat,float64
cut,object
color,object
clarity,object
depth,float64
table,float64
x,float64
y,float64
z,float64


## Data splitting

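Convert the text columns to the pandas `category` dtype so they can later be loaded into an XGBoost `DMatrix` with `enable_categorical=True`, then split the data into training and test sets.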

In [11]:
# Every non-numeric column is cast to pandas' 'category' dtype in one astype call;
# the DMatrix objects built later are created with enable_categorical=True.
category_list_columns = X.select_dtypes(exclude=np.number).columns.tolist()
X = X.astype({name: 'category' for name in category_list_columns})

In [12]:
# Re-inspect the feature dtypes after the category cast.
X.dtypes

Unnamed: 0,0
carat,float64
cut,category
color,category
clarity,category
depth,float64
table,float64
x,float64
y,float64
z,float64


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# **Model Building**

In [38]:
import xgboost as xgb

# Wrap each split in a DMatrix for the native XGBoost API.
# enable_categorical=True lets the 'category' dtype columns through as-is.
d_train_reg = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
d_test_reg = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

XGBoost Regression

In [39]:
# Squared-error regression objective, histogram-based tree construction.
params = dict(objective="reg:squarederror", tree_method="hist")

In [40]:
# Number of boosting rounds for the baseline model.
n = 100
model = xgb.train(params, d_train_reg, num_boost_round=n)

Evaluation

In [41]:
from sklearn.metrics import mean_squared_error

# Score the baseline model on the held-out split.
preds = model.predict(d_test_reg)

# Take the square root explicitly rather than passing squared=False: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
# np.sqrt(MSE) gives the same RMSE on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 545.192


In [44]:
# Same configuration and round count as the baseline run.
params = dict(objective="reg:squarederror", tree_method="hist")
n = 100

# Datasets scored after each boosting round; each entry is (DMatrix, name shown in the log).
# NOTE(review): the "validation" entry is the test split, not a separate validation set.
evals = [
    (d_train_reg, "train"),
    (d_test_reg, "validation"),
]

In [47]:
# Retrain while reporting the RMSE of every dataset in `evals` after each round.
model = xgb.train(params, d_train_reg, num_boost_round=n, evals=evals)

[0]	train-rmse:2861.71326	validation-rmse:2853.85688
[1]	train-rmse:2087.22208	validation-rmse:2080.88058
[2]	train-rmse:1549.27061	validation-rmse:1544.13098
[3]	train-rmse:1185.13563	validation-rmse:1182.53339
[4]	train-rmse:943.01001	validation-rmse:942.11856
[5]	train-rmse:789.57197	validation-rmse:790.16193
[6]	train-rmse:690.62100	validation-rmse:695.83846
[7]	train-rmse:630.78564	validation-rmse:641.87271
[8]	train-rmse:594.21550	validation-rmse:611.64636
[9]	train-rmse:571.71186	validation-rmse:593.20351
[10]	train-rmse:554.29819	validation-rmse:579.26422
[11]	train-rmse:541.55608	validation-rmse:570.45546
[12]	train-rmse:532.56782	validation-rmse:565.51254
[13]	train-rmse:524.23258	validation-rmse:560.53057
[14]	train-rmse:518.15856	validation-rmse:556.80032
[15]	train-rmse:512.93476	validation-rmse:555.75156
[16]	train-rmse:510.11780	validation-rmse:554.48049
[17]	train-rmse:506.87514	validation-rmse:553.83358
[18]	train-rmse:502.48391	validation-rmse:551.51128
[19]	train-rms

In [49]:
# Much larger round budget than before; progress is logged every 250 rounds.
n = 5000
params = dict(objective="reg:squarederror", tree_method="hist")

# The validation set is listed first here, so it is the first column of each log line.
evals = [(d_test_reg, "validation"), (d_train_reg, "train")]

# NOTE(review): in the recorded run the validation RMSE is lowest at the round-250
# checkpoint among the logged rounds and then drifts upward while the train RMSE
# keeps falling, i.e. the extra rounds overfit. If early_stopping_rounds is added
# to this call, note that xgb.train monitors the LAST entry of `evals`, so the
# validation set would have to be moved to the end of the list.
model = xgb.train(
    params,
    d_train_reg,
    num_boost_round=n,
    evals=evals,
    verbose_eval=250,
)

[0]	validation-rmse:2853.85688	train-rmse:2861.71326
[250]	validation-rmse:553.13553	train-rmse:285.02438
[500]	validation-rmse:562.47732	train-rmse:206.74470
[750]	validation-rmse:569.81204	train-rmse:161.82647
[1000]	validation-rmse:574.70442	train-rmse:131.56736
[1250]	validation-rmse:577.92934	train-rmse:110.25011
[1500]	validation-rmse:579.69954	train-rmse:93.80344
[1750]	validation-rmse:581.91763	train-rmse:81.16993
[2000]	validation-rmse:583.14571	train-rmse:70.86212
[2250]	validation-rmse:584.42655	train-rmse:62.16493
[2500]	validation-rmse:585.14955	train-rmse:55.05947
[2750]	validation-rmse:585.92294	train-rmse:49.26530
[3000]	validation-rmse:586.31957	train-rmse:44.30367
[3250]	validation-rmse:586.67409	train-rmse:39.85742
[3500]	validation-rmse:587.01904	train-rmse:36.21849
[3750]	validation-rmse:587.30401	train-rmse:32.97982
[4000]	validation-rmse:587.53382	train-rmse:30.16726
[4250]	validation-rmse:587.76085	train-rmse:27.57122
[4500]	validation-rmse:587.97758	train-rmse: