In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [3]:
data = pd.read_csv("clean-data.csv")

In [4]:
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2,1,3,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,334,4.2,4.23,2.63
4,0.31,1,6,3,63.3,58.0,335,4.34,4.35,2.75


In [5]:
data.describe()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
count,53763.0,53763.0,53763.0,53763.0,53763.0,53763.0,53763.0,53763.0,53763.0,53763.0
mean,0.79746,2.554247,2.593698,3.836188,61.748781,57.457207,3930.785336,5.731405,5.733299,3.539367
std,0.473136,1.027364,1.701283,1.724832,1.419309,2.226311,3985.807738,1.118563,1.110473,0.690879
min,0.2,0.0,0.0,0.0,50.8,43.0,326.0,3.73,3.68,2.06
25%,0.4,2.0,1.0,2.0,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,2.0,3.0,4.0,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,3.0,4.0,5.0,62.5,59.0,5324.0,6.54,6.54,4.03
max,5.01,4.0,6.0,7.0,73.6,79.0,18823.0,10.74,10.54,6.98


In [6]:
#copy data to save back up
features = data.copy()

In [7]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53763 entries, 0 to 53762
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53763 non-null  float64
 1   cut      53763 non-null  int64  
 2   color    53763 non-null  int64  
 3   clarity  53763 non-null  int64  
 4   depth    53763 non-null  float64
 5   table    53763 non-null  float64
 6   price    53763 non-null  int64  
 7   x        53763 non-null  float64
 8   y        53763 non-null  float64
 9   z        53763 non-null  float64
dtypes: float64(6), int64(4)
memory usage: 4.1 MB


In [8]:
features.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2,1,3,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,334,4.2,4.23,2.63
4,0.31,1,6,3,63.3,58.0,335,4.34,4.35,2.75


In [9]:
#select the independent and dependent variables

X = features.drop(columns='price', axis=1)
y = features['price']

In [10]:
#train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

In [11]:
num_features = features[["carat", "depth","table", "price","x","y","z"]]
cat_features = features[["cut", "color", "clarity"]]

In [12]:
stan = StandardScaler()

X_train = stan.fit_transform(X_train) 
X_test = stan.fit_transform(X_test)

#pipeline numerical

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler())
])

X_train = num_pipeline.fit_transform(num_features)

**Linear Regression**

In [13]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()


#fitting the model
lr.fit(X_train,y_train)

LinearRegression()

In [14]:
from sklearn.metrics import mean_squared_error
#run the prediction
lr_pred = lr.predict(X_test)

#calculate the mean square error
lr_mse = mean_squared_error(lr_pred, y_test)

#calculate the root mean squared error
lr_rmse = np.sqrt(lr_mse)
print("The root mean squared error in Linear Regression is", lr_rmse)

The root mean squared error in Linear Regression is 1301.5801002109824


**Support Vector Machines Regressor**

In [15]:
stop.fig

NameError: name 'stop' is not defined

In [None]:




parameters = {'kernel':('linear','rbf','poly','sigmoid'), 
                'C': np.arange(1,15,2),
                'gamma': np.arange(1,15,2)}

regsvr = svm.SVR()

#fit the parameters to the gridsearch

cv_regsvr = GridSearchCV(regsvr, parameters, cv= 10)


cv_regsvr.fit(X_train, y_train)


In [None]:
print("tuned hyperparameters: (best parameters) ", cv_regsvr.best_params_)
print("accuracy : ", cv_regsvr.best_score_)

In [None]:
pred_svr = cv_regsvr.predict(X_test)

#calculate the RMSE
mse_svr = mean_squared_error(pred_svr, y_test)
rmse_svr = np.sqrt(mse_svr)

print("The Root mean Squared error in Support Vector Machines is: ", rmse_svr)

**Random Forest Regressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor

trees = np.arange(100, 500, 50)
parameters = {
    'n_estimators': trees,
}

rfr = RandomForestRegressor()

rfr_cv = GridSearchCV(rfr, parameters, cv=10)

rfr_cv.fit(X_train, y_train)

In [None]:
print("tuned hyperparameters: (best parameters) ", rfr_cv.best_params_)
print("accuracy : ", rfr_cv.best_score_)

In [20]:
def model_rmse(reg):
    """A function to predict the X-test and also calculate the root mean score"""

    prediction = reg.predict(X_test)

    #calculate the RMSE
    mean_value = mean_squared_error(prediction, y_test)
    root_mean = np.sqrt(mean_value)

    return("Root Mean Square Error is: ",root_mean)

In [None]:
model_rmse(rfr_cv)

Best Performing model for Random forest regression is at either n_estimator 300 or 450 with a RMSE of 550

**Decision Trees**

In [22]:
from sklearn.tree import DecisionTreeRegressor

parameters = {
    'criterion': ('poisson', 'friedman_mse','absolute_error','squared_error'),
    'splitter': ('best', 'random'),
    'max_depth': np.arange(1,15,2),
    'random_state': np.arange(0,4,1)
}

dtr = DecisionTreeRegressor()

#applying grid search
dtr_cv = GridSearchCV(dtr, parameters, cv=5)

#fitting the model
dtr_cv.fit(X_train, y_train)

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 1252, in fit
    super().fit(
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 351, in fit
    criterion = CRITERIA_REG[self.criterion](self.n_outputs_,
KeyError: 'absolute_error'

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 1252, in fit
    super().fit(
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 351, in fit
    criterion = CRITERIA_REG[self.criterion](self.n_outputs_,
KeyError: 'absolute_er

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid={'criterion': ('poisson', 'friedman_mse',
                                       'absolute_error', 'squared_error'),
                         'max_depth': array([ 1,  3,  5,  7,  9, 11, 13]),
                         'random_state': array([0, 1, 2, 3]),
                         'splitter': ('best', 'random')})

In [23]:
print("tuned hyperparameters: (best parameters ", dtr_cv.best_params_)
print("accuracy: ", dtr_cv.best_score_)

tuned hyperparameters: (best parameters  {'criterion': 'friedman_mse', 'max_depth': 11, 'random_state': 0, 'splitter': 'best'}
accuracy:  0.9742608588265881


In [24]:
model_rmse(dtr_cv)

('Root Mean Square Error is: ', 645.5441164345638)

**XGB Regressor**

In [16]:
import xgboost

In [17]:
import xgboost
from xgboost import XGBRegressor

parameters = {
    'n_estimators': np.arange(20, 170, 20),
    'max_depth': np.arange(1,11,2),
    'eta': [0.1, 0.01, 0.001]
}

xgb = XGBRegressor()

xgb_cv = GridSearchCV(xgb, parameters, cv = 10)

#fit the model
xgb_cv.fit(X_train, y_train)



GridSearchCV(cv=10,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, predictor=None,
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
      

In [18]:
print("Tuned hyperparameters: (best parameters) ", xgb_cv.best_params_)
print("Accuracy: ", xgb_cv.best_score_)

Tuned hyperparameters: (best parameters)  {'eta': 0.1, 'max_depth': 7, 'n_estimators': 160}
Accuracy:  0.9821363048842192


Tuned hyperparameters: (best parameters)  {'eta': 0.1, 'max_depth': 7, 'n_estimators': 160}
Accuracy:  0.9821363048842192

In [21]:
model_rmse(xgb_cv)

('Root Mean Square Error is: ', 531.0856169158978)

The best performing model for XGB Regressor is 531 with an accuracy of 0.98