In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [7]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

from sklearn.preprocessing import StandardScaler,OneHotEncoder

## Import Provisional CSV and Perform Basic Data Cleaning

In [9]:
# Read the diamonds price table, using the CSV column that pandas labels
# "Unnamed: 0" as the row index.
file_name = "Resources/Diamonds_Prices2022.csv"
df = pd.read_csv(
    filepath_or_buffer=file_name,
    index_col="Unnamed: 0",
)

In [10]:
# EDA showed no missing values

# # Drop the null columns where all values are null
# df = df.dropna(axis='columns', how='all')

# # Drop the null rows
# df = df.dropna()

#### Address z == 0
Missing values in `x`, `y`, and `z` are recorded as 0 in the raw data.

In [11]:
# Every row with a zero x or y also has a zero z, so filtering on z alone
# surfaces all rows that carry a zero-valued dimension.
df.loc[df["z"].eq(0)]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
2208,1.0,Premium,G,SI2,59.1,59.0,3142,6.55,6.48,0.0
2315,1.01,Premium,H,I1,58.1,59.0,3167,6.66,6.6,0.0
4792,1.1,Premium,G,SI2,63.0,59.0,3696,6.5,6.47,0.0
5472,1.01,Premium,F,SI2,59.2,58.0,3837,6.5,6.47,0.0
10168,1.5,Good,G,I1,64.0,61.0,4731,7.15,7.04,0.0
11183,1.07,Ideal,F,SI2,61.6,56.0,4954,0.0,6.62,0.0
11964,1.0,Very Good,H,VS2,63.3,53.0,5139,0.0,0.0,0.0
13602,1.15,Ideal,G,VS2,59.2,56.0,5564,6.88,6.83,0.0
15952,1.14,Fair,G,VS1,57.5,67.0,6381,0.0,0.0,0.0
24395,2.18,Premium,H,SI2,59.4,61.0,12631,8.49,8.45,0.0


In [12]:
# z is rebuilt further down as depth / 100 * mean(x, y), which needs BOTH x and y.
# Round-cut diamonds can be quite oval, so a missing x (or y) cannot be inferred
# from the other dimension; rows missing either one are dropped, not imputed.
unrecoverable = (df["x"] == 0) | (df["y"] == 0)

# Rebind rather than mutate with inplace=True: drop() returns a new frame.
df = df.drop(index=df.index[unrecoverable])

# The remaining z == 0 rows all have non-zero x and y.
df[df.z == 0]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
2208,1.0,Premium,G,SI2,59.1,59.0,3142,6.55,6.48,0.0
2315,1.01,Premium,H,I1,58.1,59.0,3167,6.66,6.6,0.0
4792,1.1,Premium,G,SI2,63.0,59.0,3696,6.5,6.47,0.0
5472,1.01,Premium,F,SI2,59.2,58.0,3837,6.5,6.47,0.0
10168,1.5,Good,G,I1,64.0,61.0,4731,7.15,7.04,0.0
13602,1.15,Ideal,G,VS2,59.2,56.0,5564,6.88,6.83,0.0
24395,2.18,Premium,H,SI2,59.4,61.0,12631,8.49,8.45,0.0
26124,2.25,Premium,I,SI1,61.3,58.0,15397,8.52,8.42,0.0
27113,2.2,Premium,H,SI1,61.2,59.0,17265,8.42,8.37,0.0
27504,2.02,Premium,H,VS2,62.7,53.0,18207,8.02,7.95,0.0


In [13]:
# A zero z can be rebuilt from the other measurements:
#   z = depth / 100 * mean(x, y)

missing_z = df["z"] == 0

# Remember which rows are about to be patched so they can be shown afterwards.
index_values = df.index[missing_z].values
print(index_values)

mean_xy = (df["x"] + df["y"]) / 2
df["z"] = np.where(missing_z, df["depth"] / 100 * mean_xy, df["z"])

# The rows whose z was previously 0, now with the reconstructed value.
df.loc[index_values]

[ 2208  2315  4792  5472 10168 13602 24395 26124 27113 27504 27740 51507]


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
2208,1.0,Premium,G,SI2,59.1,59.0,3142,6.55,6.48,3.850365
2315,1.01,Premium,H,I1,58.1,59.0,3167,6.66,6.6,3.85203
4792,1.1,Premium,G,SI2,63.0,59.0,3696,6.5,6.47,4.08555
5472,1.01,Premium,F,SI2,59.2,58.0,3837,6.5,6.47,3.83912
10168,1.5,Good,G,I1,64.0,61.0,4731,7.15,7.04,4.5408
13602,1.15,Ideal,G,VS2,59.2,56.0,5564,6.88,6.83,4.05816
24395,2.18,Premium,H,SI2,59.4,61.0,12631,8.49,8.45,5.03118
26124,2.25,Premium,I,SI1,61.3,58.0,15397,8.52,8.42,5.19211
27113,2.2,Premium,H,SI1,61.2,59.0,17265,8.42,8.37,5.13774
27504,2.02,Premium,H,VS2,62.7,53.0,18207,8.02,7.95,5.006595


#### Possible feature selection
- Combine colors I + J, as they contain relatively few values.
- Combine clarities IF + VVS1 and I1 + SI2: IF and I1 contain relatively few values, IF is similar in clarity to VVS1, and I1 is similar in clarity to SI2.

In [14]:
# One-hot encode the feature columns that hold strings/objects.
type_objs = ['cut', 'color', 'clarity']

# Pin the indicator dtype explicitly. The get_dummies default changed from
# uint8 to bool in pandas 2.0; fixing it keeps the indicator columns as
# 0/1 integers regardless of the installed pandas version.
df = pd.get_dummies(df, columns=type_objs, dtype=np.uint8)

print(df.shape)
df.head()

(53935, 27)


Unnamed: 0,carat,depth,table,price,x,y,z,cut_Fair,cut_Good,cut_Ideal,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
1,0.23,61.5,55.0,326,3.95,3.98,2.43,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,0.21,59.8,61.0,326,3.89,3.84,2.31,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0.23,56.9,65.0,327,4.05,4.07,2.31,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,0.29,62.4,58.0,334,4.2,4.23,2.63,0,0,0,...,1,0,0,0,0,0,0,1,0,0
5,0.31,63.3,58.0,335,4.34,4.35,2.75,0,1,0,...,0,1,0,0,0,1,0,0,0,0


In [15]:
# Display the column labels of the encoded frame.
df.columns

Index(['carat', 'depth', 'table', 'price', 'x', 'y', 'z', 'cut_Fair',
       'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_D',
       'color_E', 'color_F', 'color_G', 'color_H', 'color_I', 'color_J',
       'clarity_I1', 'clarity_IF', 'clarity_SI1', 'clarity_SI2', 'clarity_VS1',
       'clarity_VS2', 'clarity_VVS1', 'clarity_VVS2'],
      dtype='object')

# Split the Data into Training and Testing

In [16]:
# Features: every column except the target (drop returns a new frame,
# so df itself is left untouched).
X = df.drop(columns="price")

# Target: the price column as a NumPy array.
y = df["price"].to_numpy()

In [17]:
# Summary statistics for the feature columns.
X.describe()

Unnamed: 0,carat,depth,table,x,y,z,cut_Fair,cut_Good,cut_Ideal,cut_Premium,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
count,53935.0,53935.0,53935.0,53935.0,53935.0,53935.0,53935.0,53935.0,53935.0,53935.0,...,53935.0,53935.0,53935.0,53935.0,53935.0,53935.0,53935.0,53935.0,53935.0,53935.0
mean,0.797874,61.749254,57.457097,5.732008,5.735254,3.540261,0.029832,0.090924,0.399536,0.255697,...,0.100528,0.052063,0.013739,0.033188,0.242273,0.17039,0.151479,0.227255,0.067748,0.093928
std,0.473973,1.432519,2.234123,1.119639,1.140311,0.702647,0.170126,0.287504,0.489808,0.436256,...,0.300706,0.222155,0.116406,0.179129,0.428463,0.375979,0.358518,0.419063,0.251316,0.291731
min,0.2,43.0,43.0,3.73,3.68,1.07,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,4.71,4.72,2.91,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.7,61.8,57.0,5.7,5.71,3.53,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.04,62.5,59.0,6.54,6.54,4.04,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,5.01,79.0,95.0,10.74,58.9,31.8,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
# Check the balance of our target values
# Counter(y)

In [26]:
# Hold out a quarter of the rows for testing; the fixed random_state makes
# the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1  # 0.25 is also the default
)

In [27]:
# # Create a StandardScaler instances
# scaler = StandardScaler()

# # Fit the StandardScaler
# X_scaler = scaler.fit(X_train)

# # Scale the data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

In [28]:
# import statsmodels.api as sm


In [29]:
from sklearn.linear_model import LinearRegression

In [30]:
# Fit a linear regression on the training split.
lm = LinearRegression()
lm.fit(X=X_train, y=y_train)

LinearRegression()

In [31]:
# R-squared on the training split first, then on the held-out test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(lm.score(features, target))

0.9196831285444621
0.9214680077548184
