In [1]:
import warnings
# Install an 'ignore' filter with no category, message, or module restriction,
# so every warning raised after this cell runs is suppressed.
# NOTE(review): this also hides any warnings raised by the libraries imported
# below — consider narrowing the filter to a specific category.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

## Import Provisional CSV and Perform Basic Data Cleaning

In [9]:
# Read the diamonds price CSV, using its "Unnamed: 0" column as the row index,
# and clean it in a single chained expression.
file_name = "Resources/Diamonds_Prices2022.csv"
df = (
    pd.read_csv(file_name, index_col="Unnamed: 0")
    .dropna(axis="columns", how="all")  # discard columns that are entirely null
    .dropna()                           # then discard rows with any remaining null
)

# Report the cleaned dimensions, then preview the first rows as the cell output
print(df.shape)
df.head()

(53943, 10)


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


## Split the Data into Training and Testing Sets

In [10]:
# Feature matrix: every column except the target. drop() hands back a new
# frame, so df itself keeps its price column.
X = df.drop(columns="price")

# Target vector: the price column as a NumPy array
y = df["price"].values

In [11]:
# Summary statistics for the feature frame; the displayed table covers the
# numeric columns only (carat, depth, table, x, y, z).
# NOTE(review): the recorded output shows a minimum of 0.0 for x, y and z —
# presumably invalid measurements; confirm whether those rows should be removed.
X.describe()

Unnamed: 0,carat,depth,table,x,y,z
count,53943.0,53943.0,53943.0,53943.0,53943.0,53943.0
mean,0.797935,61.749322,57.457251,5.731158,5.734526,3.53873
std,0.473999,1.432626,2.234549,1.12173,1.142103,0.705679
min,0.2,43.0,43.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,6.54,6.54,4.04
max,5.01,79.0,95.0,10.74,58.9,31.8


In [12]:
# Tally how many times each distinct value occurs in the target array y.
# NOTE(review): y is the price column, and the recorded tally is a long list of
# distinct prices with mostly small counts — a class-balance check looks like a
# carry-over from a classification workflow; a numeric summary may be more
# informative here. Confirm intent.
Counter(y)

Counter({326: 2,
         327: 1,
         334: 1,
         335: 1,
         336: 2,
         337: 2,
         338: 1,
         339: 1,
         340: 1,
         342: 1,
         344: 1,
         345: 2,
         348: 1,
         351: 4,
         352: 1,
         353: 3,
         354: 1,
         355: 1,
         357: 8,
         402: 20,
         403: 10,
         404: 7,
         405: 7,
         552: 113,
         553: 6,
         554: 34,
         2757: 12,
         2759: 5,
         2760: 9,
         2761: 6,
         2762: 13,
         2763: 5,
         2764: 2,
         2765: 6,
         2766: 3,
         2767: 4,
         2768: 5,
         2769: 1,
         2770: 9,
         2771: 4,
         2772: 5,
         2773: 2,
         2774: 4,
         2775: 1,
         2776: 9,
         2777: 21,
         2778: 1,
         2779: 2,
         2780: 5,
         2781: 3,
         2782: 11,
         2783: 1,
         2784: 2,
         2787: 3,
         2788: 8,
         2789: 13,
        

In [13]:
# Partition features and target into training and test subsets.
# No test_size is passed, so scikit-learn's default proportion applies;
# random_state=1 fixes the shuffle so the partition is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)