### Step 1: Load the Data

In [29]:
import pandas as pd 
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
# Load dataset
import os

# Location of the diamonds CSV. The default is the original local path; set the
# DIAMONDS_CSV environment variable to run the notebook on another machine
# without editing this cell.
DIAMONDS_CSV = os.environ.get("DIAMONDS_CSV", r"D:\Datasets\diamonds.csv")

df = pd.read_csv(DIAMONDS_CSV)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [16]:
# Count rows that repeat an earlier row in every column.
df.duplicated().sum()

146

In [17]:
# Remove exact-duplicate rows (the first occurrence of each is kept).
# Rebinding the name instead of using `inplace=True` avoids in-place mutation of
# the frame; the original index labels are preserved.
df = df.drop_duplicates()
df.shape

(53794, 10)

### Step 2: Identify Input and Output Variables

In [23]:
# Features are every column except "price"; the target is the "price" column.
x, y = df.drop(columns="price"), df["price"]

In [24]:
# Feature matrix dimensions (rows, feature columns) after removing "price".
x.shape

(53794, 9)

In [25]:
# Target vector length; it should match the number of feature rows.
y.shape

(53794,)

### Step 3: Train–Test Split (75:25)

In [4]:
from sklearn.model_selection import train_test_split

In [26]:
# 75:25 split; the fixed random_state makes the partition reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((40345, 9), (13449, 9), (40345,), (13449,))

### Step 4: Data Preprocessing on X_train

In [27]:
# Preview the training features before any encoding or scaling.
x_train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
12820,1.02,Ideal,G,VS2,62.4,57.0,6.47,6.36,4.00
19997,1.05,Very Good,F,VVS2,61.3,59.0,6.48,6.56,4.00
6099,0.91,Premium,G,SI1,62.6,58.0,6.17,6.14,3.85
37984,0.32,Ideal,D,VVS2,60.9,57.0,4.39,4.45,2.70
24865,1.52,Premium,G,VS2,61.9,56.0,7.39,7.28,4.54
...,...,...,...,...,...,...,...,...,...
11311,1.00,Premium,H,VS2,60.4,58.0,6.51,6.46,3.92
44869,0.63,Ideal,G,SI1,61.7,54.0,5.52,5.56,3.42
38271,0.32,Ideal,J,SI1,61.8,54.9,4.39,4.42,2.72
860,0.90,Premium,J,SI1,62.8,59.0,6.13,6.03,3.82


In [30]:
# List the distinct labels of each categorical column. All three results are
# displayed because ast_node_interactivity is set to 'all'.
# NOTE(review): this inspects the full `df`, not only `x_train` — confirm that is intended.
df['cut'].unique()
df['clarity'].unique()
df['color'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler

In [33]:
# Categorical columns that are encoded with an explicit label order.
ordinal_cols = ['cut', 'clarity', 'color']

In [32]:
# Explicit label order for each ordinal feature. When these lists are given to the
# OrdinalEncoder, a label's position in its list becomes its encoded value, so the
# first label maps to 0 (e.g. 'Fair' -> 0.0 and 'Ideal' -> 4.0 in the transformed output).
cut_order = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
clarity_order = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
color_order = ['J', 'I', 'H', 'G', 'F', 'E', 'D']

In [34]:
# One category list per encoded column, in the same order as the columns in
# ordinal_cols (cut, clarity, color).
oe = OrdinalEncoder(categories=[cut_order,clarity_order,color_order])
oe

In [35]:
# Numeric columns that are passed to the MinMaxScaler.
num_cols = ['carat', 'depth', 'table', 'x', 'y', 'z']

In [36]:
# Scaler with default settings: rescales each feature using the minimum and
# maximum it sees when fitted.
min_max=MinMaxScaler()
min_max

### __Categorical Data Encoding and Numerical Data Rescaling on x_train__

In [37]:
# Single transformer for all features: ordinal-encode the categorical columns,
# min-max scale the numeric ones, pass any other column through unchanged, and
# return a DataFrame with the original column names.
# NOTE(review): the ordinal codes stay on their raw 0..7 scale while the numeric
# columns are squeezed into 0..1, so the encoded columns carry more weight in a
# Euclidean distance — confirm this is intended for the KNN below.
ct = ColumnTransformer(
    transformers=[
        ('ordinal', oe, ordinal_cols),
        ('scaler', min_max, num_cols),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform='pandas')

In [38]:
# Fit the encoder and scaler on the training features only, then transform them.
x_train_transformed = ct.fit_transform(x_train)
x_train_transformed.head()

Unnamed: 0,cut,clarity,color,carat,depth,table,x,y,z
12820,4.0,3.0,3.0,0.170478,0.538889,0.269231,0.602421,0.10798,0.496278
19997,2.0,5.0,4.0,0.176715,0.508333,0.307692,0.603352,0.111375,0.496278
6099,3.0,2.0,3.0,0.147609,0.544444,0.288462,0.574488,0.104244,0.477667
37984,4.0,5.0,6.0,0.024948,0.497222,0.269231,0.408752,0.075552,0.334988
24865,3.0,3.0,3.0,0.274428,0.525,0.25,0.688082,0.123599,0.563275


### Step 5: Data Preprocessing on X_test


In [39]:
# Preview the held-out test features before transformation.
x_test

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
43657,0.71,Fair,J,VS2,64.9,54.0,5.63,5.53,3.62
4274,0.90,Very Good,E,SI2,61.0,59.0,6.14,6.18,3.76
47412,0.57,Ideal,G,VS1,62.1,56.0,5.30,5.33,3.30
44437,0.51,Premium,E,VS2,62.5,60.0,5.08,5.10,3.18
13975,1.19,Ideal,I,SI1,62.3,55.0,6.80,6.82,4.24
...,...,...,...,...,...,...,...,...,...
43980,0.32,Very Good,H,SI2,60.4,59.0,4.39,4.42,2.66
1115,0.73,Very Good,H,VS1,60.8,57.0,5.80,5.83,3.54
48829,0.52,Ideal,G,VVS1,62.0,55.0,5.15,5.18,3.20
42876,0.56,Fair,E,SI1,64.5,61.0,5.22,5.09,3.33


In [40]:
# Apply the transformer that was fitted on x_train; it is not refitted on the test set.
x_test_transformed = ct.transform(x_test)

In [41]:
# Inspect the encoded and scaled test features.
x_test_transformed

Unnamed: 0,cut,clarity,color,carat,depth,table,x,y,z
43657,0.0,3.0,0.0,0.106029,0.608333,0.211538,0.524209,0.093888,0.449132
4274,2.0,1.0,5.0,0.145530,0.500000,0.307692,0.571695,0.104924,0.466501
47412,4.0,4.0,3.0,0.076923,0.530556,0.250000,0.493482,0.090492,0.409429
44437,3.0,3.0,5.0,0.064449,0.541667,0.326923,0.472998,0.086587,0.394541
13975,4.0,2.0,1.0,0.205821,0.536111,0.230769,0.633147,0.115789,0.526055
...,...,...,...,...,...,...,...,...,...
43980,2.0,1.0,2.0,0.024948,0.483333,0.307692,0.408752,0.075042,0.330025
1115,2.0,4.0,2.0,0.110187,0.494444,0.269231,0.540037,0.098981,0.439206
48829,4.0,6.0,3.0,0.066528,0.527778,0.230769,0.479516,0.087946,0.397022
42876,0.0,2.0,5.0,0.074844,0.597222,0.346154,0.486034,0.086418,0.413151


In [43]:
# Both frames should have the same number of columns after transformation.
print(x_train_transformed.shape, x_test_transformed.shape, sep="\n")


(40345, 9)
(13449, 9)


### Step 6: KNN Regressor from scratch

In [44]:
# Convert the transformed frames and the training target to plain NumPy arrays
# for the hand-written KNN functions.
x_train_arr, x_test_arr, y_train_arr = (
    np.array(source) for source in (x_train_transformed, x_test_transformed, y_train)
)

In [45]:
# Distance calculation

def distance_from_all_train(x_train_arr, one_test_row):
    """Return the Euclidean distance from ``one_test_row`` to every training row.

    ``one_test_row`` is broadcast against ``x_train_arr``; the squared
    differences are summed along axis 1, giving one distance per training row.
    """
    offsets = x_train_arr - one_test_row
    squared_lengths = np.square(offsets).sum(axis=1)
    return np.sqrt(squared_lengths)

In [46]:
def predict_one(x_train_arr, y_train_arr, one_test_row, k):
    """Predict the target of one test row as the mean of its k nearest neighbours.

    Parameters
    ----------
    x_train_arr : array of training feature rows.
    y_train_arr : array of training targets, aligned with ``x_train_arr`` by position.
    one_test_row : feature values of the row to predict.
    k : number of nearest neighbours to average; must be at least 1. If ``k``
        exceeds the number of training rows, all training rows are averaged.

    Returns
    -------
    The mean target value of the selected neighbours.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    # Reject k < 1 before doing any work: k = 0 would average an empty selection
    # (NaN), and a negative k would slice "all but the last |k|" sorted rows
    # instead of the nearest ones.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Step 1: compute distances
    distances = distance_from_all_train(x_train_arr, one_test_row)

    # Step 2: get indices of k nearest neighbors
    k_indices = np.argsort(distances)[:k]

    # Step 3: take mean of their target values (REGRESSION)
    prediction = np.mean(y_train_arr[k_indices])

    return prediction

In [47]:
def predict_all(x_train_arr, y_train_arr, x_test_arr, k):
    """Predict every row of ``x_test_arr`` with ``predict_one``.

    Returns a NumPy array holding one prediction per test row, in row order.
    """
    row_count = len(x_test_arr)
    return np.array([
        predict_one(x_train_arr, y_train_arr, x_test_arr[row_idx], k)
        for row_idx in range(row_count)
    ])


In [48]:
# Work on the first 5,000 training rows and the first 500 test rows only.
x_train_small, y_train_small = x_train_arr[:5000], y_train_arr[:5000]
x_test_small = x_test_arr[:500]

In [56]:
# True prices for the same first 500 test rows used in x_test_small.
y_test_small = y_test.head(500)

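#### Working on a subset is important because the scratch KNN is brute force: it computes a distance from every test row to every training row, i.e. O(n × m) distance computations for n test rows and m training rows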

In [50]:
# Number of neighbours averaged for each prediction.
k = 5
y_pred_scratch = predict_all(
    x_train_arr=x_train_small,
    y_train_arr=y_train_small,
    x_test_arr=x_test_small,
    k=k,
)

In [51]:
# Display the scratch-KNN predictions for the test subset.
y_pred_scratch

array([ 5284.2,  4374.2,  1941.8,  1513.8,  5154.8,   597.4,   716.6,
        2337.8,  1684.8,  4724. ,  2772.8,  5685.8,  5119. ,   911.6,
        1984. ,  5235. ,  4319.6,  5235. ,  3165.4,   979.4,  3118.2,
        4364.8,  5285.2,  3960.2,   755. ,  4432.4,   616.8,  4159.6,
       10634.4,  3874.4,  8219.8,  5043. ,   527.8,   800. ,   798.6,
        1066.2,  4361.8,  1659. ,  2564.4,  7165.2,  1162.8,  1524. ,
         494. ,  2467. ,  7750.8,  4687. ,  1952.4,  1592.6,  1590.6,
        3150.4,  1306.2,  2536.2,   631.8,   669.6,   628. ,  5497.2,
        1725.6, 15244.4,  2094.4,  1176.8,  5650.4,  4786. ,  4803.4,
        1801.4,  3574.2,  2799.2, 11539.4,   891.2, 12692.6,  6797.8,
        4384.2,  1057.6,  4456.8,  4880. ,  2431.8,  5355.2,  4387.8,
        1408.8,  5657.4,   939.8,  1433.4,  2781.6,  3825.8,  2220.8,
        7386. ,  2946.2,  3008.6,  6163.2,   563.8,  2421. ,  8721.2,
        1629.8,  4831.6,  5235. ,  1048. ,   466.6,   569.4,  4727.6,
       12486.6,   88

In [52]:
# There should be one prediction per row of x_test_small.
y_pred_scratch.shape

(500,)

In [59]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the scratch predictions against the matching test targets.
print("Scratch KNN")
for metric_label, metric_value in (
    ("MAE :", mean_absolute_error(y_test_small, y_pred_scratch)),
    ("RMSE:", np.sqrt(mean_squared_error(y_test_small, y_pred_scratch))),
    ("R²  :", r2_score(y_test_small, y_pred_scratch)),
):
    print(metric_label, metric_value)

Scratch KNN
MAE : 754.6912
RMSE: 1649.9637441834896
R²  : 0.8159056876474406


In [61]:
from sklearn.neighbors import KNeighborsRegressor

# Same k and the same training subset as the scratch run, so the two results
# are directly comparable.
k = 5
knn_model = KNeighborsRegressor(n_neighbors=k)   # default distance = Euclidean
knn_model.fit(x_train_small, y_train_small)

In [63]:
# Predict on the same small test

# Predict with the fitted sklearn model, then preview the first ten values.
y_pred_sklearn = knn_model.predict(x_test_small)
y_pred_sklearn[:10]

array([5284.2, 4374.2, 1941.8, 1513.8, 5154.8,  597.4,  716.6, 2337.8,
       1684.8, 4724. ])

In [67]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Metrics for the sklearn predictions on the same test subset; RMSE is derived
# from the stored MSE.
mse_sk = mean_squared_error(y_test_small, y_pred_sklearn)
mae_sk, rmse_sk, r2_sk = (
    mean_absolute_error(y_test_small, y_pred_sklearn),
    np.sqrt(mse_sk),
    r2_score(y_test_small, y_pred_sklearn),
)

In [68]:
# Compute the scratch-KNN metrics in this cell so the comparison does not depend
# on `mae`, `rmse` and `r2` lingering in the kernel; no other cell assigns them,
# so a fresh "Restart & Run All" would otherwise stop here with a NameError.
mae = mean_absolute_error(y_test_small, y_pred_scratch)
rmse = np.sqrt(mean_squared_error(y_test_small, y_pred_scratch))
r2 = r2_score(y_test_small, y_pred_scratch)

print("---- Comparison (Scratch vs Sklearn) ----")
print("MAE  :", mae, "vs", mae_sk)
print("RMSE :", rmse, "vs", rmse_sk)
print("R2   :", r2, "vs", r2_sk)

---- Comparison (Scratch vs Sklearn) ----
MAE  : 754.6912 vs 754.6912
RMSE : 1649.9637441834896 vs 1649.9637441834896
R2   : 0.8159056876474406 vs 0.8159056876474406


#### Step 8: Observations from Scratch Operation of KNN vs Using KNN from Sklearn

- The scratch KNN regressor and sklearn’s KNN regressor produced identical evaluation metrics when trained and tested on the same data subset with the same value of k and distance metric.

- This confirms that the scratch implementation correctly replicates the core logic of the KNN algorithm, including distance computation, neighbor selection, and prediction aggregation.

- The scratch implementation is expected to be computationally slower because it relies on an explicit Python loop over the test rows and brute-force distance calculations (runtime was not measured in this notebook).

- While scratch implementation is useful for understanding the internal working of KNN, sklearn’s implementation is preferred for real-world applications and large datasets.

- Any differences between the two approaches typically arise from changes in the distance metric, the weighting scheme (for example, `weights='distance'`), or differences in data preprocessing.
