In [None]:
import pandas as pd
# Load the Bordeaux dataset and use its `year` column as the row index.
df = pd.read_csv(
    "https://dlsun.github.io/pods/data/bordeaux.csv",
    index_col="year",
)
df.head()

In [None]:
# Print a per-column summary (dtype and non-null count) to see how many
# values are missing in each column.
df.info()

In [None]:
import plotly.express as px
import plotly.graph_objects as go
# Layer 1: rows with a known price, coloured by price.
fig1 = px.scatter(
    df.dropna(subset=["price"]),
    x="win", y="summer", color="price",
)
# Layer 2: rows whose price is missing, drawn as open circles.
fig2 = px.scatter(
    df[df["price"].isna()],
    x="win", y="summer", symbol_sequence=["circle-open"],
)
# Overlay both sets of traces on a single figure, reusing fig1's layout.
go.Figure(data=fig1.data + fig2.data, layout=fig1.layout)

In [None]:
# Split on the `year` index. `.loc` label slices include both endpoints, so
# the training set runs up to and including LAST_TRAIN_YEAR and the test set
# starts the following year. Deriving both bounds from one constant keeps the
# two splits from overlapping or leaving a gap if the cutoff is changed.
LAST_TRAIN_YEAR = 1980
df_train = df.loc[:LAST_TRAIN_YEAR].copy()
df_test = df.loc[LAST_TRAIN_YEAR + 1:].copy()

In [None]:
# Preview the first rows of the training split.
df_train.head()

In [None]:
# Target is the `price` column; the features are the `win` and `summer` columns.
y_train = df_train["price"]
X_train = df_train[["win", "summer"]]


In [None]:
# Standardize the features: subtract each column's training mean and divide
# by its training standard deviation.
# Note: pandas' .std() defaults to the sample standard deviation (ddof=1),
# whereas scikit-learn's StandardScaler divides by the population value
# (ddof=0), so the two scalings differ by a constant factor.
X_train_mean, X_train_sd = X_train.mean(), X_train.std()
X_train_scaled = X_train.sub(X_train_mean).div(X_train_sd)


In [None]:
# Scale the test features with the mean and SD computed from the *training*
# data, so the test set does not influence the preprocessing.
X_test = df_test[["win", "summer"]]
X_test_scaled = X_test.sub(X_train_mean).div(X_train_sd)
X_test_scaled


### Using scikit-learn

In [11]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling parameters from the training features only
# (fit returns the fitted scaler itself).
scaler = StandardScaler().fit(X_train)
# Apply that one fitted scaler to both splits -- the test data is scaled
# with the training statistics, never its own.
# Note: this rebinds X_train_scaled / X_test_scaled, which held DataFrames
# from the manual standardization above, to the arrays returned by transform.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [12]:
from sklearn.neighbors import KNeighborsRegressor
# Fit a 5-nearest-neighbours regressor on the standardized training features
# (fit returns the fitted estimator), then predict for the test features.
model = KNeighborsRegressor(n_neighbors=5).fit(X_train_scaled, y_train)
model.predict(X_test_scaled)


array([35.8, 54. , 52.2, 18.4, 35.6, 13.2, 37. , 51.4, 36.6, 36.6, 40.6])

In [13]:
from sklearn.pipeline import make_pipeline
# Chain the scaler and the 5-nearest-neighbours regressor into a single
# estimator, so scaling is applied as part of fit and predict.
pipeline = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(n_neighbors=5),
)

In [None]:
# Fit on the raw training features (the pipeline does the scaling itself),
# then predict for the raw test features. fit returns the fitted pipeline,
# so the calls can be chained.
pipeline.fit(X_train, y_train).predict(X_test)

array([35.8, 54. , 52.2, 18.4, 35.6, 13.2, 37. , 51.4, 36.6, 36.6, 40.6])

### k-NN classifier from scratch (GeeksforGeeks)

In [1]:
import numpy as np
from collections import Counter

In [2]:
def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    point1, point2 : array-like of numbers
        Coordinates of the two points. Both are converted to float arrays;
        their shapes must match or be broadcastable.

    Returns
    -------
    numpy.float64
        Square root of the sum of squared coordinate differences.
    """
    # Convert to float *before* subtracting. With fixed-width integer inputs
    # the arithmetic would otherwise wrap around silently: for uint8,
    # 0 - 20 is 236 and 236**2 wraps to 144, giving a distance of 12
    # instead of 20.
    diff = np.asarray(point1, dtype=float) - np.asarray(point2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

In [3]:
def knn_predict(training_data, training_labels, test_point, k):
    """Classify ``test_point`` by majority vote among its k nearest neighbours.

    Parameters
    ----------
    training_data : array-like
        Training points, one per row (list of lists, NumPy array,
        DataFrame, ...).
    training_labels : iterable
        Hashable labels, positionally aligned with ``training_data``.
    test_point : array-like
        The point to classify.
    k : int
        Number of nearest neighbours that vote. If ``k`` exceeds the number
        of training points, all of them vote.

    Returns
    -------
    The most frequent label among the ``k`` nearest training points. When
    several labels tie, the one whose first occurrence is nearest to
    ``test_point`` is returned (``Counter.most_common`` keeps first-seen
    order among equal counts, and the labels are counted nearest-first).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, if there are no training points, or if
        the number of points and labels differ.
    """
    if k < 1:
        # A non-positive k would otherwise slice from the wrong end of the
        # list (k < 0) or fail with an opaque IndexError (k == 0).
        raise ValueError(f"k must be at least 1, got {k!r}")

    # Walk both inputs by position rather than with `obj[i]`: integer
    # indexing is label-based on a pandas Series (e.g. one indexed by year)
    # and selects a column on a DataFrame.
    points = np.asarray(training_data)
    labels = list(training_labels)
    if len(points) == 0:
        raise ValueError("training_data must contain at least one point")
    if len(points) != len(labels):
        raise ValueError(
            f"got {len(points)} training points but {len(labels)} labels"
        )

    distances = [
        (euclidean_distance(test_point, point), label)
        for point, label in zip(points, labels)
    ]
    # Sort on the distance only, so labels never need to be comparable;
    # the sort is stable, so equidistant points keep their input order.
    distances.sort(key=lambda pair: pair[0])
    k_nearest_labels = [label for _, label in distances[:k]]
    return Counter(k_nearest_labels).most_common(1)[0][0]

In [7]:
# Sanity check of the majority-vote idiom used in knn_predict:
# most_common(1) returns [(label, count)], so [0][0] is the winning label.
# Here 1 appears four times, more than any other value.
Counter([1,2,3,4,5,1,1,2,1,3]).most_common(1)[0][0]

1