---
title: Nearest Neighbor Model
author: Andrei Akopian
date: 2026-01-28
format:
  html:
    code-fold: true
    code-summary: "Show the code"
  pdf:
    code-overflow: wrap
    echo: false
    output: true
---

In [3]:
import numpy as np
import pandas as pd
import pathlib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [4]:
# Render all figure text in Times New Roman.
plt.rc('font', family='Times New Roman')

In [5]:
# helper functions
def open_file(filename):
    """ load a data file, picking the parser from the file extension

    The extension is matched case-insensitively, so "data.CSV" is read the
    same way as "data.csv". Only ".csv" is registered at the moment.

    return whatever the selected parser returns (pd.read_csv for ".csv")
    raise KeyError when no parser is registered for the extension
    """
    # extension -> parser; add entries here to support more formats
    parsing_functions = {
        ".csv": pd.read_csv,
    }
    # PurePath only inspects the name, it never touches the filesystem
    file_format = pathlib.PurePath(filename).suffix.lower()
    if file_format not in parsing_functions:
        supported = ", ".join(sorted(parsing_functions))
        # still a KeyError, as the plain dict lookup used to raise, but with a usable message
        raise KeyError(
            f"unsupported file format {file_format!r} for {str(filename)!r}; "
            f"supported formats: {supported}"
        )
    return parsing_functions[file_format](filename)

def take_subset(df,start,end):
    """ grab a subset of wavelengths from the dataframe

    A column counts as a wavelength when its label consists only of decimal
    digits, either as a string ("900") or as an integer (900). Wavelength
    columns with start <= label <= end are kept in their original order.

    return (fractions, spectra, spectra_sources)
        fractions: the "npv_fraction", "gv_fraction" and "soil_fraction" columns
        spectra: the selected wavelength columns
        spectra_sources: the "Spectra" column as a one-column dataframe
    """

    wanted = []
    for label in df.columns.to_list():
        # str() lets integer labels through instead of raising AttributeError;
        # isdecimal() (unlike isdigit()) rejects characters such as
        # superscripts that int() cannot parse.
        text = str(label)
        if text.isdecimal() and start <= int(text) <= end:
            wanted.append(label)
    fractions = df[["npv_fraction", "gv_fraction", "soil_fraction"]]
    spectra = df[wanted]
    spectra_sources = df[["Spectra"]]
    return fractions, spectra, spectra_sources
def simple_histogram(data=[1,2,3],title="Title",x="x-axis",y='y-axis',bins=10):
    """ draw `data` as a histogram on a new 5x3 figure

    title, x and y are used as the plot title and the axis labels;
    bins is forwarded to the histogram call. Nothing is returned.
    """
    _, axes = plt.subplots(figsize=(5, 3))
    axes.hist(data, bins=bins)
    axes.set_title(title)
    axes.set_xlabel(x)
    axes.set_ylabel(y)
    # emits a blank line after the figure is built
    print()

In [6]:
# Read the dataset, then keep the fraction columns and the 900-1700 wavelength
# columns; the spectra-source column is not needed here.
df = open_file("unmixing/original_data.csv")
fractions, nr900to1700, _ = take_subset(df, 900, 1700)

# Finding Best KNN hyperparameters

In [None]:
# Normal KNN regression
# Half of the samples are held out for evaluation; the fixed random_state
# keeps the split the same between runs.
X_train, X_test, y_train, y_test = train_test_split(nr900to1700, fractions, test_size=0.5, random_state=42)

k = 3  # number of neighbors; passed to the model below so changing it takes effect
knn = KNeighborsRegressor(n_neighbors=k, metric='cosine') # metric defines the distance definition
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# R^2 of the predicted fractions on the held-out half
current_r2 = r2_score(y_test, y_pred)