# Milk
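Dataset obtained from https://www.kaggle.com/datasets/cpluzshrijayan/milkquality?select=milknew.csv

This notebook fits k-nearest-neighbors classifiers on every non-empty subset of the feature columns to predict the milk `Grade`, and reports the most accurate combination of columns and neighbor count.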

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

from itertools import combinations

import numpy.typing as npt
from typing import Final

In [2]:
# Load the milk table and swap its headers (which contain typos) for clean names
# in a single expression; a column-count mismatch still raises ValueError.
milkDF: pd.DataFrame = pd.read_csv("milk.csv").set_axis(
    ["pH", "Temperature", "Taste", "Odor", "Fat", "Turbidity", "Color", "Grade"],
    axis="columns",
)

In [3]:
# Preview the first rows to check the renamed column headers.
milkDF.head()

Unnamed: 0,pH,Temperature,Taste,Odor,Fat,Turbidity,Color,Grade
0,6.6,35,1,0,1,0,254,high
1,6.6,36,0,1,0,1,253,high
2,8.5,70,1,1,1,1,246,low
3,9.5,34,1,1,0,1,255,low
4,6.6,37,0,0,0,0,255,medium


In [4]:
# Column the classifiers predict; every other column is a candidate feature.
TARGET_COLUMN_NAME: Final[str] = "Grade"
TARGET_SERIES: Final["pd.Series[str]"] = milkDF[TARGET_COLUMN_NAME].astype(str)

features: list[str] = [name for name in milkDF.columns if name != TARGET_COLUMN_NAME]

# Every non-empty subset of the features, ordered by subset size (smallest first)
# and, within a size, in the order itertools.combinations yields them.
featureCombinations: list[list[str]] = [
    list(subset)
    for subsetSize in range(1, len(features) + 1)
    for subset in combinations(features, subsetSize)
]

In [5]:
# Inclusive bounds of the neighbor counts (k) that trainModel tries for each feature set.
NEIGHBOR_COUNT_MIN: Final[int] = 1
NEIGHBOR_COUNT_MAX: Final[int] = 40

def trainModel(columns: list[str], randomState: int = 42) -> tuple[KNeighborsClassifier, int, np.float64]:
    """Fit one KNN classifier per neighbor count on the given columns and return the most accurate.

    Args:
        columns: Names of the milkDF columns to use as features; must be non-empty.
        randomState: Seed forwarded to train_test_split. Keeping it fixed makes the
            notebook reproducible and, because the split depends only on the seed and
            the row count, gives every feature subset the same train/test rows so
            their accuracies are directly comparable.

    Returns:
        A tuple of (best fitted classifier, its neighbor count, its accuracy on the
        held-out split). Ties are resolved in favor of the smallest neighbor count.

    Raises:
        ValueError: If columns is empty.

    Note:
        The neighbor count is selected using the same held-out split whose accuracy
        is returned, so the returned accuracy is an optimistic estimate.
    """
    if not columns:
        raise ValueError("trainModel requires at least one feature column")

    featureDF: pd.DataFrame = milkDF[columns].astype(float)

    trainX: pd.DataFrame
    testX: pd.DataFrame
    trainY: "pd.Series[str]"
    testY: "pd.Series[str]"
    trainX, testX, trainY, testY = train_test_split(featureDF, TARGET_SERIES, random_state=randomState)

    models: list[KNeighborsClassifier] = []
    modelAccuracies: npt.NDArray[np.float64] = np.zeros(NEIGHBOR_COUNT_MAX - NEIGHBOR_COUNT_MIN + 1, np.float64)

    for thisNeighborCount in range(NEIGHBOR_COUNT_MIN, NEIGHBOR_COUNT_MAX + 1):
        thisModel = KNeighborsClassifier(thisNeighborCount).fit(trainX, trainY)
        models.append(thisModel)
        # Slot 0 corresponds to NEIGHBOR_COUNT_MIN, hence the offset.
        modelAccuracies[thisNeighborCount - NEIGHBOR_COUNT_MIN] = np.float64(
            metrics.accuracy_score(testY, thisModel.predict(testX))
        )

    # argmax returns the first maximum, i.e. the smallest neighbor count on ties.
    bestModelIndex: int = int(modelAccuracies.argmax())
    bestModel: KNeighborsClassifier = models[bestModelIndex]
    return (bestModel, bestModelIndex + NEIGHBOR_COUNT_MIN, modelAccuracies[bestModelIndex])

In [6]:
# Tune one classifier per feature subset, remembering which columns and neighbor
# count produced each model alongside its accuracy.
allModels: list[tuple[list[str], KNeighborsClassifier, int]] = []
accuracyLog: list[np.float64] = []

for candidateColumns in featureCombinations:
    fittedModel, chosenNeighborCount, heldOutAccuracy = trainModel(candidateColumns)
    allModels.append((candidateColumns, fittedModel, chosenNeighborCount))
    accuracyLog.append(heldOutAccuracy)

# Position i holds the accuracy of allModels[i].
allModelAccuracies: npt.NDArray[np.float64] = np.array(accuracyLog, dtype=np.float64)

# Pick the overall winner; the first maximum wins on ties.
finalBestModelIndex: int = int(np.argmax(allModelAccuracies))
finalBestModelInfo: tuple[list[str], KNeighborsClassifier, int] = allModels[finalBestModelIndex]

finalBestModelColumns: list[str]
finalBestModel: KNeighborsClassifier
finalBestModelNeighborCount: int
finalBestModelColumns, finalBestModel, finalBestModelNeighborCount = finalBestModelInfo

finalBestModelAccuracy: np.float64 = allModelAccuracies[finalBestModelIndex]

In [7]:
# Report the winning feature subset, its neighbor count, and its accuracy,
# one item per line.
columnListing = ", ".join(finalBestModelColumns)

print(
    "Most accurate model:",
    "Columns: " + columnListing,
    f"K: {finalBestModelNeighborCount}",
    f"Accuracy: {finalBestModelAccuracy}",
    sep="\n",
)

Most accurate model:
Columns: pH, Temperature, Taste, Odor, Fat, Turbidity
K: 1
Accuracy: 1.0
