# Milk
Dataset obtained from https://www.kaggle.com/datasets/cpluzshrijayan/milkquality?select=milknew.csv

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

from itertools import combinations

import numpy.typing as npt
from typing import Final

In [2]:
milkDF: pd.DataFrame = pd.read_csv("milk.csv")

# Fix typos in column headers
milkDF.columns = ["pH", "Temperature", "Taste", "Odor", "Fat", "Turbidity", "Color", "Grade"]

In [3]:
milkDF.head()

Unnamed: 0,pH,Temperature,Taste,Odor,Fat,Turbidity,Color,Grade
0,6.6,35,1,0,1,0,254,high
1,6.6,36,0,1,0,1,253,high
2,8.5,70,1,1,1,1,246,low
3,9.5,34,1,1,0,1,255,low
4,6.6,37,0,0,0,0,255,medium


In [4]:
# Determine features (will test every possible combination) and target

TARGET_COLUMN_NAME: Final[str] = "Grade"
TARGET_SERIES: Final["pd.Series[str]"] = milkDF[TARGET_COLUMN_NAME].astype(str)

features: list[str] = [columnName for columnName in milkDF.columns if columnName != TARGET_COLUMN_NAME]
featureCombinations: list[list[str]] = []
for thisCombinationList in [list(combinations(features, featureCount)) for featureCount in range(1, len(features) + 1)]:
    featureCombinations.extend([list(thisCombination) for thisCombination in thisCombinationList])

In [5]:
NEIGHBOR_COUNT_MIN: Final[int] = 1
NEIGHBOR_COUNT_MAX: Final[int] = 40

def trainModel(features: list[str]) -> tuple[KNeighborsClassifier, int, np.longdouble]:
    """Train a KNN model to predict values of TARGET_SERIES using the given features.

    Tests every possible K from NEIGHBOR_COUNT_MIN to NEIGHBOR_COUNT_MAX, inclusive.
    Returns a tuple containing the best model, the K used to obtain that model,
    and its accuracy from sklearn.metrics.accuracy_score (0.0 to 1.0, inclusive).
    """

    featureDF: pd.DataFrame = milkDF[features].astype(float)

    trainX: pd.DataFrame
    testX: pd.DataFrame
    trainY: pd.DataFrame
    testY: pd.DataFrame
    trainX, testX, trainY, testY = train_test_split(featureDF, TARGET_SERIES)

    models: list[KNeighborsClassifier] = []
    modelAccuracies: npt.NDArray[np.longdouble] = np.zeros(NEIGHBOR_COUNT_MAX - NEIGHBOR_COUNT_MIN + 1, np.longdouble)

    for thisNeighborCount in range(NEIGHBOR_COUNT_MIN, NEIGHBOR_COUNT_MAX + 1):
        thisModel = KNeighborsClassifier(thisNeighborCount).fit(trainX, trainY)
        models.append(thisModel)
        modelAccuracies[thisNeighborCount - NEIGHBOR_COUNT_MIN] = np.longdouble(metrics.accuracy_score(testY, thisModel.predict(testX)))
    
    bestModelIndex: int = int(modelAccuracies.argmax())
    bestModel: KNeighborsClassifier = models[bestModelIndex]
    return (bestModel, bestModelIndex + NEIGHBOR_COUNT_MIN, modelAccuracies[bestModelIndex])

In [6]:
# Enumerate through and train with all combinations of features, adding each model to allModels
# (and each model's accuracy to allModelAccuracies)

allModels: list[tuple[list[str], KNeighborsClassifier, int]] = []
allModelAccuracies: npt.NDArray[np.longdouble] = np.zeros(len(featureCombinations), np.longdouble)

for thisFeatureCombinationIndex, thisFeatureCombination in enumerate(featureCombinations):
    model: KNeighborsClassifier
    neighborCount: int
    accuracy: np.longdouble
    model, neighborCount, accuracy = trainModel(thisFeatureCombination)
    allModels.append((thisFeatureCombination, model, neighborCount))
    allModelAccuracies[thisFeatureCombinationIndex] = accuracy

In [7]:
# Determine the best possible combination of features for predicting TARGET_SERIES
# by finding the maximum of the combinations' highest achievable accuracies

highestAccuracy: np.longdouble = allModelAccuracies.max()
finalBestModelIndices: list[int] = np.argwhere(allModelAccuracies == highestAccuracy).flatten().tolist()

In [8]:
def printModelInfo(modelIndex: int) -> None:
    """Print info for a model, given its index in allModels."""
    modelInfo: tuple[list[str], KNeighborsClassifier, int] = allModels[modelIndex]
    modelFeatures: list[str] = modelInfo[0]
    # TEMP
    # finalBestModel: KNeighborsClassifier = finalBestModelInfo[1]
    modelNeighborCount: int = modelInfo[2]
    modelAccuracy: np.longdouble = allModelAccuracies[modelIndex]

    print(f"Features: {modelFeatures[0]}", end="")
    for thisFeature in modelFeatures[1:]:
        print(", " + thisFeature, end="")
    print()

    print(f"\tK: {modelNeighborCount}")
    print(f"\tAccuracy: ~{modelAccuracy}")

print("Most accurate model(s):", end="\n\n")

for thisModelIndex in finalBestModelIndices:
    printModelInfo(thisModelIndex)

Most accurate model(s):

Features: pH, Temperature, Taste, Odor, Turbidity
	K: 1
	Accuracy: ~1.0
Features: pH, Temperature, Taste, Fat, Turbidity
	K: 1
	Accuracy: ~1.0
Features: pH, Temperature, Taste, Odor, Fat, Color
	K: 1
	Accuracy: ~1.0
Features: pH, Temperature, Taste, Fat, Turbidity, Color
	K: 1
	Accuracy: ~1.0


In [9]:
# Create a DataFrame to represent all feature combinations and their corresponding highest accuracies
# (to explore the models a bit more, as below)

modelDict: dict[str, list[np.longdouble | int | list[str]]] = {"Features": [modelInfo[0] for modelInfo in allModels],
                                                               "K": [modelInfo[2] for modelInfo in allModels],
                                                               "Accuracy": [thisAccuracy for thisAccuracy in allModelAccuracies]}

modelDF: pd.DataFrame = pd.DataFrame(modelDict).set_index("Features").sort_values("Accuracy", ascending=False)

In [10]:
print("20 best models:")
modelDF[:20]

20 best models:


Unnamed: 0_level_0,K,Accuracy
Features,Unnamed: 1_level_1,Unnamed: 2_level_1
"[pH, Temperature, Taste, Fat, Turbidity]",1,1.0
"[pH, Temperature, Taste, Fat, Turbidity, Color]",1,1.0
"[pH, Temperature, Taste, Odor, Fat, Color]",1,1.0
"[pH, Temperature, Taste, Odor, Turbidity]",1,1.0
"[pH, Temperature, Taste, Odor, Fat]",1,0.996226
"[pH, Temperature, Taste, Odor, Fat, Turbidity]",1,0.996226
"[pH, Temperature, Fat, Turbidity]",13,0.992453
"[pH, Temperature, Taste, Odor, Color]",1,0.992453
"[pH, Temperature, Taste, Odor, Fat, Turbidity, Color]",1,0.988679
"[pH, Temperature, Odor, Turbidity, Color]",1,0.988679


In [11]:
print("20 worst models:")
modelDF[-20:]

20 worst models:


Unnamed: 0_level_0,K,Accuracy
Features,Unnamed: 1_level_1,Unnamed: 2_level_1
[Temperature],18,0.630189
"[Taste, Odor, Fat, Turbidity]",1,0.618868
"[Taste, Fat, Turbidity]",3,0.607547
"[Fat, Turbidity]",11,0.603774
[Turbidity],9,0.603774
"[Odor, Color]",10,0.603774
"[Taste, Odor, Turbidity]",18,0.603774
"[Fat, Color]",2,0.603774
"[Taste, Turbidity]",14,0.603774
"[Taste, Color]",11,0.6


In [12]:
MIN_ACCURACY_BOUND: Final[np.longdouble] = np.longdouble(0.95)

print(f"Models with accuracy ≥ {MIN_ACCURACY_BOUND}:")
modelDF[modelDF["Accuracy"] >= MIN_ACCURACY_BOUND]

Models with accuracy ≥ 0.95:


Unnamed: 0_level_0,K,Accuracy
Features,Unnamed: 1_level_1,Unnamed: 2_level_1
"[pH, Temperature, Taste, Fat, Turbidity]",1,1.0
"[pH, Temperature, Taste, Fat, Turbidity, Color]",1,1.0
"[pH, Temperature, Taste, Odor, Fat, Color]",1,1.0
"[pH, Temperature, Taste, Odor, Turbidity]",1,1.0
"[pH, Temperature, Taste, Odor, Fat]",1,0.996226
"[pH, Temperature, Taste, Odor, Fat, Turbidity]",1,0.996226
"[pH, Temperature, Fat, Turbidity]",13,0.992453
"[pH, Temperature, Taste, Odor, Color]",1,0.992453
"[pH, Temperature, Taste, Odor, Fat, Turbidity, Color]",1,0.988679
"[pH, Temperature, Odor, Turbidity, Color]",1,0.988679
