# In [38]:
import numpy as np
import pandas as pd

# Read only the columns the classifier uses: three feature columns plus the
# 'Air Quality' label column.
training = pd.read_csv(
    "Training_set.csv",
    usecols=['SO2', 'CO', 'Proximity_to_Industrial_Areas', 'Air Quality'],
)
# Feature matrix: one row per training sample, columns in the order listed.
training_array = training.loc[:, ['SO2', 'CO', 'Proximity_to_Industrial_Areas']].to_numpy()
# Labels, aligned row-for-row with training_array.
category_array = training.loc[:, 'Air Quality'].to_numpy()

# Minkowski distance function (p=3)
def minkowski_distance(x, y, p=3):
    """Return the Minkowski distance of order ``p`` between ``x`` and ``y``.

    Computes ``(sum(|x[i] - y[i]| ** p)) ** (1 / p)``.

    Args:
        x: Sequence of numeric coordinates.
        y: Sequence of numeric coordinates, the same length as ``x``.
        p: Order of the distance (default 3). Must be non-zero, because
            the result is raised to the power ``1 / p``.

    Returns:
        The distance between the two points; 0 for two empty sequences.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length.
    """
    # Reject mismatched points up front: iterating over one side only would
    # silently ignore the extra coordinates of the longer one.
    if len(x) != len(y):
        raise ValueError(
            "x and y must have the same length, got %d and %d"
            % (len(x), len(y))
        )
    # Use the builtin sum() rather than a local variable that shadows it.
    total = sum(abs(a - b) ** p for a, b in zip(x, y))
    return total ** (1 / p)

# Function to categorize a point
def categorize(point):
    """Look up the category of a point that appears in the training data.

    Scans ``training_array`` from the top and returns the entry of
    ``category_array`` at the index of the first row that
    ``np.array_equal`` reports as equal to ``point``. Returns ``None``
    when no row matches.
    """
    matches = (
        category_array[idx]
        for idx, row in enumerate(training_array)
        if np.array_equal(row, point)
    )
    # The generator is lazy, so the scan stops at the first matching row.
    return next(matches, None)

# K-Nearest Neighbors (KNN) function
def KNN(trainSet, point, k, labels=None, p=3):
    """Classify ``point`` by majority vote among its ``k`` nearest rows.

    Distances are computed with ``minkowski_distance`` of order ``p``.

    Args:
        trainSet: Sequence of feature rows, each the same length as ``point``.
        point: Feature values of the sample to classify.
        k: Number of nearest neighbours that vote; must be at least 1.
            If ``k`` exceeds the number of rows, every row votes.
        labels: Categories for the rows of ``trainSet``; ``labels[i]``
            belongs to ``trainSet[i]``. Trailing labels beyond the length of
            ``trainSet`` are ignored. Defaults to the module-level
            ``category_array``.
        p: Order of the Minkowski distance (default 3).

    Returns:
        The category with the most votes among the ``k`` nearest rows. Ties
        go to the first tied category in the order 'Good', 'Moderate',
        'Poor', 'Hazardous', then any other category in the order it was
        first seen among the neighbours.

    Raises:
        ValueError: If ``k`` is less than 1, ``trainSet`` is empty, or
            ``labels`` has fewer entries than ``trainSet``.
    """
    # Validate k first: with k == 0 nothing votes and the all-zero tally
    # would answer 'Good'; a negative k would slice rows off the far end.
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))
    if labels is None:
        labels = category_array
    if len(trainSet) == 0:
        raise ValueError("trainSet must contain at least one row")
    if len(labels) < len(trainSet):
        raise ValueError(
            "labels has %d entries but trainSet has %d rows"
            % (len(labels), len(trainSet))
        )

    # Pair each row's distance with its label. Sorting the (distance, label)
    # tuples orders by distance and breaks exact distance ties by label.
    neighbours = sorted(
        (minkowski_distance(row, point, p), label)
        for row, label in zip(trainSet, labels)
    )[:k]

    # Seed the tally with the known categories so their order decides ties;
    # .get() lets a label outside this set be counted instead of raising
    # KeyError.
    counts = {'Good': 0, 'Moderate': 0, 'Poor': 0, 'Hazardous': 0}
    # The distance is not needed here, so it is bound to _ and ignored.
    for _, category in neighbours:
        counts[category] = counts.get(category, 0) + 1

    # max() with key=counts.get returns the category with the highest
    # count; on ties it returns the first such key in dict order.
    return max(counts, key=counts.get)

# Sample to classify, given in the same column order as training_array
# (SO2, CO, Proximity_to_Industrial_Areas), and the number of neighbours
# that vote on its category.
point, k = [12.8, 2.09, 2.7], 10

# Run the classifier against the full training set and report the verdict.
result = KNN(training_array, point, k)
print("Predicted Air Quality:", result)


# Output: Predicted Air Quality: Hazardous
