In [212]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Data Preprocessing

In [213]:
data = pd.read_csv("fruit_data_with_colors _1_.csv")

**Checking Null Values and imputing numerical values with column's mean**


In [214]:
data.isnull().any()

fruit_label      False
fruit_name       False
fruit_subtype    False
mass              True
width            False
height            True
color_score      False
dtype: bool

In [215]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
data[["mass"]] = imputer.fit_transform(data[["mass"]])
data[["height"]] = imputer.fit_transform(data[["height"]])

**Imputing the "Unknown" values in the Fruit Subtype column
based on the Fruit type / Fruit name**

In [216]:
# Conditional imputation of unknown enteries, with their respective fruit name
def impute_unknown(data):
    for fruit in data['fruit_name'].unique():
        mode_value = data[data['fruit_name'] == fruit]['fruit_subtype'].mode()[0]
        data.loc[(data['fruit_name'] == fruit) & (data['fruit_subtype'] == 'unknown'), 'fruit_subtype'] = mode_value
    return data

data = impute_unknown(data)

**Encoding Categorial Data using One Hot Encoder**

In [217]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

cl = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2])], remainder='passthrough')
data = np.array(cl.fit_transform(data))

**Moving the Fruit Label column to the end**

In [218]:
columnToMove = 12  # 13th column

# Get all columns except the one to be moved
columnsExceptMoving = np.concatenate((data[:, :columnToMove], data[:, columnToMove + 1:]), axis=1)

# Get the column to be moved
columnMoving = data[:, columnToMove].reshape(-1, 1)

# Concatenate columns_except_moving and column_moving to rearrange the columns
dataReordered = np.concatenate((columnsExceptMoving, columnMoving), axis=1)
dataReordered

array([[1.  , 0.  , 0.  , ..., 7.3 , 0.55, 1.  ],
       [1.  , 0.  , 0.  , ..., 6.8 , 0.59, 1.  ],
       [1.  , 0.  , 0.  , ..., 7.2 , 0.6 , 1.  ],
       ...,
       [0.  , 1.  , 0.  , ..., 8.1 , 0.73, 4.  ],
       [0.  , 1.  , 0.  , ..., 8.5 , 0.72, 4.  ],
       [0.  , 1.  , 0.  , ..., 8.1 , 0.7 , 4.  ]])

**Dividing the data into Dependent and Independent Variables.**

In [219]:
X = dataReordered[:, :-1]
y = dataReordered[:, -1]

**Splitting Data into Training and Testing set**

In [220]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=1)

**Standardization of the Training and Testing set respectively exluding the One Hot Encoded values**

In [221]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train[:,12:] = sc.fit_transform(X_train[:,12:])
X_test[:,12:] = sc.fit_transform(X_test[:,12:])

# K-Nearest Neighbours

In [224]:
class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = []
        for x in X:
            distances = [np.linalg.norm(x - x_train) for x_train in self.X_train]
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = [self.y_train[i] for i in k_indices]
            most_common = max(set(k_nearest_labels), key=k_nearest_labels.count)
            predictions.append(most_common)
        return predictions

**Function to get Accuracy of KNN with any number of k**

In [225]:
from sklearn.metrics import confusion_matrix, accuracy_score

def getAccuracy(n, X_train, X_test, y_train, y_test):
  classifier = KNN(k=n)
  classifier.fit(X_train, y_train)
  y_pred = classifier.predict(X_test)
  acc = accuracy_score(y_test, y_pred)
  return acc

**Testing Accuracy of KNN using different values of k**

In [226]:
candidate_neighbors = [1,3,5,7,9]
score = {nbr: getAccuracy(nbr, X_train, X_test, y_train, y_test) for nbr in candidate_neighbors}
print(score)
bestNeighVal = max(score, key=score.get)
print(bestNeighVal)

{1: 0.9444444444444444, 3: 0.9444444444444444, 5: 0.9444444444444444, 7: 0.9444444444444444, 9: 0.8888888888888888}
1


From our testing above. Using the value of k which yielded the best accuracy

In [227]:
classifier = KNN(k=bestNeighVal)
classifier.fit(X_train, y_train)

In [228]:
y_pred = classifier.predict(X_test)

# Prediction

In [229]:
predictions = pd.DataFrame({"Actual Values": y_test, "Predicted Values": y_pred})

In [230]:
predictions

Unnamed: 0,Actual Values,Predicted Values
0,3.0,3.0
1,3.0,1.0
2,1.0,1.0
3,4.0,4.0
4,4.0,4.0
5,4.0,4.0
6,3.0,3.0
7,3.0,3.0
8,3.0,3.0
9,3.0,3.0


In [231]:
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

print(f"Confusion Matrix: \n{cm}\n")
print(f"Accuracy: {round((acc * 100), 2)}%\n")

Confusion Matrix: 
[[2 0 0 0]
 [0 1 0 0]
 [1 0 7 0]
 [0 0 0 7]]

Accuracy: 94.44%

