## Importing Essential Libraries

In [132]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

## Loading the rice dataset from uci repo

In [144]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
rice_cammeo_and_osmancik = fetch_ucirepo(id=320)

# data (as pandas dataframes), let's convert to numpy
X = rice_cammeo_and_osmancik.data.features.to_numpy()
y_raw = rice_cammeo_and_osmancik.data.targets.to_numpy()

In [145]:
X[0]

array(['GP', 'F', 18, 'U', 'GT3', 'A', 4, 4, 'at_home', 'teacher',
       'course', 'mother', 2, 2, 0, 'yes', 'no', 'no', 'no', 'yes', 'yes',
       'no', 'no', 4, 3, 4, 1, 1, 3, 4], dtype=object)

In [146]:
print(X.shape)

(649, 30)


## Encoding the Columns

In [147]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()

# encode the first column (school)
X[:, 0] = le.fit_transform(X[:, 0])

# encode the 4th column. Rural or Urban
X[:, 3] = le.fit_transform(X[:, 3])    # address type 
X[:, 4] = le.fit_transform(X[:, 4])    # family size
X[:, 5] = le.fit_transform(X[:, 5])    # family cohabitation status

print("X shape before oneHot ", X.shape)  # Todo: remove this

# 9th column (mother's job) is nominal
onehotencoder = OneHotEncoder(categories='auto', sparse_output=False)    # set to false to return ndarry instead of scipy.sparse._csr.csr_matrix
col_9_encoded = onehotencoder.fit_transform(X[:, 8].reshape(-1, 1))
print("new dim added: ", col_9_encoded.shape)
X = np.concatenate((X[:,:8], col_9_encoded, X[:, 9:]), axis=1)  # add/concat the RHS array as a new column(s). Now we have 34cols
# at this point, col9 at idx8 has extended to indexes 8,9,10,11,12 due to the new encoded indexes
print(f"X's shape after mjob5: {X.shape}")

# encoding father's job column. Originally col idx9, now idx13
col_fjob_encoded = onehotencoder.fit_transform(X[:, 13].reshape(-1, 1))
print("new dim added: ", col_fjob_encoded.shape)
X = np.concatenate((X[:,:13], col_fjob_encoded, X[:, 14:]), axis=1)  # add/concat the RHS array as 5 new column(s)
print(f"X's shape after fjob5: {X.shape}")

# encoding the reason column
col_reason_encoded = onehotencoder.fit_transform(X[:, 18].reshape(-1, 1))
print("new dim added: ", col_reason_encoded.shape)
X = np.concatenate((X[:,:18], col_reason_encoded, X[:, 19:]), axis=1)  # add/concat the RHS array as 4 new column(s)
print(f"X's shape after reason4: {X.shape}")

# encoding the guardian column
col_guardian_encoded = onehotencoder.fit_transform(X[:, 22].reshape(-1, 1))
print("new guard cols added: ", col_guardian_encoded.shape)
X = np.concatenate((X[:,:22], col_guardian_encoded, X[:, 23:]), axis=1)  # add/concat the RHS array as 3 new column(s)
print(f"X's shape after guardian3: {X.shape}")

# encoding the remaining binary columns
for col in range(28, 36):
    X[:, col] = le.fit_transform(X[:, col]) 

print(f"X's new shape: {X.shape}")
print(X[0])

X shape before oneHot  (649, 30)
new dim added:  (649, 5)
X's shape after mjob5: (649, 34)
new dim added:  (649, 5)
X's shape after fjob5: (649, 38)
new dim added:  (649, 4)
X's shape after reason4: (649, 41)
new guard cols added:  (649, 3)
X's shape after guardian3: (649, 43)
X's new shape: (649, 43)
[0 'F' 18 1 0 0 4 4 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0
 0.0 0.0 1.0 0.0 2 2 0 1 0 0 0 1 1 0 0 4 3 4 1 1 3 4]


In [148]:
X[0,28:36] #array(['yes', 'no', 'no', 'no', 'yes', 'yes', 'no', 'no'], dtype=object)
X[0,35]

0

### Converting y to a 1D array

In [109]:
y = y.ravel()

NameError: name 'y' is not defined

## Splitting the dataset into the Training and Test sets

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

## Feature Scaling
we scale the features so they're in the same range

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
print(f"X and y shapes: {X.shape}, {y.shape}")

## Training the K-NN model on the Training set

In [None]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 4, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

## Insample Prediction: on training data

In [None]:
y_pred_ins = classifier.predict(X_train)
#print(np.concatenate((y_pred_ins.reshape(len(y_pred_ins),1), y_train.reshape(len(y_train),1)),1))

### Let's see the confusion matrix and accuracy score

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_ins)
print(cm)
accuracy_score(y_train, y_pred_ins)

## Insample Prediction: on training data and confusion matrix

In [None]:
y_pred_out = classifier.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_pred_out)
print(cm)
accuracy_score(y_test, y_pred_out)

# Learning Curve
## Experiment to run with different values of k and observe the accuracy.

In [None]:
from sklearn.model_selection import learning_curve

model = KNeighborsClassifier(n_neighbors = 7, metric = 'minkowski', p = 2)
train_sizes, train_scores, test_scores = learning_curve(model, X, y, cv=5, scoring='accuracy', n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10), random_state=0, shuffle=True)

train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

plt.plot(train_sizes, train_scores_mean, label='Training score')
plt.plot(train_sizes, test_scores_mean, label='Cross-validation score')
plt.xlabel('Training examples')
plt.ylabel('Score')
plt.legend(loc='best')
plt.title('Learning Curve')
plt.show()

In [None]:
# print(train_sizes)
# print(train_scores)
# print("Test scores \n", test_scores)


## Experimenting with different values of k

In [None]:
k_vals = [i for i in range(1,40)]
train_accuracies = []
test_accuracies = []
for k in k_vals:
    classifier = KNeighborsClassifier(n_neighbors = k, metric = 'minkowski', p = 2)
    classifier.fit(X_train, y_train)

    y_pred_test = classifier.predict(X_test)
    y_pred_train = classifier.predict(X_train)
    
    train_accuracies.append(accuracy_score(y_train, y_pred_train))
    test_accuracies.append(accuracy_score(y_test, y_pred_test))


# now let's plot it - Accuracy vs k
plt.figure(figsize=(10, 6))
plt.plot(k_vals, train_accuracies, label='In-Sample Accuracy (Training Set)', marker='o', color='b')
plt.plot(k_vals, test_accuracies, label='Out-of-Sample Accuracy (Test Set)', marker='o', color='r')
plt.title('KNN Accuracy for Different Values of k')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Accuracy')
plt.xticks(k_vals)
plt.legend()
plt.grid(True)
plt.show()

In [None]:
print(train_accuracies)
print(test_accuracies)