## Importing Essential Libraries

In [259]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

## Loading the Student Performance dataset from the UCI repository

In [260]:
from ucimlrepo import fetch_ucirepo

# Download dataset 320 from the UCI ML repository.
student_performance = fetch_ucirepo(id=320)

# The features and targets tables arrive as pandas DataFrames; convert both to
# NumPy arrays in one pass (X <- features, y_raw <- targets).
X, y_raw = (
    table.to_numpy()
    for table in (student_performance.data.features, student_performance.data.targets)
)
# y_raw keeps every target column; the one used as the label is selected in a later cell.

## Converting y to a 1D array

In [261]:
# Use only the third target column (index 2) as the label, flattened to 1-D.
y = np.ravel(y_raw[:, 2])

In [262]:
# Sanity check: shape of a single row of the targets array.
print(np.shape(y_raw[1]))

(3,)


## Encoding the Columns

In [263]:
# from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# le = LabelEncoder()

# # encode the first column (school)
# X[:, 0] = le.fit_transform(X[:, 0])
# X[:, 1] = le.fit_transform(X[:, 1])   # gender

# # encode the 4th column. Rural or Urban
# X[:, 3] = le.fit_transform(X[:, 3])    # address type 
# X[:, 4] = le.fit_transform(X[:, 4])    # family size
# X[:, 5] = le.fit_transform(X[:, 5])    # family cohabitation status

# print("X shape before oneHot ", X.shape)  # Todo: remove this

# # 9th column (mother's job) is nominal
# onehotencoder = OneHotEncoder(categories='auto', sparse_output=False)    # set to false to return ndarry instead of scipy.sparse._csr.csr_matrix
# col_9_encoded = onehotencoder.fit_transform(X[:, 8].reshape(-1, 1))
# print("new dim added: ", col_9_encoded.shape)
# X = np.concatenate((X[:,:8], col_9_encoded, X[:, 9:]), axis=1)  # add/concat the RHS array as a new column(s). Now we have 34cols
# # at this point, col9 at idx8 has extended to indexes 8,9,10,11,12 due to the new encoded indexes
# print(f"X's shape after mjob5: {X.shape}")

# # encoding father's job column. Originally col idx9, now idx13
# col_fjob_encoded = onehotencoder.fit_transform(X[:, 13].reshape(-1, 1))
# print("new dim added: ", col_fjob_encoded.shape)
# X = np.concatenate((X[:,:13], col_fjob_encoded, X[:, 14:]), axis=1)  # add/concat the RHS array as 5 new column(s)
# print(f"X's shape after fjob5: {X.shape}")

# # encoding the reason column
# col_reason_encoded = onehotencoder.fit_transform(X[:, 18].reshape(-1, 1))
# print("new dim added: ", col_reason_encoded.shape)
# X = np.concatenate((X[:,:18], col_reason_encoded, X[:, 19:]), axis=1)  # add/concat the RHS array as 4 new column(s)
# print(f"X's shape after reason4: {X.shape}")

# # encoding the guardian column
# col_guardian_encoded = onehotencoder.fit_transform(X[:, 22].reshape(-1, 1))
# print("new guard cols added: ", col_guardian_encoded.shape)
# X = np.concatenate((X[:,:22], col_guardian_encoded, X[:, 23:]), axis=1)  # add/concat the RHS array as 3 new column(s)
# print(f"X's shape after guardian3: {X.shape}")

# # encoding the remaining binary columns
# for col in range(28, 36):
#     X[:, col] = le.fit_transform(X[:, col]) 

# print(f"X's new shape: {X.shape}")
# print(X[0])

## Temporarily dropping the nominal multi-category columns and label-encoding the binary ones

In [264]:
# Temporarily drop the four nominal multi-category columns at indices 8-11
# (mother's job, father's job, reason, guardian - the columns the disabled
# one-hot-encoding cell above was handling) instead of one-hot encoding them.
#
# X is rebuilt from the untouched features table rather than from X itself, so
# this cell is idempotent: re-running it always yields the same array instead of
# silently dropping four more columns on every execution.
NOMINAL_COLS = slice(8, 12)
X = np.delete(student_performance.data.features.to_numpy(), NOMINAL_COLS, axis=1)

In [265]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# A single encoder instance is reused: every fit_transform call refits it on the
# column at hand, so afterwards `le` only remembers the last column it saw.
le = LabelEncoder()

# Columns ahead of the dropped block: school (0), gender (1),
# address type - rural or urban (3), family size (4), family cohabitation status (5).
for col in (0, 1, 3, 4, 5):
    X[:, col] = le.fit_transform(X[:, col])

print("X shape before others ", X.shape)  # Todo: remove this

# The remaining binary columns sit at indices 11-18.
for col in range(11, 19):
    X[:, col] = le.fit_transform(X[:, col])

print(f"X's new shape: {X.shape}")
print(X[0])

X shape before others  (649, 26)
X's new shape: (649, 26)
[0 0 18 1 0 0 4 4 2 2 0 1 0 0 0 1 1 0 0 4 3 4 1 1 3 4]


In [266]:
# Spot-check one cell of the encoded matrix: row 0, column 23.
X[0][23]

1

In [267]:
# Append the first two target columns to the feature matrix as extra inputs.
# NOTE(review): this cell is not idempotent - each re-run appends two more columns to X.
G1, G2 = y_raw[:, 0:1], y_raw[:, 1:2]  # width-1 slices keep both as (n, 1) column vectors
print(X.shape)
X = np.hstack((X, G1, G2))
print(X.shape)

(649, 26)
(649, 28)


In [268]:
# X[0,28:36] #array(['yes', 'no', 'no', 'no', 'yes', 'yes', 'no', 'no'], dtype=object)
# X[0,35]
# y_raw.shape

IndexError: index 35 is out of bounds for axis 1 with size 28

## Splitting the dataset into the Training and Test sets

In [269]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

## Feature Scaling
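We standardize the features (zero mean, unit variance) so that they are on a comparable scale and no single feature dominates the distance calculation.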

In [270]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training rows only, then apply that same transform to
# both splits so that no statistics are learned from the test rows.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [271]:
# Report the shapes of the unsplit feature matrix and label vector.
print(f"X and y shapes: {X.shape}, {y.shape}")

X and y shapes: (649, 28), (649,)


## Training the K-NN model on the Training set

In [272]:
from sklearn.neighbors import KNeighborsClassifier
# k-NN classifier with k = 4; the Minkowski metric with p = 2 is the Euclidean distance.
classifier = KNeighborsClassifier(
    n_neighbors=4,
    metric='minkowski',
    p=2,
)
classifier.fit(X_train, y_train)

## In-sample prediction: on training data

In [273]:
# Predict on the same (scaled) training rows the classifier was fitted on.
y_pred_ins = classifier.predict(X_train)
# Debug aid (disabled): print predicted and true training labels side by side.
#print(np.concatenate((y_pred_ins.reshape(len(y_pred_ins),1), y_train.reshape(len(y_train),1)),1))

### Let's see the confusion matrix and accuracy score

In [274]:
from sklearn.metrics import confusion_matrix, accuracy_score
# In-sample confusion matrix (rows: true labels, columns: predicted labels),
# followed by the in-sample accuracy as the cell's displayed value.
cm = confusion_matrix(y_true=y_train, y_pred=y_pred_ins)
print(cm)
accuracy_score(y_true=y_train, y_pred=y_pred_ins)

[[10  0  0  0  0  0  0  0  2  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0]
 [ 0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  1  3  0  0  2  0  0  0  0  0  0  0  0]
 [ 0  2  0  0  4 10  0  2  8  1  1  0  0  0  0  0  0]
 [ 1  0  0  1  1  3 15  2  6  0  0  0  0  0  0  0  0]
 [ 2  0  0  0  0  1  5 51  9  1  3  3  0  0  0  0  0]
 [ 2  0  0  0  3  3  2 16 38  6  6  5  0  0  0  0  0]
 [ 0  0  0  0  0  0  2  3 15 21  4  1  0  1  0  0  0]
 [ 0  0  0  0  0  1  0  6  7  8 27  6  2  0  0  0  0]
 [ 0  0  0  0  0  1  0  3  5  5  6 21  5  0  0  0  0]
 [ 0  0  0  0  0  0  0  3  4  4  6  6 14  0  1  0  0]
 [ 0  0  0  0  0  0  0  3  2  5  2  5  2  8  1  0  0]
 [ 0  0  0  0  0  0  0  0  1  0  4  3  2  2  7  1  0]
 [ 0  0  0  0  0  0  0  0  0  0  1  1  2  4  2  2  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  2  0  0  0]]


0.46502057613168724

## Out-of-sample prediction: on test data and confusion matrix

In [275]:
# Predict labels for the held-out test rows.
y_pred_out = classifier.predict(X_test)

In [276]:
# Out-of-sample confusion matrix (rows: true labels, columns: predicted labels),
# followed by the test-set accuracy as the cell's displayed value.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_out)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred_out)

[[ 1  0  0  0  1  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  2  1  0  0  0  0  0  0  0  0]
 [ 2  0  0  2  2  0  0  1  0  0  0  0  0  0]
 [ 0  0  1  1  0  0  1  1  2  0  0  0  0  0]
 [ 1  0  0  3  2  6  5  3  2  0  0  0  0  0]
 [ 1  0  1  4  1  7  4  1  3  1  0  0  0  0]
 [ 0  0  0  2  0  5 10  4  2  1  1  0  0  0]
 [ 0  0  0  0  0  6  6  4  4  3  2  0  0  0]
 [ 0  0  0  0  0  6  3  2  4  0  2  0  0  0]
 [ 0  0  0  0  0  0  3  1  3  0  2  1  1  0]
 [ 0  0  0  0  0  1  1  2  0  1  1  0  2  0]
 [ 0  0  0  0  0  0  0  0  3  2  3  1  0  0]
 [ 0  0  0  0  0  0  0  0  1  1  1  0  0  0]]


0.1411042944785276

# Learning Curve
## Accuracy versus training-set size for a fixed k (k = 7)

In [None]:
from sklearn.model_selection import learning_curve

from sklearn.pipeline import make_pipeline

# Learning curve: accuracy as a function of the number of training examples,
# with k fixed at 7.
#
# learning_curve receives the raw (unscaled) X, so the scaler must be part of
# the estimator. Inside a pipeline it is refitted on the training portion of
# every fold, which keeps k-NN distances on standardized features - consistent
# with the train/test experiment - without leaking validation-fold statistics.
model = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=7, metric='minkowski', p=2),
)
train_sizes, train_scores, test_scores = learning_curve(
    model,
    X,
    y,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 10),
    shuffle=True,
    random_state=0,
)

# Each row holds the per-fold scores for one training size; average over the folds.
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

fig, ax = plt.subplots()
ax.plot(train_sizes, train_scores_mean, label='Training score')
ax.plot(train_sizes, test_scores_mean, label='Cross-validation score')
ax.set_xlabel('Training examples')
ax.set_ylabel('Score')
ax.set_title('Learning Curve')
ax.legend(loc='best')
plt.show()

In [None]:
# print(train_sizes)
# print(train_scores)
# print("Test scores \n", test_scores)


## Experimenting with different values of k

In [None]:
# Sweep k = 1..39 and record in-sample and out-of-sample accuracy for each value.
k_vals = list(range(1, 40))
train_accuracies = []
test_accuracies = []
for k in k_vals:
    # A loop-local estimator keeps the k = 4 `classifier` fitted earlier intact,
    # so the prediction cells above give the same results if re-run after this sweep.
    knn_k = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2)
    knn_k.fit(X_train, y_train)

    # For classifiers, score() is the mean accuracy of predict(X) against y.
    train_accuracies.append(knn_k.score(X_train, y_train))
    test_accuracies.append(knn_k.score(X_test, y_test))


# Accuracy vs k
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_vals, train_accuracies, label='In-Sample Accuracy (Training Set)', marker='o', color='b')
ax.plot(k_vals, test_accuracies, label='Out-of-Sample Accuracy (Test Set)', marker='o', color='r')
ax.set_title('KNN Accuracy for Different Values of k')
ax.set_xlabel('Number of Neighbors (k)')
ax.set_ylabel('Accuracy')
ax.set_xticks(k_vals)
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Dump the raw accuracy lists from the k sweep: training first, then test.
print(train_accuracies, test_accuracies, sep="\n")