## 20 NG

In [1]:
from sklearn.datasets import fetch_openml,fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

import pandas as pd


In [2]:
def split_data(X, y, test_size=0.2, val_size=0.1, random_state=42):
    """Split a dataset into stratified train, test and validation subsets.

    The split is done in two stages. First ``test_size`` of the samples is
    held out from training. That held-out portion is then split again:
    ``val_size`` of it becomes the validation set and the rest the test set.
    Both stages are stratified on the labels.

    Parameters
    ----------
    X : array-like
        Samples to split.
    y : array-like
        Labels aligned with ``X``; also used for stratification.
    test_size : float, default=0.2
        Forwarded as ``test_size`` to the first ``train_test_split`` call,
        i.e. the share of the data that is NOT used for training.
    val_size : float, default=0.1
        Forwarded as ``test_size`` to the second call, so it is relative to
        the held-out portion, not to the full dataset.
    random_state : int or None, default=42
        Seed forwarded to both ``train_test_split`` calls. The default keeps
        the previously hard-coded value.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_val, y_train, y_test, y_val)`` — note that the
        test split comes before the validation split.
    """
    # Stage 1: separate the training data from everything that is held out.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    # Stage 2: divide the held-out data into test and validation parts.
    X_test, X_val, y_test, y_val = train_test_split(
        X_temp, y_temp, test_size=val_size, random_state=random_state, stratify=y_temp
    )
    return X_train, X_test, X_val, y_train, y_test, y_val

In [3]:
# Load the 20 newsgroups corpus (subset='all').
newsgroups = fetch_20newsgroups(subset='all')

# Hold out 20% of the posts; 30% of that held-out part becomes the validation
# set and the remainder the test set (the split is therefore not even).
X_train, X_test, X_val, y_train, y_test, y_val = split_data(
    newsgroups.data,
    newsgroups.target,
    test_size=0.20,
    val_size=0.3,
)

# Learn the vocabulary from the training posts only, then reuse it to turn
# the other two splits into sparse count matrices.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts, X_val_counts = (
    count_vect.transform(posts) for posts in (X_test, X_val)
)

# Turn raw counts into term frequencies (idf weighting is switched off).
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf, X_test_tf, X_val_tf = (
    tf_transformer.transform(counts)
    for counts in (X_train_counts, X_test_counts, X_val_counts)
)

In [4]:
# Pairwise cosine similarity between all training documents, computed once so
# it can be passed to NaiveKNN.fit() in a later cell.
# NOTE(review): presumably a dense (n_train x n_train) array — confirm there is
# enough memory for it before re-running.
cosine_sim_matrix = cosine_similarity(X_train_tf)


In [5]:
# Pairwise Euclidean distances between all training documents.
# NOTE(review): no other visible cell reads euclidean_dist_matrix — confirm it
# is still needed.
euclidean_dist_matrix = euclidean_distances(X_train_tf)


## Problem 4: Implementation of the KNN algorithm using a distance matrix on 20 NG

In [6]:
import numpy as np
class NaiveKNN:
    """Brute-force k-nearest-neighbour classifier based on cosine similarity.

    Each query row is compared with every training row; the ``k`` most
    similar training rows vote and the most frequent label wins. When several
    labels are tied, the smallest label (in sort order) is returned.

    Parameters
    ----------
    k : int, default=3
        Number of neighbours that vote. Must be at least 1.
    batch_size : int, default=256
        Number of query rows whose similarities are computed in one call.
        Larger values are faster but need a ``batch_size x n_train`` dense
        array in memory.
    """

    def __init__(self, k=3, batch_size=256):
        if k < 1:
            raise ValueError("k must be at least 1")
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        self.k = k
        self.batch_size = batch_size

    def fit(self, X, y, sim_matrix=None):
        """Store the training data.

        Parameters
        ----------
        X : 2-D array or sparse matrix
            Training features, one row per sample.
        y : array-like
            Training labels aligned with the rows of ``X``. Any sortable label
            type is accepted (integers, strings, ...).
        sim_matrix : 2-D array or sparse matrix, optional
            Pre-computed train-vs-train cosine similarities. It is used only
            when ``predict`` is called with the very same ``X`` object and its
            shape is ``(n_train, n_train)``; otherwise similarities are
            computed on demand. When omitted, nothing is pre-computed, because
            a full ``n_train x n_train`` matrix is not needed to classify new
            points.
        """
        self.X_train = X
        # Convert to an array so neighbours are looked up by position; a
        # pandas Series with a shuffled index would be looked up by label.
        self.y_train = np.asarray(y)
        self.cosine_sim_matrix = sim_matrix

    def _similarity_rows(self, X, start, stop):
        """Return dense similarities of query rows ``start:stop`` to all training rows."""
        precomputed = self.cosine_sim_matrix
        n_train = self.X_train.shape[0]
        if (
            X is self.X_train
            and precomputed is not None
            and getattr(precomputed, "shape", None) == (n_train, n_train)
        ):
            rows = precomputed[start:stop]
            return rows.toarray() if hasattr(rows, "toarray") else np.asarray(rows)
        return cosine_similarity(X[start:stop], self.X_train)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Returns
        -------
        numpy.ndarray
            One predicted label per query row, with the same dtype as the
            training labels (so string labels are supported).
        """
        num_test = X.shape[0]
        y_pred = np.empty(num_test, dtype=self.y_train.dtype)
        # Work in batches: one similarity call per batch instead of one per row.
        for start in range(0, num_test, self.batch_size):
            stop = min(start + self.batch_size, num_test)
            sims = self._similarity_rows(X, start, stop)
            # Indices of the k most similar training rows for every query row.
            neighbors = np.argsort(-sims, axis=1)[:, :self.k]
            for offset, row_neighbors in enumerate(neighbors):
                labels, counts = np.unique(
                    self.y_train[row_neighbors], return_counts=True
                )
                # np.unique returns sorted labels, so ties go to the smallest.
                y_pred[start + offset] = labels[np.argmax(counts)]
        return y_pred



In [7]:
# Display the training term-frequency matrix; its repr reports shape and sparsity.
X_train_tf

<15076x160467 sparse matrix of type '<class 'numpy.float64'>'
	with 2384157 stored elements in Compressed Sparse Row format>

In [8]:
# Hand-written cosine k-NN with 10 voting neighbours.
knn = NaiveKNN(k=10)

# Store the training split together with the precomputed similarity matrix.
knn.fit(X_train_tf, y_train, cosine_sim_matrix)

# Predict on the training split first, then on the test split.
y_pred_train, y_pred_test = (
    knn.predict(features) for features in (X_train_tf, X_test_tf)
)


In [9]:
# Accuracy on both splits. accuracy_score takes (y_true, y_pred); the value is
# the same either way, but the documented order is used here.
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print(f"Training Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

Training Accuracy: 0.688179888564606
Test Accuracy: 0.5665024630541872


### Use sklearn KNN on 20 NG dataset 

In [10]:

# Baseline: scikit-learn's k-nearest-neighbour classifier with k=10.
knn = KNeighborsClassifier(n_neighbors=10)

# Fit the model to the training data
knn.fit(X_train_tf, y_train)

# Predict on both the training and the test split
y_pred_train = knn.predict(X_train_tf)
y_pred_test = knn.predict(X_test_tf)


# Accuracy on both splits; accuracy_score takes (y_true, y_pred).
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print(f"Training Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")




Training Accuracy: 0.688179888564606
Test Accuracy: 0.5665024630541872


### Cross validation to find the best parameters

In [11]:
from sklearn.model_selection import GridSearchCV


# Cosine-distance k-NN; only n_neighbors is tuned below.
knn = KNeighborsClassifier(metric='cosine', algorithm='brute')

# Candidate neighbourhood sizes: the odd numbers 1, 3, ..., 29.
param_grid = {"n_neighbors": np.arange(1, 31, 2)}

# 5-fold cross-validated grid search, run on the validation split only.
# NOTE(review): the search never sees the training split — confirm that tuning
# on X_val_tf alone is intended.
knn_gscv = GridSearchCV(knn, param_grid, cv=5).fit(X_val_tf, y_val)

# Report the best mean CV score and the n_neighbors value that achieved it.
print(
    "Best parameter (CV score=%0.3f):" % knn_gscv.best_score_,
    knn_gscv.best_params_,
    sep="\n",
)


Best parameter (CV score=0.275):
{'n_neighbors': 1}


## MNIST Dataset

In [12]:
# Fetch the MNIST dataset from OpenML.
# as_frame=True is requested explicitly because the following cells rely on
# pandas objects (X.isna(), X.loc, X.columns, y.to_numpy()).
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=True)

  warn(


In [13]:
# Warn if any pixel value is missing anywhere in the frame.
has_missing_values = X.isna().to_numpy().any()
if has_missing_values:
    print("Data contains missing values")

# Report the dimensions of the features and of the labels.
print(f'The data shape is {X.shape}')
print(f'the label shape is {y.shape}')


The data shape is (70000, 784)
the label shape is (70000,)


In [14]:
# Keep only the pixel columns whose variance is non-zero, i.e. drop the
# columns that hold one constant value across all images.
X = X.loc[:, X.var().ne(0)]

print(X.shape)

(70000, 719)


In [15]:
# Stratified split of MNIST using split_data's defaults (test_size=0.2, val_size=0.1).
# Note the return order: the test split comes before the validation split.
X_train, X_test,X_val, y_train, y_test,y_val = split_data(X, y)

In [16]:
def shift_scale_normalization(X, min_val=None, max_val=None):
    """Min-max scale a DataFrame with one global minimum and maximum.

    Every entry is mapped to ``(x - min_val) / (max_val - min_val)``. The
    minimum and maximum are single scalars taken over the whole frame, not
    per column.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric data to scale.
    min_val, max_val : float, optional
        Statistics to scale with. When omitted they are computed from ``X``
        itself, ignoring NaN. Pass the training set's values to scale another
        split consistently; results may then fall outside ``[0, 1]``.

    Returns
    -------
    pandas.DataFrame
        Scaled data with the same index and columns as ``X``. Missing values
        are returned as 0, and a frame with zero value range is returned as
        all zeros.
    """
    values = X.to_numpy(dtype=float)
    if values.size == 0:
        return pd.DataFrame(values, index=X.index, columns=X.columns)

    # NaN-aware statistics: a single missing entry must not poison the result.
    if min_val is None:
        min_val = np.nanmin(values)
    if max_val is None:
        max_val = np.nanmax(values)

    value_range = max_val - min_val
    if not np.isfinite(value_range) or value_range == 0:
        # Constant (or all-NaN) data: avoid dividing by zero.
        scaled = np.zeros_like(values)
    else:
        scaled = (values - min_val) / value_range

    # Missing inputs are reported as 0 after scaling.
    scaled[np.isnan(scaled)] = 0

    return pd.DataFrame(scaled, index=X.index, columns=X.columns)

def zero_mean_normalization(X, mean_val=None, std_val=None):
    """Standardise a DataFrame with one global mean and standard deviation.

    Every entry is mapped to ``(x - mean_val) / std_val``. The mean and
    standard deviation are single scalars taken over the whole frame, not
    per column.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric data to standardise.
    mean_val, std_val : float, optional
        Statistics to standardise with. When omitted they are computed from
        ``X`` itself, ignoring NaN. Pass the training set's values to
        transform another split consistently.

    Returns
    -------
    pandas.DataFrame
        Standardised data with the same index and columns as ``X``. Missing
        values are returned as 0, and a frame with zero standard deviation is
        returned as all zeros.
    """
    values = X.to_numpy(dtype=float)
    if values.size == 0:
        return pd.DataFrame(values, index=X.index, columns=X.columns)

    # NaN-aware statistics: a single missing entry must not poison the result.
    if mean_val is None:
        mean_val = np.nanmean(values)
    if std_val is None:
        std_val = np.nanstd(values)

    if not np.isfinite(std_val) or std_val == 0:
        # Constant (or all-NaN) data: avoid dividing by zero.
        standardized = np.zeros_like(values)
    else:
        standardized = (values - mean_val) / std_val

    # Missing inputs are reported as 0 after standardisation.
    standardized[np.isnan(standardized)] = 0

    return pd.DataFrame(standardized, index=X.index, columns=X.columns)


In [17]:
# Build min-max scaled and zero-mean versions of the train and test splits.
# NOTE(review): each split is normalised with statistics computed from that
# split itself, not with the training split's — confirm this is intended.
X_train_shift, X_test_shift = (
    shift_scale_normalization(split) for split in (X_train, X_test)
)
X_train_mean, X_test_mean = (
    zero_mean_normalization(split) for split in (X_train, X_test)
)


In [18]:
# Convert the normalised feature frames to CSR sparse matrices.
X_train_shift = csr_matrix(X_train_shift)
X_test_shift = csr_matrix(X_test_shift)
# Fixed: this used to wrap X_test_mean, which silently replaced the zero-mean
# training features with the test features.
X_train_mean = csr_matrix(X_train_mean)
X_test_mean = csr_matrix(X_test_mean)
# np.asarray (rather than .to_numpy()) keeps this cell safe to run twice:
# it also accepts labels that are already NumPy arrays.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)


In [19]:
# Euclidean nearest-neighbour search on the min-max scaled training features.

# Earlier dense pairwise variants, left disabled:
#cosine_similarity_shift = cosine_similarity(X_train_shift)
#euclidean_similarity_shift = euclidean_distances(X_train_shift)

# Index the training rows for Euclidean neighbour queries.
model_knn = NearestNeighbors(metric='euclidean').fit(X_train_shift)

# For every training row: distances to, and indices of, its 10 nearest training rows.
eucledian_distances, eucledian_indices = model_knn.kneighbors(
    X_train_shift,
    n_neighbors=10,
)


## Implement NaiveKNN on MNIST Dataset

In [20]:
# Hand-written k-NN with 10 voting neighbours on the min-max scaled MNIST features.
knn = NaiveKNN(k=10)

# NOTE(review): eucledian_distances holds the 10 nearest-neighbour Euclidean
# distances per training row from the previous cell, while the third fit()
# argument is named sim_matrix — confirm this is the intended input.
knn.fit(X_train_shift, y_train, eucledian_distances)

# Predict on the training split first, then on the test split.
y_pred_train, y_pred_test = (
    knn.predict(features) for features in (X_train_shift, X_test_shift)
)

KeyboardInterrupt: 

In [None]:
# Accuracy on both splits. accuracy_score takes (y_true, y_pred); the value is
# the same either way, but the documented order is used here.
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print(f"Training Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

NameError: name 'accuracy_score' is not defined

In [None]:
# Compute cosine similarity on zero-mean normalization

#cosine_similarity_mean = cosine_similarity(X_train_mean)
#euclidean_similarity_mean = euclidean_distances(X_train_mean)

In [None]:
# Disabled experiment: the block below is a bare string literal, so none of it executes.
# It sketches a Euclidean nearest-neighbour query (n_neighbors=10) between two
# differently normalised feature matrices.
# NOTE(review): X_zero_mean_unit_variance_sparse and X_shift_scale_normalized_sparse
# are not defined in any visible cell.
"""
model_knn = NearestNeighbors(metric='euclidean')
model_knn.fit(X_zero_mean_unit_variance_sparse)

eucledian_distances, eucledian_indices = model_knn.kneighbors(X_shift_scale_normalized_sparse, n_neighbors=10)
"""

## Sklearn kNN ON MNIST

In [None]:

# Baseline: scikit-learn's k-nearest-neighbour classifier with k=10.
knn = KNeighborsClassifier(n_neighbors=10)

knn.fit(X_train_shift, y_train)

# Fixed: predict on the same min-max scaled features the model was fitted on.
# The raw X_train / X_test are on a different scale than the fitted data.
y_train_pred = knn.predict(X_train_shift)
y_test_pred = knn.predict(X_test_shift)

NameError: ignored

In [None]:
# Score the predictions of each split against its true labels.
train_accuracy, test_accuracy = (
    accuracy_score(truth, predicted)
    for truth, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print(f'Training accuracy: {train_accuracy}')
print(f'Testing accuracy: {test_accuracy}')

Training accuracy: 0.9739102483972928
Testing accuracy: 0.9648571428571429
