In [4]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
# import seaborn as sns
# from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import math
from sklearn.metrics import accuracy_score
rng = np.random.default_rng(51)


In [50]:
df = pd.read_csv('letter-recognition.data', header=None)


In [6]:
le = preprocessing.LabelEncoder()
for column_name in df.columns:
    df[column_name] = le.fit_transform(df[column_name])
y = df[0]
X = df.loc[:, df.columns != 0]

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


In [52]:
X_train =  X_train.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()
X_test = X_test.to_numpy()

In [53]:

class ParzenRosenblattClassifier:
    def __init__(self, kernel='gaussian'):
        self.kernel = kernel
        self.h_values = None

    def fit(self, X, y, h_values=None):
        self.X = X
        self.y = y
        self.h_values = h_values

    def leave_one_out(self, h):
        predictions = []
        for i in range(len(self.X)):
            train_X = np.delete(self.X, i, axis=0)
            train_y = np.delete(self.y, i, axis=0)
            test_point = self.X[i]
            predicted_class = self.predict([test_point], h, train_X, train_y)[0]
            predictions.append(predicted_class)

        return accuracy_score(self.y, predictions)

    def select_h(self, h_candidates):
        best_h = None
        best_accuracy = 0

        for h in h_candidates:
            accuracy = self.leave_one_out(h)
            print(f"Accuracy for h={h}: {accuracy}")
            
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_h = h

        return best_h
    
    def epanechnikov_kernel(self, u):
        return 0.75 * (1 - u**2) if np.abs(u) <= 1 else 0

    def quartic_kernel(self, u):
        return (15 / 16) * (1 - u**2)**2 if np.abs(u) <= 1 else 0
    def gaussian_kernel(self, distances, h):
        return np.exp(-0.5 * (distances / h)**2) / ((2 * np.pi)**(len(distances[0])/2) * h**len(distances[0]))

    

    def predict(self, X_new, h=None, train_X=None, train_y=None):
        predictions = []
        if h is None:
            h = self.select_h(self.h_values)

        for point in X_new:
            scores = {}
            for label in set(self.y):
                label_indices = np.where(self.y == label)[0]
                label_data = self.X[label_indices]
                distances = np.linalg.norm(label_data - point, axis=1)
                if self.kernel == 'gaussian':
                    weights = self.gaussian_kernel(distances, h)
                elif self.kernel == 'quartic':
                    weights = np.array([self.quartic_kernel(u / h) for u in distances])
                elif self.kernel == 'epanechnikov':
                    weights = np.array([self.epanechnikov_kernel(u / h) for u in distances])
                else:
                    pass
                scores[label] = sum(weights)

            predicted_class = max(scores, key=scores.get)
            predictions.append(predicted_class)

        return np.array(predictions)


In [None]:
classParzen = ParzenRosenblattClassifier(kernel='gaussian')
classParzen.fit(X_train, y_train, h_values=[20, 1, 0.1])

predictions = classParzen.predict(X_test)

Accuracy for h=20: 0.3379285714285714
Accuracy for h=1: 0.9998571428571429
Accuracy for h=0.1: 1.0


In [64]:
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy gaussian test: {accuracy}')

Accuracy gaussian test: 0.9141666666666667


In [69]:
classParzen = ParzenRosenblattClassifier(kernel='epanechnikov')
classParzen.fit(X_train, y_train, h_values=[40, 20, 10, 5, 4])

# Работает дольше
predictions = classParzen.predict(X_test)

Accuracy for h=40: 0.20907142857142857
Accuracy for h=20: 0.4777142857142857
Accuracy for h=10: 0.6365
Accuracy for h=5: 0.9424285714285714
Accuracy for h=4: 0.9774285714285714


In [59]:
predictions = classParzen.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy epanechnikov test: {accuracy}')

Accuracy epanechnikov test: 0.937


In [72]:
classParzen = ParzenRosenblattClassifier(kernel='quartic')
classParzen.fit(X_train, y_train, h_values=[5, 2, 1])

# Работает дольше
predictions = classParzen.predict(X_test)

Accuracy for h=5: 0.9702142857142857
Accuracy for h=2: 1.0
Accuracy for h=1: 1.0


In [71]:
predictions_q = classParzen.predict(X_test)
accuracy = accuracy_score(y_test, predictions_q)
print(f'Accuracy  quartic test: {accuracy}')

Accuracy  quartic test: 0.9151666666666667
