In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter

In [2]:
# some basic HDC operations
# binding: element-wise multiplication
def bind(hv1, hv2):
    return hv1 * hv2

# element-wise addition
def bundle(hv_list):
    bundled = np.zeros(D, dtype=complex)
    for hv in hv_list:
        bundled += hv
    return bundled

# use dot product to test similarity
def similarity(hv1, hv2):
    return np.real(np.dot(hv1, np.conj(hv2))) / D

In [3]:
# generate the matrix W (D * n) for later feature encoding
def gen_basis(D, n, distribution, cov=None):
  if distribution == 'gaussian':
    if cov is None:
      alpha = 1.0
      cov = alpha * np.eye(n)
    W = np.random.multivariate_normal(np.zeros(n), cov, size=D)
  elif distribution == 'uniform':
    W = 2 * np.pi * np.random.rand(D, n)

  return W

In [4]:
# W(D, n) * x(batch_size, n)^T = (D, batch_size)
def encode_feature(x, W):
  return np.exp(1j * W @ x.T).T

In [13]:
data = fetch_covtype()
x = data.data
y = data.target - 1

x = x[:, :10]

scaler = StandardScaler()
x = scaler.fit_transform(x)

smote = SMOTE(random_state=42)
x_balanced, y_balanced = smote.fit_resample(x, y)
x_balanced, _, y_balanced, _ = train_test_split(x_balanced, y_balanced, train_size=0.1, random_state=42, stratify=y_balanced)

x_train, x_test, y_train, y_test = train_test_split(x_balanced, y_balanced, test_size=0.2, random_state=42)

num_classes = len(np.unique(y))

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

print(f"x_train.shape: {x_train.shape}, y_train.shape: {y_train.shape}")
print(f"x_val.shape: {x_val.shape}, y_val.shape: {y_val.shape}")
print(f"x_test.shape: {x_test.shape}, y_test.shape: {y_test.shape}")
print(f"number_classes: {num_classes}")
print(f"#samples in each class after under sampling: {Counter(y_train)}")

x_train.shape: (126918, 10), y_train.shape: (126918,)
x_val.shape: (31730, 10), y_val.shape: (31730,)
x_test.shape: (39662, 10), y_test.shape: (39662,)
number_classes: 7
#samples in each class after under sampling: Counter({1: 18208, 0: 18191, 2: 18147, 6: 18137, 5: 18118, 3: 18065, 4: 18052})


In [14]:
D = 10000
alpha = 2.0
n_features = x_train.shape[1]
cov = alpha * np.eye(n_features)
W = gen_basis(D, n_features, distribution='gaussian', cov=cov)

In [15]:
# encode data
x_train_hv = encode_feature(x_train, W)
x_val_hv = encode_feature(x_val, W)
x_test_hv = encode_feature(x_test, W)

print(f"x_train_hv.shape: {x_train_hv.shape}")
print(f"x_val_hv.shape: {x_val_hv.shape}")
print(f"x_test_hv.shape: {x_test_hv.shape}")

x_train_hv.shape: (126918, 10000)
x_val_hv.shape: (31730, 10000)
x_test_hv.shape: (39662, 10000)


In [16]:
# only take the real part of the values
x_train_real = x_train_hv.real
x_val_real = x_val_hv.real
x_test_real = x_test_hv.real

In [17]:
rf_model_balanced = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1) #n_estimators => #of trees
rf_model_balanced.fit(x_train_real, y_train)

In [18]:
y_val_pred = rf_model_balanced.predict(x_val_real)
val_acc = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_acc:.4f}")

y_test_pred = rf_model_balanced.predict(x_test_real)
test_acc = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_acc:.4f}")

print(classification_report(y_test, y_test_pred))

Validation Accuracy: 0.9225
Test Accuracy: 0.9202
              precision    recall  f1-score   support

           0       0.84      0.82      0.83      5623
           1       0.83      0.77      0.80      5633
           2       0.94      0.92      0.93      5633
           3       0.97      1.00      0.98      5746
           4       0.96      0.99      0.97      5723
           5       0.92      0.95      0.94      5590
           6       0.97      0.99      0.98      5714

    accuracy                           0.92     39662
   macro avg       0.92      0.92      0.92     39662
weighted avg       0.92      0.92      0.92     39662

