## AI300
### Data Preprocessing

Normalization may be needed for numerical data when the variables are on very different scales (for example, when one variable is on the order of $10^4$ and another is on the order of $10^1$).

In [2]:
import numpy as np

# Dimensions of the synthetic dataset: training rows, test rows, feature columns.
train_size, test_size, num_features = 80, 20, 4

# Seed NumPy's global RNG so the two draws below are reproducible.
np.random.seed(1)

# Uniform samples in [0, 1). The training matrix is drawn first, then the
# test matrix, so the RNG stream is always consumed in the same order.
X_train = np.random.rand(train_size, num_features)
X_test = np.random.rand(test_size, num_features)

In [None]:
class MyMinMaxScaler():
  """Rescale every feature (column) so the fitted data spans [0, 1].

  The per-column minimum and maximum are learned by `fit_transform` and
  reused by `transform`, so other data is scaled with the statistics of the
  data the scaler was fitted on.

  A column whose fitted minimum equals its maximum has a range of zero; it is
  divided by 1 instead, so it maps to 0 rather than to NaN (0 / 0).
  """

  def __init__(self):
    self._min = None
    self._max = None

  def fit_transform(self, X_train):
    """Learn the column-wise min and max of `X_train` and return it scaled.

    Args:
      X_train: array-like; statistics are taken along axis 0.

    Returns:
      The scaled array, same shape as `X_train`.
    """
    self._min = np.min(X_train, axis = 0)
    self._max = np.max(X_train, axis = 0)

    return self.transform(X_train)

  def transform(self, X_test):
    """Scale `X_test` with the min and max learned by `fit_transform`.

    Args:
      X_test: array-like whose columns match the fitted data.

    Returns:
      `(X_test - min) / (max - min)`, with a divisor of 1 for zero-range columns.

    Raises:
      RuntimeError: if the scaler has not been fitted yet.
    """
    if self._min is None or self._max is None:
      raise RuntimeError("MyMinMaxScaler must be fitted with fit_transform before calling transform")

    value_range = self._max - self._min
    # A zero range would produce 0 / 0 = NaN; dividing by 1 leaves the
    # already-zero numerator of a constant column at 0.
    safe_range = np.where(value_range == 0, 1.0, value_range)

    transformed = (X_test - self._min) / safe_range

    return transformed

In [None]:
# Fit on the training split, then reuse the learned min/max for the test split.
my_minmax_scaler = MyMinMaxScaler()
X_train_minmax_scaled, X_test_minmax_scaled = (
    my_minmax_scaler.fit_transform(X_train),
    my_minmax_scaler.transform(X_test),
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Reference result from scikit-learn: fitted on the training split only,
# then applied to the test split.
sklearn_minmax_scaler = MinMaxScaler()
X_train_minmax_sklearn_scaled, X_test_minmax_sklearn_scaled = (
    sklearn_minmax_scaler.fit_transform(X_train),
    sklearn_minmax_scaler.transform(X_test),
)

In [None]:
# Largest absolute element-wise gap between the custom scaler and scikit-learn:
# training split on the first line, test split on the second.
print(
    *(
        np.abs(mine - reference).max()
        for mine, reference in (
            (X_train_minmax_scaled, X_train_minmax_sklearn_scaled),
            (X_test_minmax_scaled, X_test_minmax_sklearn_scaled),
        )
    ),
    sep="\n",
)

1.1102230246251565e-16
1.1102230246251565e-16


In [None]:
class MyStandardScaler():
  """Standardize every feature (column) to zero mean and unit variance.

  The per-column mean and standard deviation (`np.std` with its default
  arguments) are learned by `fit_transform` and reused by `transform`.

  A column whose fitted standard deviation is exactly zero is divided by 1
  instead, so a constant feature maps to 0 rather than to NaN (0 / 0).
  """

  def __init__(self):
    self._mean = None
    self._std = None

  def fit_transform(self, X_train):
    """Learn the column-wise mean and std of `X_train` and return it standardized.

    Args:
      X_train: array-like; statistics are taken along axis 0.

    Returns:
      The standardized array, same shape as `X_train`.
    """
    self._mean = np.mean(X_train, axis = 0)
    self._std = np.std(X_train, axis = 0)

    return self.transform(X_train)

  def transform(self, X_test):
    """Standardize `X_test` with the mean and std learned by `fit_transform`.

    Args:
      X_test: array-like whose columns match the fitted data.

    Returns:
      `(X_test - mean) / std`, with a divisor of 1 for zero-std columns.

    Raises:
      RuntimeError: if the scaler has not been fitted yet.
    """
    if self._mean is None or self._std is None:
      raise RuntimeError("MyStandardScaler must be fitted with fit_transform before calling transform")

    # Dividing by a zero std would give NaN/inf; a divisor of 1 keeps the
    # centered values of such a column unchanged.
    safe_std = np.where(self._std == 0, 1.0, self._std)

    transformed = (X_test - self._mean) / safe_std

    return transformed

In [None]:
# Fit on the training split, then reuse the learned mean/std for the test split.
my_standard_scaler = MyStandardScaler()
X_train_standard_scaler, X_test_standard_scaler = (
    my_standard_scaler.fit_transform(X_train),
    my_standard_scaler.transform(X_test),
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Reference result from scikit-learn: fitted on the training split only,
# then applied to the test split.
sklearn_standard_scaler = StandardScaler()
X_train_sklearn_standard_scaler, X_test_sklearn_standard_scaler = (
    sklearn_standard_scaler.fit_transform(X_train),
    sklearn_standard_scaler.transform(X_test),
)

In [None]:
# Largest absolute element-wise gap between scikit-learn and the custom scaler:
# training split on the first line, test split on the second.
print(
    *(
        np.abs(reference - mine).max()
        for reference, mine in (
            (X_train_sklearn_standard_scaler, X_train_standard_scaler),
            (X_test_sklearn_standard_scaler, X_test_standard_scaler),
        )
    ),
    sep="\n",
)

0.0
0.0


In [None]:
class NewMyStandardScaler():
  """Standard scaler that pads the divisor with a tiny constant.

  Works like a plain (x - mean) / std scaler, except that 1e-16 is added to
  the standard deviation before dividing, so a column with zero standard
  deviation does not trigger a division by zero.
  """

  def __init__(self):
    self._mean = self._std = None

  def fit_transform(self, X_train):
    """Store the column-wise mean and std of `X_train`, then return it scaled."""
    self._mean = np.mean(X_train, axis = 0)
    self._std = np.std(X_train, axis = 0)

    return self.transform(X_train)

  def transform(self, X_test):
    """Scale `X_test` with the stored mean and the padded std."""
    centered = X_test - self._mean
    padded_std = self._std + 1e-16

    return centered / padded_std

There is no need to normalize a dataset that has only $1$ feature. Normalization puts different variables on comparable scales, but when the dataset contains a single variable there is no other feature to compare it to, so it can be used as-is without normalizing it.

In [None]:
# Vocabularies for the two categorical features.
labels1 = ['big', 'small', 'tall', 'short']
labels2 = ['fish', 'elephant', 'dog', 'hamster']

num_samples = 10

np.random.seed(15)
# One column per vocabulary; labels1 is sampled first, then labels2, so the
# seeded RNG stream is consumed in a fixed order.
X = np.stack([np.random.choice(vocabulary, size = num_samples)
              for vocabulary in (labels1, labels2)], axis = 1)

In [None]:
class MyLabelEncoder():
  """Map labels to integer codes 0..n_classes-1.

  `fit` stores the sorted distinct labels (`np.unique`); `transform` replaces
  each label by its position in that sorted array.
  """

  def __init__(self):
    self._classes = None

  def fit(self, X):
    """Learn the sorted set of distinct labels in `X`."""
    self._classes = np.unique(X)

  def transform(self, X):
    """Encode `X` as integer class indices.

    Args:
      X: array-like of labels, all of which were present during `fit`.

    Returns:
      An int64 array with the same shape as `X`.

    Raises:
      RuntimeError: if `fit` has not been called.
      ValueError: if `X` contains a label that was not seen during `fit`.
    """
    if self._classes is None:
      raise RuntimeError("MyLabelEncoder must be fitted before calling transform")

    X = np.asarray(X)

    # Without this check an unknown label would keep the initial value 0 and
    # be indistinguishable from the first class.
    unseen = ~np.isin(X, self._classes)
    if unseen.any():
      raise ValueError(f"X contains labels not seen during fit: {np.unique(X[unseen]).tolist()}")

    # Use an explicit integer dtype: np.zeros_like(X) would inherit the string
    # dtype of X, turning codes into strings and truncating multi-digit codes
    # when the labels are shorter than the code.
    encoded = np.zeros(X.shape, dtype = np.int64)
    for i, label in enumerate(self._classes):
      encoded[X == label] = i

    return encoded

In [None]:
# Encode each column independently: the same encoder object is re-fitted on
# every column, so the codes of one column are unrelated to those of another.
label_encoder = MyLabelEncoder()
X_label_encoded = np.empty_like(X)

for i in range(X.shape[1]):
  feature = X[:, i]
  label_encoder.fit(feature)
  print(f"Labels of feature {i} are: {label_encoder._classes}")
  X_label_encoded[:, i] = label_encoder.transform(feature)

print(X_label_encoded)

Labels of feature 0 are: ['big' 'short' 'small']
Labels of feature 1 are: ['dog' 'elephant' 'fish' 'hamster']
[['0' '1']
 ['2' '0']
 ['0' '3']
 ['2' '1']
 ['1' '1']
 ['0' '1']
 ['0' '3']
 ['1' '3']
 ['1' '2']
 ['1' '0']]


In [None]:
from sklearn.preprocessing import LabelEncoder

# Same column-by-column encoding, this time with scikit-learn's LabelEncoder
# re-fitted on every column.
label_encoder = LabelEncoder()
X_label_encoded_sklearn = np.empty_like(X)

for i in range(X.shape[1]):
  feature = X[:, i]
  label_encoder.fit(feature)
  print(f"Labels of feature {i} are: {label_encoder.classes_}")
  X_label_encoded_sklearn[:, i] = label_encoder.transform(feature)

print(X_label_encoded_sklearn)

Labels of feature 0 are: ['big' 'short' 'small']
Labels of feature 1 are: ['dog' 'elephant' 'fish' 'hamster']
[['0' '1']
 ['2' '0']
 ['0' '3']
 ['2' '1']
 ['1' '1']
 ['0' '1']
 ['0' '3']
 ['1' '3']
 ['1' '2']
 ['1' '0']]


In [None]:
# True only if every encoded entry matches the scikit-learn result.
print((X_label_encoded == X_label_encoded_sklearn).all())

True


In [None]:
class MyOneHotEncoder():
  """One-hot encode the columns of a 2-D array.

  `fit` records the sorted distinct values of every column. `transform` emits
  one 0/1 indicator column per recorded value, laid out column by column in
  the order of the input columns.
  """

  def __init__(self):
    self._categories = None

  def fit(self, X):
    """Record the distinct values (`np.unique`) of each column of `X`."""
    self._categories = [np.unique(X[:, col]) for col in range(X.shape[1])]

  def transform(self, X):
    """Return a float array with a 1 wherever a cell equals a recorded category."""
    n_rows, n_cols = X.shape[0], X.shape[1]
    total_width = sum(len(self._categories[col]) for col in range(n_cols))

    encoded = np.zeros(shape = (n_rows, total_width))

    # `offset` is the first output column belonging to the current input column.
    offset = 0
    for col in range(n_cols):
      column_values = X[:, col]
      for position, label in enumerate(self._categories[col]):
        encoded[column_values == label, offset + position] = 1
      offset += len(self._categories[col])

    return encoded

In [None]:
# Encode the categorical matrix X with the hand-written one-hot encoder.
one_hot_encoder = MyOneHotEncoder()

# fit() records the distinct values of every column; transform() then builds
# the indicator matrix from those recorded values.
one_hot_encoder.fit(X)
X_one_hot_encoder = one_hot_encoder.transform(X)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Reference encoding from scikit-learn, fitted and applied on the same matrix X.
one_hot_encoder_sklearn = OneHotEncoder()
X_one_hot_encoder_sklearn = one_hot_encoder_sklearn.fit(X).transform(X)

In [None]:
# Check that the hand-written encoding equals the scikit-learn encoding.
# NOTE(review): X_one_hot_encoder_sklearn comes from OneHotEncoder() with default
# arguments and is presumably a SciPy sparse matrix rather than an ndarray;
# confirm that `==` against the dense array is evaluated element-wise here.
print(np.all(X_one_hot_encoder == X_one_hot_encoder_sklearn))

True


In [105]:
from sklearn.datasets import load_iris

# Load the Iris dataset; the following cells read and overwrite its `data`
# and `target` attributes in place.
dataset = load_iris()

In [106]:
# Rows (samples) that contain at least one NaN feature value. The mask is
# computed on dataset.data: np.isnan cannot be applied to the dataset
# container itself.
nan_rows = np.isnan(dataset.data).any(axis=1)
if nan_rows.any():
    # Drop the same rows from the labels and the features so they stay aligned.
    dataset.target = dataset.target[~nan_rows]
    dataset.data = dataset.data[~nan_rows]

# Remove duplicate feature columns (axis=1 compares whole columns). np.unique
# returns its result sorted, which would silently reorder the features, so the
# first occurrence of every distinct column is kept in its original position.
first_occurrence = np.unique(dataset.data, axis=1, return_index=True)[1]
dataset.data = dataset.data[:, np.sort(first_occurrence)]

In [107]:
# Append a constant column of ones as an additional, last feature.
ones = np.ones((len(dataset.data), 1))
dataset.data = np.concatenate((dataset.data, ones), axis = 1)

In [108]:
from sklearn.preprocessing import StandardScaler

# A single StandardScaler, re-fitted on one column at a time.
standard_scaler = StandardScaler()

n_columns = dataset.data.shape[1]
for i in range(n_columns):
  # The scaler is given a single-column 2-D array.
  column = dataset.data[:, i].reshape(-1, 1)
  scaled = standard_scaler.fit_transform(column)
  # Write the scaled values back over the original column.
  dataset.data[:, i] = scaled.ravel()

In [119]:
from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder()

# The encoder is fitted on, and applied to, the labels reshaped into a single
# column; the encoded result then replaces the original target.
target_column = dataset.target.reshape(-1, 1)
one_hot_encoder.fit(target_column)
one_hot_encoded = one_hot_encoder.transform(target_column)
dataset.target = one_hot_encoded