In [50]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from category_encoders import BinaryEncoder,HashingEncoder

from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [51]:
# Simulated dataset

np.random.seed(42)

data = pd.DataFrame({
    'ZipCode': np.random.choice(['10001', '10002', '10003', '10004', '10005', '10006', '10007', '10008', '10009', '10010'], size=1000),
    'Income': np.random.randint(20000, 100000, size=1000),
    'HousePrice': np.random.randint(150000, 500000, size=1000)
})

In [52]:
# Split data

X = data[['ZipCode', 'Income' ]]

y = data['HousePrice']

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
# Binary Encoding

binary_enc = BinaryEncoder(cols=['ZipCode'])
X_train_binary = binary_enc.fit_transform(X_train['ZipCode'])
X_train_binary.head()



Unnamed: 0,ZipCode_0,ZipCode_1,ZipCode_2,ZipCode_3
29,0,0,0,1
535,0,0,1,0
695,0,0,1,1
557,0,0,1,1
836,0,1,0,0


In [55]:
# Hash Encoding

hash_enc = HashingEncoder(cols=['ZipCode'], n_components=5)  # Reduces to 5 dimensions

X_train_hash = hash_enc.fit_transform(X_train[['ZipCode']])

X_train_hash.head()




Unnamed: 0,col_0,col_1,col_2,col_3,col_4
29,0,0,0,1,0
535,0,0,0,0,1
695,0,0,0,1,0
557,0,0,0,1,0
836,0,0,0,1,0


In [56]:
# Frequency encoding

freq_enc = X_train['ZipCode'].value_counts(normalize = True)

X_train['ZipCode_freq'] = X_train['ZipCode'].map(freq_enc)

# Scaling the frequency-encoded column

scaler = MinMaxScaler()

X_train['ZipCode_freq_scaled'] = scaler.fit_transform(X_train[['ZipCode_freq']]) 

X_train[['ZipCode','ZipCode_freq','ZipCode_freq_scaled']].head()

Unnamed: 0,ZipCode,ZipCode_freq,ZipCode_freq_scaled
29,10004,0.09375,0.310345
535,10002,0.0825,0.0
695,10007,0.105,0.62069
557,10007,0.105,0.62069
836,10006,0.0875,0.137931


In [59]:
# Binary Encoding
lr_binary = LinearRegression()
X_train_binary_combined = pd.concat([X_train_binary, X_train[['Income']].reset_index(drop=True)], axis=1)
lr_binary.fit(X_train_binary_combined, y_train)
mse_binary = mean_squared_error(y_test, lr_binary.predict(pd.concat([binary_enc.transform(X_test[['ZipCode']]), X_test[['Income']].reset_index(drop=True)], axis=1)))

# Hash Encoding
lr_hash = LinearRegression()
X_train_hash_combined = pd.concat([X_train_hash, X_train[['Income']].reset_index(drop=True)], axis=1)
lr_hash.fit(X_train_hash_combined, y_train)
mse_hash = mean_squared_error(y_test, lr_hash.predict(pd.concat([hash_enc.transform(X_test[['ZipCode']]), X_test[['Income']].reset_index(drop=True)], axis=1)))

# Frequency Encoding with Scaling
lr_freq_scaled = LinearRegression()
lr_freq_scaled.fit(X_train[['ZipCode_freq_scaled', 'Income']], y_train)
mse_freq_scaled = mean_squared_error(y_test, lr_freq_scaled.predict(X_test[['ZipCode_freq_scaled', 'Income']]))

# Results
print(f"Mean Squared Error (Binary Encoding): {mse_binary}")
print(f"Mean Squared Error (Hash Encoding): {mse_hash}")
print(f"Mean Squared Error (Frequency Scaled): {mse_freq_scaled}")


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values