In [215]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,TargetEncoder

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [216]:
# Simulated dataset

np.random.seed(42)

data = pd.DataFrame({
    'ZipCode': np.random.choice(['10001', '10002', '10003', '10004', '10005', '10006', '10007', '10008', '10009', '10010'], size=1000),
    'Income': np.random.randint(20000, 100000, size=1000),
    'HousePrice': np.random.randint(150000, 500000, size=1000)
})

In [217]:
# Split data

X = data[['ZipCode', 'Income' ]]

y = data['HousePrice']

In [218]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [219]:
X_train.head()

Unnamed: 0,ZipCode,Income
29,10004,88244
535,10002,52196
695,10007,96707
557,10007,81788
836,10006,21367


In [220]:
# One-hot Encoder

ohe = OneHotEncoder( sparse_output = False)

X_train_ohe = ohe.fit_transform(X_train[['ZipCode']])

X_train_ohe = pd.DataFrame(X_train_ohe, columns  = ohe.get_feature_names_out(['ZipCode']))

X_train_ohe.head()

Unnamed: 0,ZipCode_10001,ZipCode_10002,ZipCode_10003,ZipCode_10004,ZipCode_10005,ZipCode_10006,ZipCode_10007,ZipCode_10008,ZipCode_10009,ZipCode_10010
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [221]:
# Frequency encoding

freq_encoding = X_train['ZipCode'].value_counts(normalize = True)

X_train['ZipCode_freq']= X_train['ZipCode'].map(freq_encoding)

X_train[['ZipCode','ZipCode_freq']].head()

Unnamed: 0,ZipCode,ZipCode_freq
29,10004,0.09375
535,10002,0.0825
695,10007,0.105
557,10007,0.105
836,10006,0.0875


In [222]:
# Ordinal Encoding

ord_enc = OrdinalEncoder()

X_train['ZipCode_ordinal'] = ord_enc.fit_transform(X_train[['ZipCode']])

X_train[['ZipCode','ZipCode_ordinal']].head()

Unnamed: 0,ZipCode,ZipCode_ordinal
29,10004,3.0
535,10002,1.0
695,10007,6.0
557,10007,6.0
836,10006,5.0


In [223]:
# Target encoding

# Combining X_train,y_train to find mean target house price is in y _train

train_data = pd.concat([X_train,y_train], axis = 1)

mean_target = train_data.groupby('ZipCode')['HousePrice'].mean()

X_train['ZipCode_target'] = X_train['ZipCode'].map(mean_target)

X_train[['ZipCode', 'ZipCode_target']].head()

Unnamed: 0,ZipCode,ZipCode_target
29,10004,326432.826667
535,10002,334750.924242
695,10007,296245.940476
557,10007,296245.940476
836,10006,309744.771429


In [224]:
# Using One-Hot Encoding

lr_ohe = LinearRegression()

X_train_ohe_combined = pd.concat([pd.DataFrame(X_train_ohe), X_train[['Income']].reset_index(drop=True)], axis = 1)

lr_ohe.fit(X_train_ohe_combined, y_train)

mse_ohe = mean_squared_error(y_test,lr_ohe.predict(pd.concat([pd.DataFrame(ohe.transform(X_test[['ZipCode']]), columns=ohe.get_feature_names_out(['ZipCode'])), X_test[['Income']].reset_index(drop= True)], axis =1)))


In [225]:
# Using Frequency Encoding

lr_freq = LinearRegression()

lr_freq.fit(X_train[['ZipCode_freq', 'Income']],y_train)

# Predict on the training set
y_pred_train = lr_freq.predict(X_train[['ZipCode_freq', 'Income']])

mse_freq = mean_squared_error(y_train, y_pred_train)


# mse_freq = mean_squared_error(y_test, lr_freq.predict(X_train[['ZipCode_freq','Income']]))


In [226]:
# Using Target Encoding

lr_target = LinearRegression()

lr_target.fit(X_train[['ZipCode_target', 'Income']], y_train)

mse_target = mean_squared_error(y_train, lr_target.predict(X_train[['ZipCode_target', 'Income']]))

In [227]:
# Results
print(f"Mean Squared Error (One-Hot Encoding): {mse_ohe}")
print(f"Mean Squared Error (Frequency Encoding): {mse_freq}")
print(f"Mean Squared Error (Target Encoding): {mse_target}")

Mean Squared Error (One-Hot Encoding): 9630721637.257563
Mean Squared Error (Frequency Encoding): 10061554147.956675
Mean Squared Error (Target Encoding): 9897319592.421825
