In [1]:
import pandas as pd

# Load the item catalogue from the sales data directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests
# the CSV was written together with its index; if so, `index_col=0` would drop
# it -- confirm against the file before changing the load.
df_items = pd.read_csv("../data/sales/items.csv")

# Report the row count and the number of distinct category ids.
print(f"Number of items: {len(df_items)}")
print(f"Number of category: {len(df_items['item_category_id'].unique())}")

# Preview the first five rows (five is the default for DataFrame.head).
df_items.head()

Number of items: 22170
Number of category: 84


Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [2]:
import hashlib
from typing import Tuple

from sklearn.base import BaseEstimator, TransformerMixin


def hash_modulo(val, mod):
    """Deterministically map ``val`` to an integer bucket.

    The value is converted with ``str()``, encoded with the default
    ``str.encode()`` codec, hashed with MD5, and the 128-bit digest is read as
    an unsigned big-endian integer and reduced modulo ``mod``.

    MD5 serves only as a stable hash here; any other deterministic hash
    function could be substituted.

    Args:
        val: Any object; only its ``str()`` form is hashed, so ``123`` and
            ``"123"`` land in the same bucket.
        mod: Number of buckets. For a positive ``mod`` the result lies in
            ``range(mod)``.

    Returns:
        int: The digest value modulo ``mod``.

    Raises:
        ZeroDivisionError: If ``mod`` is 0.
    """
    encoded = str(val).encode()
    digest = hashlib.md5(encoded).digest()
    # Big-endian interpretation of the raw digest equals int(hexdigest, 16).
    return int.from_bytes(digest, "big") % mod


class FeatureHasher(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces each value with a hash-bucket id.

    Every element of the input Series is passed to ``hash_modulo``, which
    hashes the element's string form and reduces it modulo ``num_buckets``.
    Nothing is learned from the data, so ``fit`` is a no-op.

    Note: this class is unrelated to
    ``sklearn.feature_extraction.FeatureHasher`` even though it shares the name.

    Parameters
    ----------
    num_buckets : int
        Number of hash buckets. For a positive value the transformed values
        lie in ``range(num_buckets)``.
    """

    def __init__(self, num_buckets: int):
        # Stored unmodified so BaseEstimator.get_params / clone can round-trip it.
        self.num_buckets = num_buckets

    def fit(self, X: pd.Series, y=None):
        """Do nothing and return ``self``.

        ``y`` is accepted and ignored so the transformer can be used where
        scikit-learn passes targets to ``fit`` (e.g. ``fit_transform(X, y)``
        or inside a supervised ``Pipeline``).
        """
        return self

    def transform(self, X: pd.Series) -> pd.Series:
        """Return a Series with each element of ``X`` replaced by its bucket id."""
        # Read the parameter once instead of on every element.
        num_buckets = self.num_buckets
        return X.apply(lambda x: hash_modulo(x, num_buckets))


# Hash every item name into one of 1000 buckets and keep the bucket id as a
# new column next to the original ones.
fh = FeatureHasher(num_buckets=1000)
df_items["hashed_item"] = fh.transform(df_items["item_name"])

# Preview the first five rows (five is the default for DataFrame.head).
df_items.head()

Unnamed: 0,item_name,item_id,item_category_id,hashed_item
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40,252
1,!ABBYY FineReader 12 Professional Edition Full...,1,76,812
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40,198
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40,584
4,***КОРОБКА (СТЕКЛО) D,4,40,210


In [3]:
# Count how many items landed in each hash bucket, most populated first;
# any bucket holding more than one distinct item name is a hash collision.
df_items["hashed_item"].value_counts()

69     39
353    36
962    36
997    36
194    35
       ..
204    11
197    11
567    10
228    10
219     8
Name: hashed_item, Length: 1000, dtype: int64