# Feature Engineering


In [1]:
import sys
import os
import polars as pl
sys.path.append('../') 
from hnm_data_analysis.feature_engineering import ArticleDescriptionVectoriser
from hnm_data_analysis.feature_engineering.articles_text_bert import ArticleDescriptionBertEmbedder
from hnm_data_analysis.feature_engineering import CombinedBertArticleFeatures


### Create a Feature using TF-IDF on detail_desc


In [2]:
# TF-IDF features from the cleaned article descriptions, followed by a
# TruncatedSVD projection of the TF-IDF matrix (both steps are reported in the
# cell output).
tfidf_source = "../data/cleaned/articles_last_3_months_cleaned.parquet"
tfidf_target_dir = "../data/features/tfidf_svd"

vec = ArticleDescriptionVectoriser(
    input_path=tfidf_source,
    language="en",
    use_lemmatise=True,
    use_stem=False,
)

# Settings echoed by the "Fitting TF-IDF: ..." log line.
tfidf_options = dict(
    max_features=30000,
    min_df=5,
    max_df=0.8,
    ngram_range=(1, 2),
)

# Artefacts (matrix, vectoriser, id index, SVD embeddings/model) are written
# under tfidf_target_dir, as listed in the "Saving ..." log lines.
tfidf, svd = vec.process(
    output_dir=tfidf_target_dir,
    include_svd=True,
    svd_components=200,
    **tfidf_options,
)

Loading articles from: ../data/cleaned/articles_last_3_months_cleaned.parquet
Articles with valid descriptions: 42,229
Prepared cleaned texts: 42,229
Fitting TF-IDF: max_features=30000, min_df=5, max_df=0.8, ngram_range=(1, 2)
TF-IDF shape: 42,229 docs x 11,309 terms
Fitting TruncatedSVD with n_components=200 ...
SVD embeddings shape: 42,229 x 200
Saving TF-IDF matrix to: ../data/features/tfidf_svd\tfidf_features.npz
Saving vectorizer to: ../data/features/tfidf_svd\vectorizer.joblib
Saving article_id index to: ../data/features/tfidf_svd\article_id_index.csv
Saving SVD embeddings to: ../data/features/tfidf_svd\svd_embeddings.parquet
Saving SVD model to: ../data/features/tfidf_svd\svd_model.joblib


### Create a Feature using BERT on detail_desc


In [3]:
# Sentence-transformer embeddings of the cleaned article descriptions, plus a
# PCA-reduced copy (both steps are reported in the cell output).
bert_source = "../data/cleaned/articles_last_3_months_cleaned.parquet"
bert_target_dir = "../data/features/bert"

encoder_options = dict(
    model_name="sentence-transformers/all-mpnet-base-v2",
    max_length=256,
    batch_size=16,
    device="auto",  # Uses GPU if available (the recorded run loaded the model on cpu)
)
bert_embedder = ArticleDescriptionBertEmbedder(
    input_path=bert_source,
    **encoder_options,
)

# PCA step: 50 components with pca_normalize switched on.
pca_options = dict(
    include_pca=True,
    pca_components=50,
    pca_normalize=True,
)

# Artefacts (embeddings, model info, id index, PCA embeddings/model) are
# written under bert_target_dir, as listed in the "Saving ..." log lines.
bert_embeddings, pca_embeddings = bert_embedder.process(
    output_dir=bert_target_dir,
    show_progress_bar=True,
    **pca_options,
)

Loading articles from: ../data/cleaned/articles_last_3_months_cleaned.parquet
Articles with valid descriptions: 42,229
Prepared cleaned texts: 42,229


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


Loading BERT model: sentence-transformers/all-mpnet-base-v2
Model loaded on device: cpu
Generating BERT embeddings for 42,229 texts...
Model: sentence-transformers/all-mpnet-base-v2, Max length: 256, Batch size: 16


Batches:   0%|          | 0/2640 [00:00<?, ?it/s]

BERT embeddings shape: 42,229 docs x 768 dimensions
Fitting PCA with n_components=50 ...
PCA embeddings shape: 42,229 x 50
Explained variance ratio: 0.793
Saving BERT embeddings to: ../data/features/bert\bert_embeddings.parquet
Saving model info to: ../data/features/bert\bert_model_info.json
Saving article_id index to: ../data/features/bert\article_id_index.csv
Saving PCA embeddings to: ../data/features/bert\pca_embeddings.parquet
Saving PCA model to: ../data/features/bert\pca_model.joblib


### Combine the BERT embeddings with the cleaned articles (`data/cleaned/articles_last_3_months_cleaned.parquet`)


In [4]:
# Combine cleaned articles + BERT embeddings and save to data/features/combined/.
#
# CombinedBertArticleFeatures() is used with its module defaults; the logged
# output path ("data/features/combined\...") is relative, so the job is run
# from the parent directory. The working directory is restored afterwards:
# a bare os.chdir("..") would move one level further up every time this cell
# is re-run and would also invalidate the "../data/..." paths used by the
# earlier cells.
notebook_dir = os.getcwd()
os.chdir("..")
try:
    job = CombinedBertArticleFeatures()  # uses module defaults
    # Resolve the returned path while still in the parent directory so that it
    # remains valid once the original working directory is restored.
    out_path = os.path.abspath(job.run())
finally:
    os.chdir(notebook_dir)

# Load and inspect
df = pl.read_parquet(out_path)
df.head()

Loaded cleaned articles: (42298, 28)
Loaded BERT embeddings: (42229, 769)
Combined dataset shape: (42229, 776)
Articles with embeddings and metadata: 42,229
Saved combined dataset to: data/features/combined\articles_with_bert_embeddings.parquet


article_id,bert_001,bert_002,bert_003,bert_004,bert_005,bert_006,bert_007,bert_008,bert_009,bert_010,bert_011,bert_012,bert_013,bert_014,bert_015,bert_016,bert_017,bert_018,bert_019,bert_020,bert_021,bert_022,bert_023,bert_024,bert_025,bert_026,bert_027,bert_028,bert_029,bert_030,bert_031,bert_032,bert_033,bert_034,bert_035,bert_036,…,bert_739,bert_740,bert_741,bert_742,bert_743,bert_744,bert_745,bert_746,bert_747,bert_748,bert_749,bert_750,bert_751,bert_752,bert_753,bert_754,bert_755,bert_756,bert_757,bert_758,bert_759,bert_760,bert_761,bert_762,bert_763,bert_764,bert_765,bert_766,bert_767,bert_768,product_type_name,product_group_name,department_name,section_name,garment_group_name,colour_group_name,graphical_appearance_name
i64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,cat,cat,cat,cat,cat,cat,cat
816591001,-0.015659,0.012156,0.010178,-0.006604,-0.039259,-0.018793,-0.007029,0.054467,-0.034701,-0.008512,0.070179,0.018095,0.034471,0.033576,0.094397,0.003092,-0.055582,0.002851,0.051617,-0.005365,0.025957,0.020131,0.000646,-0.001181,0.010801,-0.036969,-0.010229,0.040888,-0.005502,-0.000103,0.076183,-0.012406,-0.023416,-0.068657,2e-06,-0.000894,…,0.005918,0.022829,-0.047841,0.02728,0.004846,-0.004082,0.018529,0.02017,0.014933,0.021912,-0.020965,0.006171,0.053716,0.008077,0.046846,0.024844,0.010362,0.006734,1.9350000000000002e-34,-0.009219,-0.127457,0.013274,0.050405,0.011029,0.002934,0.070942,0.027207,-0.000197,-0.013809,0.006784,"""T-shirt""","""Garment Upper body""","""Tops Fancy Jersey""","""Divided Collection""","""Jersey Fancy""","""Dark Blue""","""Stripe"""
804472003,0.005563,0.005978,-0.026642,0.002938,-0.027864,0.003424,0.001553,0.018717,0.018902,-0.004157,0.000965,0.015299,0.05248,0.112323,0.072043,-0.03552,-0.022659,-0.012899,0.003428,-0.026845,0.022018,0.038685,-0.010421,0.027536,0.012282,-0.034946,0.002565,0.000958,0.002286,0.025895,0.094062,0.00401,-0.036547,-0.048726,2e-06,-0.018688,…,0.051404,-0.018781,-0.077134,-0.013477,0.010288,0.006449,0.014299,0.07041,0.00431,0.037104,0.004784,-0.076596,0.022273,0.029969,0.049993,0.0013,-0.007544,0.011644,1.9875e-34,0.019272,-0.066649,0.023891,0.003674,0.029417,0.015441,0.041853,-0.015929,0.024076,0.007616,-0.000851,"""Sneakers""","""Shoes""","""Kids Boy Shoes""","""Kids & Baby Shoes""","""Shoes""","""Black""","""Solid"""
693915001,-0.035382,0.025676,0.020326,-0.009956,-0.0252,-0.040034,-0.055042,0.048816,-0.043896,-0.034095,0.014015,0.03632,0.052355,0.040688,0.101161,-0.017168,-0.015411,-0.030049,-0.02938,0.001041,0.012573,0.01813,-0.010806,-0.000751,0.039573,-0.036114,-0.030136,0.002133,-0.019197,0.039195,0.090534,0.003774,-0.002759,-0.079025,2e-06,-0.008549,…,0.024847,0.006655,-0.064195,-0.015331,0.021835,-0.000923,-0.008696,0.052818,-0.006522,-0.001527,-0.018466,-0.047959,0.0397,0.023257,0.051558,0.017016,0.046344,0.002421,1.5923000000000001e-34,0.016653,-0.02932,-0.005469,0.051259,0.01495,0.000143,-0.002265,-0.006005,0.03819,-0.028675,-0.008054,"""Trousers""","""Garment Lower body""","""Trousers & Skirt""","""Womens Trend""","""Trousers""","""Beige""","""Check"""
790904007,-0.035949,0.046203,0.011405,0.01268,-0.055287,-0.026921,-0.039075,0.056623,-0.054116,-0.03501,0.011305,0.007535,0.039332,0.022054,0.082046,-2.9e-05,-0.024672,-0.010841,-0.049222,-0.01248,-0.003162,-0.002767,-0.001446,-0.001519,0.064139,-0.011927,-0.056071,-0.008529,-0.008399,0.05896,0.06918,0.020556,-0.025252,-0.053308,2e-06,-0.013209,…,0.038971,0.031761,-0.053834,-0.029495,0.005277,-0.004412,-0.029428,0.054736,-0.018975,0.00354,-0.011066,-0.045968,0.013938,0.019726,0.035718,0.028495,0.006232,0.005376,1.6376e-34,0.02205,-0.03512,-0.017729,0.066212,0.005572,-0.005966,0.043238,-0.018493,0.046377,-0.029211,-0.009323,"""Trousers""","""Garment Lower body""","""Trouser""","""Contemporary Casual""","""Trousers""","""Beige""","""Solid"""
829618001,0.017185,0.015238,-0.031034,-0.029762,-0.018644,0.06515,0.003714,0.017217,-0.035679,0.018255,0.008036,-0.006915,0.024826,0.095959,0.044352,-0.028764,-0.048612,0.0308,-0.001608,0.015041,0.013765,0.01781,-0.039813,-0.011494,0.061811,-0.066379,-0.017043,0.05007,-0.007313,0.00322,0.054836,-0.051389,-0.010636,-0.057132,1e-06,-0.014798,…,0.018852,0.026622,-0.048741,-0.015574,-0.038639,0.009846,-0.0049,0.0019,0.021045,0.030472,0.040846,-0.046962,0.016626,-0.038583,0.08201,0.033284,-0.101032,-0.014251,7.866e-35,0.005772,-0.061571,-0.012571,0.011222,0.010149,-0.01142,-0.007657,-0.03718,0.034058,0.008531,0.011802,"""Bra""","""Underwear""","""Expressive Lingerie""","""Womens Lingerie""","""Under-, Nightwear""","""White""","""Solid"""
