# Feature Engineering


In [1]:
import sys
# Put the parent directory on the import path so the project-local
# `hnm_data_analysis` package imported below can be resolved.
# NOTE(review): '../' is relative to the kernel's working directory — presumably
# the notebook is run from a folder one level below the repo root; confirm.
sys.path.append('../') 
from hnm_data_analysis.feature_engineering import ArticleDescriptionVectoriser
from hnm_data_analysis.feature_engineering.articles_text_bert import ArticleDescriptionBertEmbedder

### Create features using TF-IDF on `detail_desc`


In [2]:
# TF-IDF pipeline over the cleaned article descriptions.
# Constructor settings are gathered in one mapping so they are easy to read and tune.
tfidf_init_kwargs = {
    "input_path": "../data/cleaned/articles_last_3_months_cleaned.parquet",
    "language": "en",
    "use_lemmatise": True,
    "use_stem": False,
}
vec = ArticleDescriptionVectoriser(**tfidf_init_kwargs)

# Per the recorded cell output, process() fits TF-IDF, then TruncatedSVD with
# `svd_components` components, and saves the TF-IDF matrix, vectorizer,
# article_id index, SVD embeddings and SVD model under `output_dir`.
tfidf_process_kwargs = {
    "output_dir": "../data/features/tfidf_svd",
    "include_svd": True,
    "svd_components": 200,
    "max_features": 30000,
    "min_df": 5,
    "max_df": 0.8,
    "ngram_range": (1, 2),
}
tfidf, svd = vec.process(**tfidf_process_kwargs)

Loading articles from: ../data/cleaned/articles_last_3_months_cleaned.parquet
Articles with valid descriptions: 42,229
Prepared cleaned texts: 42,229
Fitting TF-IDF: max_features=30000, min_df=5, max_df=0.8, ngram_range=(1, 2)
TF-IDF shape: 42,229 docs x 11,309 terms
Fitting TruncatedSVD with n_components=200 ...
SVD embeddings shape: 42,229 x 200
Saving TF-IDF matrix to: ../data/features/tfidf_svd\tfidf_features.npz
Saving vectorizer to: ../data/features/tfidf_svd\vectorizer.joblib
Saving article_id index to: ../data/features/tfidf_svd\article_id_index.csv
Saving SVD embeddings to: ../data/features/tfidf_svd\svd_embeddings.parquet
Saving SVD model to: ../data/features/tfidf_svd\svd_model.joblib


### Create features using BERT embeddings of `detail_desc`


In [3]:
# Sentence-embedding pipeline over the same cleaned article descriptions.
bert_init_kwargs = {
    "input_path": "../data/cleaned/articles_last_3_months_cleaned.parquet",
    "model_name": "all-MiniLM-L6-v2",  # Fast, good quality model
    "batch_size": 32,
    # Uses GPU if available (the recorded run below loaded the model on cpu).
    "device": "auto",
}
bert_embedder = ArticleDescriptionBertEmbedder(**bert_init_kwargs)

# Per the recorded cell output, process() generates the BERT embeddings, fits PCA
# with `pca_components` components, and saves the embeddings, model info,
# article_id index, PCA embeddings and PCA model under `output_dir`.
bert_process_kwargs = {
    "output_dir": "../data/features/bert",
    "include_pca": True,
    "pca_components": 200,
    "pca_normalize": True,
    "show_progress_bar": True,
}
bert_embeddings, pca_embeddings = bert_embedder.process(**bert_process_kwargs)

Loading articles from: ../data/cleaned/articles_last_3_months_cleaned.parquet
Articles with valid descriptions: 42,229
Prepared cleaned texts: 42,229


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Loading BERT model: all-MiniLM-L6-v2
Model loaded on device: cpu
Generating BERT embeddings for 42,229 texts...
Model: all-MiniLM-L6-v2, Max length: 128, Batch size: 32


Batches:   0%|          | 0/1320 [00:00<?, ?it/s]

BERT embeddings shape: 42,229 docs x 384 dimensions
Fitting PCA with n_components=200 ...
PCA embeddings shape: 42,229 x 200
Explained variance ratio: 0.979
Saving BERT embeddings to: ../data/features/bert\bert_embeddings.parquet
Saving model info to: ../data/features/bert\bert_model_info.json
Saving article_id index to: ../data/features/bert\article_id_index.csv
Saving PCA embeddings to: ../data/features/bert\pca_embeddings.parquet
Saving PCA model to: ../data/features/bert\pca_model.joblib
