# Article Feature Engineering


In [1]:
import sys
import os
import polars as pl
sys.path.append('../') 
from hnm_data_analysis.feature_engineering import ArticleDescriptionVectoriser
from hnm_data_analysis.feature_engineering.articles_text_bert import ArticleDescriptionBertEmbedder
from hnm_data_analysis.feature_engineering import CombinedBertArticleFeatures


### Create a Feature using TF-IDF on detail_desc


In [2]:
# TF-IDF vocabulary settings and the SVD reduction settings, kept in named
# groups so they can be tuned in one place.
tfidf_params = dict(max_features=30000, min_df=5, max_df=0.8, ngram_range=(1, 2))
svd_params = dict(include_svd=True, svd_components=200)

# English descriptions, lemmatised rather than stemmed.
vec = ArticleDescriptionVectoriser(
    input_path="../data/cleaned/articles_last_3_months_cleaned.parquet",
    language="en",
    use_lemmatise=True,
    use_stem=False,
)

# Fit TF-IDF (plus the SVD reduction) and write the artefacts to output_dir.
# The two returned values are presumably the TF-IDF matrix and the SVD
# embeddings -- TODO confirm against ArticleDescriptionVectoriser.process.
tfidf, svd = vec.process(
    output_dir="../data/features/tfidf_svd",
    **svd_params,
    **tfidf_params,
)

Loading articles from: ../data/cleaned/articles_last_3_months_cleaned.parquet
Articles with valid descriptions: 42,229
Prepared cleaned texts: 42,229
Fitting TF-IDF: max_features=30000, min_df=5, max_df=0.8, ngram_range=(1, 2)
TF-IDF shape: 42,229 docs x 11,309 terms
Fitting TruncatedSVD with n_components=200 ...
SVD embeddings shape: 42,229 x 200
Saving TF-IDF matrix to: ../data/features/tfidf_svd/tfidf_features.npz
Saving vectorizer to: ../data/features/tfidf_svd/vectorizer.joblib
Saving article_id index to: ../data/features/tfidf_svd/article_id_index.csv
Saving SVD embeddings to: ../data/features/tfidf_svd/svd_embeddings.parquet
Saving SVD model to: ../data/features/tfidf_svd/svd_model.joblib


### Create a Feature using BERT on detail_desc


In [2]:
# Sentence-transformer settings used to embed the article descriptions.
bert_config = dict(
    model_name="sentence-transformers/all-mpnet-base-v2",
    max_length=256,
    batch_size=16,
    device="auto",  # Uses GPU if available
)
# PCA reduction applied on top of the raw embeddings.
pca_config = dict(include_pca=True, pca_components=50, pca_normalize=True)

bert_embedder = ArticleDescriptionBertEmbedder(
    input_path="../data/cleaned/articles_last_3_months_cleaned.parquet",
    **bert_config,
)

# Generate the embeddings (with a progress bar) and write the artefacts to
# output_dir. The two returned values are presumably the full embeddings and
# their PCA-reduced form -- TODO confirm against the embedder's process().
bert_embeddings, pca_embeddings = bert_embedder.process(
    output_dir="../data/features/bert",
    show_progress_bar=True,
    **pca_config,
)

Loading articles from: ../data/cleaned/articles_last_3_months_cleaned.parquet
Articles with valid descriptions: 42,229
Prepared cleaned texts: 42,229


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


Loading BERT model: sentence-transformers/all-mpnet-base-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded on device: cpu
Generating BERT embeddings for 42,229 texts...
Model: sentence-transformers/all-mpnet-base-v2, Max length: 256, Batch size: 16


Batches:   0%|          | 0/2640 [00:00<?, ?it/s]

BERT embeddings shape: 42,229 docs x 768 dimensions
Fitting PCA with n_components=50 ...
PCA embeddings shape: 42,229 x 50
Explained variance ratio: 0.793
Saving BERT embeddings to: ../data/features/bert/bert_embeddings.parquet
Saving model info to: ../data/features/bert/bert_model_info.json
Saving article_id index to: ../data/features/bert/article_id_index.csv
Saving PCA embeddings to: ../data/features/bert/pca_embeddings.parquet
Saving PCA model to: ../data/features/bert/pca_model.joblib


### Combine the BERT vector features with `data/cleaned/articles_last_3_months_cleaned.parquet`


In [None]:
# The job below is built with its module defaults, which presumably resolve
# paths relative to the project root (TODO confirm) -- hence the change of
# working directory.
# Guarded so the cell is idempotent: an unconditional os.chdir("..") would climb
# one directory further on every re-run. Only step up when the current
# directory does not already contain the data/ folder.
if not os.path.isdir("data"):
    os.chdir("..")

# Combine cleaned articles + BERT embeddings and save to data/features/combined/
job = CombinedBertArticleFeatures()  # uses module defaults
out_path = job.run()

# Load and inspect
df = pl.read_parquet(out_path)
df.head()