In [8]:
import pandas as pd
from pathlib import Path

# Load the extracted-text CSV; `df` keeps every column exactly as parsed.
df = pd.read_csv("../data/test_logs_extracted_text.csv")

# Keep only the identifier and the text column.
# read_csv parses empty fields as NaN, and TfidfVectorizer refuses NaN
# documents, so missing text becomes an empty string and every value is
# coerced to str. .assign() returns a new frame, so `df` is left untouched.
texts = df[['id', 'text']].assign(
    text=lambda frame: frame['text'].fillna('').astype(str)
)

print(texts.head(3))

         id text
0  0000aaaa     
1  2222bbbb   qq
2  4444cccc   q 


In [9]:
# TF-IDF + SVD for text feature extraction
#
# Pipeline: character n-gram TF-IDF over texts['text'], reduced with
# TruncatedSVD, then written to ../data/test_tfidf_text.csv keyed by 'id'.
#
# NOTE(review): the vectorizer and the SVD are both fitted on this file's own
# rows. If these features are meant to line up with features computed from a
# different file, the fitted objects would have to be shared rather than
# refitted here -- confirm against whatever consumes the CSV.
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 5),
    max_features=30000,
    dtype=np.float32,
)

X_tfidf = vectorizer.fit_transform(texts['text'])

# Ids as a plain array: inserting a Series would align on index labels, while
# the matrix rows are purely positional. Both frames below use the same array.
text_ids = texts['id'].to_numpy()

# Keep the TF-IDF frame sparse-backed. Calling .toarray() on the full matrix
# allocates n_docs x n_features dense float32 values (up to 30000 columns)
# just to print five rows.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    X_tfidf,
    columns=vectorizer.get_feature_names_out(),
)
tfidf_df.insert(0, 'id', text_ids)

print(tfidf_df.head())

n_features = X_tfidf.shape[1]
# At most 64 components, and always strictly fewer than the feature count.
svdsize = min(64, n_features - 1)

if svdsize > 0:
    svd = TruncatedSVD(
        n_components=svdsize,
        random_state=42,  # fixed seed so the randomized solver is repeatable
        n_iter=7,
    )
    X_svd = svd.fit_transform(X_tfidf)
else:
    # Only reached when n_features <= 1: there is nothing to reduce, so the
    # TF-IDF values are passed through unchanged.
    X_svd = X_tfidf.toarray()
    svdsize = n_features

# Component columns are named by zero-padded position: '00', '01', ...
svd_df = pd.DataFrame(
    X_svd,
    columns=[f'{i:02d}' for i in range(svdsize)]
)
svd_df.insert(0, 'id', text_ids)

print(svd_df.head())
svd_df.to_csv("../data/test_tfidf_text.csv", index=False)

         id                   q        q         qq
0  0000aaaa  1.000000  0.000000  0.000000  0.000000
1  2222bbbb  0.000000  0.835592  0.000000  0.549351
2  4444cccc  0.517856  0.517856  0.680919  0.000000
         id        00            01        02
0  0000aaaa  0.702226 -6.412066e-01 -0.309409
1  2222bbbb  0.586774  7.673686e-01 -0.258540
2  4444cccc  0.915109  6.005710e-08  0.403208
