In [11]:
import pandas as pd
from pathlib import Path

# Read the extracted training text and keep the two columns used for
# text features: the row id and the text itself.
df = pd.read_csv("../data/train_logs_extracted_text.csv")
texts = df.loc[:, ['id', 'text']]

In [12]:
# TF-IDF + SVD for text feature extraction
#
# Fits a character n-gram TF-IDF vectorizer on texts['text'], reduces the
# result with TruncatedSVD, writes the reduced features (keyed by id) to CSV,
# and pickles both fitted transformers.
import pickle

import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Empty CSV fields come back from read_csv as NaN, which the vectorizer
# rejects as an invalid document; treat them as empty strings instead.
train_text = texts['text'].fillna('').astype(str)

vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 5),
    max_features=30000,
    dtype=np.float32,
)

X_tfidf = vectorizer.fit_transform(train_text)

# Dense copy of the TF-IDF matrix, one column per character n-gram.
# NOTE(review): nothing in this cell reads tfidf_df, and .toarray() allocates
# n_rows x n_features values -- drop it if no later cell needs it.
# NOTE(review): insert() raises ValueError if the vocabulary itself contains
# the n-gram 'id'.
tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out()
)
# Assign ids by position (as svd_df does below) so a non-default index on
# `texts` cannot misalign them.
tfidf_df.insert(0, 'id', texts['id'].values)

# Cap at 64 components while staying below the number of TF-IDF features.
n_features = X_tfidf.shape[1]
svdsize = min(64, n_features - 1)
if svdsize < 1:
    raise ValueError(
        f"TF-IDF produced {n_features} feature(s); at least 2 are needed to fit TruncatedSVD"
    )

svd = TruncatedSVD(
    n_components=svdsize,
    random_state=42,
    n_iter=7
)
X_svd = svd.fit_transform(X_tfidf)

# Component columns are named by zero-padded index: '00', '01', ...
svd_df = pd.DataFrame(
    X_svd,
    columns=[f'{i:02d}' for i in range(svdsize)]
)
svd_df.insert(0, 'id', texts['id'].values)

svd_df.to_csv("../data/train_tfidf_text.csv", index=False)

# Persist the fitted transformers so the same mapping can be applied later.
with open('../data/text_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

with open('../data/text_svd.pkl', 'wb') as f:
    pickle.dump(svd, f)


In [14]:
# Apply the pickled TF-IDF vectorizer and SVD to the test text and write the
# reduced features (keyed by id) to CSV.
test_df = pd.read_csv("../data/test_logs_extracted_text.csv")
test_texts = test_df[['id', 'text']]

# NOTE: pickle.load can execute arbitrary code from the file -- only load
# pickles that were produced by a trusted run of this pipeline.
with open('../data/text_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

with open('../data/text_svd.pkl', 'rb') as f:
    svd = pickle.load(f)

# Empty CSV fields come back from read_csv as NaN, which the vectorizer
# rejects as an invalid document; treat them as empty strings instead.
test_text = test_texts['text'].fillna('').astype(str)

X_test_tfidf = vectorizer.transform(test_text)
X_test_svd = svd.transform(X_test_tfidf)
# Take the column count from the transformed output rather than assuming it.
svdsize = X_test_svd.shape[1]

# Component columns are named by zero-padded index: '00', '01', ...
test_svd_df = pd.DataFrame(
    X_test_svd,
    columns=[f'{i:02d}' for i in range(svdsize)]
)
# Assign ids by position so they line up with the transformed rows.
test_svd_df.insert(0, 'id', test_texts['id'].values)
test_svd_df.to_csv("../data/test_tfidf_text.csv", index=False)
