In [1]:
import pandas as pd
from pathlib import Path

df = pd.read_csv("train_logs_extracted_text.csv")

texts = df[['id', 'text']]

print(texts.head(3))

         id                                               text
0  001519c8  qqqqqqqqq qq qqqqq qq qqqq qqqq.  qqqqqq qqq q...
1  0022f953  qqqq qq qqqqqqqqqqq ? qq qq qqq qqq qqq, qqqqq...
2  0042269b  qqqqqqqqqqq qq qqqqq qqqqqqqqq qq qqqqqqqqqqq ...


In [2]:
# TF-IDF + SVD for text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    max_features=30000,
    dtype=np.float32,
)

X_tfidf = vectorizer.fit_transform(texts['text'])

tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out()
)
tfidf_df.insert(0, 'id', texts['id'])

print(tfidf_df.head())

from sklearn.decomposition import TruncatedSVD

svdsize = 64

svd = TruncatedSVD(
    n_components=svdsize,
    random_state=42,
    n_iter=7
)

X_svd = svd.fit_transform(X_tfidf)

svd_df = pd.DataFrame(
    X_svd,
    columns=[f'{i:02d}' for i in range(svdsize)]
)
svd_df.insert(0, 'id', texts['id'].values)

print(svd_df.head())
svd_df.to_csv("texts_tfidf_svd_vectors.csv", index=False)

         id   !    ! q   ! qq   !!   !!    !! q   !!!   !!!    !!!.  ...  \
0  001519c8  0.0   0.0    0.0  0.0   0.0    0.0   0.0    0.0    0.0  ...   
1  0022f953  0.0   0.0    0.0  0.0   0.0    0.0   0.0    0.0    0.0  ...   
2  0042269b  0.0   0.0    0.0  0.0   0.0    0.0   0.0    0.0    0.0  ...   
3  0059420b  0.0   0.0    0.0  0.0   0.0    0.0   0.0    0.0    0.0  ...   
4  0075873a  0.0   0.0    0.0  0.0   0.0    0.0   0.0    0.0    0.0  ...   

   — qqq  —qq  —qq   —qq q  —qq,  —qq,   —qqq  —qqq   —qqqq  —qqq—  
0    0.0  0.0   0.0    0.0   0.0    0.0   0.0    0.0    0.0    0.0  
1    0.0  0.0   0.0    0.0   0.0    0.0   0.0    0.0    0.0    0.0  
2    0.0  0.0   0.0    0.0   0.0    0.0   0.0    0.0    0.0    0.0  
3    0.0  0.0   0.0    0.0   0.0    0.0   0.0    0.0    0.0    0.0  
4    0.0  0.0   0.0    0.0   0.0    0.0   0.0    0.0    0.0    0.0  

[5 rows x 4011 columns]
         id        00        01        02        03        04        05  \
0  001519c8  0.999098  0.0355

In [None]:
df = pd.read_csv("test_logs_extracted_text.csv")

df = df[['id', 'text']]

texts = df[['id', 'text']]

print(texts.head(3))

test_tfidf = vectorizer.transform(texts['text'])

test_svd = svd.transform(test_tfidf)

test_svd_df = pd.DataFrame(
    test_svd,
    columns=[f'{i:02d}' for i in range(svdsize)]
)
test_svd_df.insert(0, 'id', texts['id'].values)

test_svd_df.to_csv("test_operations_tfidf_svd_vectors.csv", index=False)

print(test_svd_df.head(3))

KeyError: "['text'] not in index"