In [1]:
import pandas as pd
from pathlib import Path

# Read train_logs_extracted_text.csv; the full frame stays bound to `df`.
df = pd.read_csv("train_logs_extracted_text.csv")

# Narrow to the identifier and text columns, the only two the later cells read.
texts = df.loc[:, ['id', 'text']]

# Preview the first three rows as plain text.
print(texts.head(3))

         id                                               text
0  001519c8  qqqqqqqqq qq qqqqq qq qqqq qqqq.  qqqqqq qqq q...
1  0022f953  qqqq qq qqqqqqqqqqq ? qq qq qqq qqq qqq, qqqqq...
2  0042269b  qqqqqqqqqqq qq qqqqq qqqqqqqqq qq qqqqqqqqqqq ...


In [2]:
# TF-IDF + SVD for text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Character-level TF-IDF: n-grams of 1 to 5 characters, vocabulary capped at
# 30,000 features, values stored as float32.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 5),
    max_features=30000,
    dtype=np.float32,
)

# Fit the vocabulary on this text and keep the resulting sparse matrix.
X_tfidf = vectorizer.fit_transform(texts['text'])

# Dense copy with one column per n-gram; in the visible cells it is only
# printed, so treat it as an inspection aid.
tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out()
)
# Insert the ids positionally (raw values, not a Series). Inserting a Series
# would align on index labels and yield NaN ids whenever `texts` does not
# carry a default 0..n-1 index; this also matches how the SVD frames attach
# their ids.
tfidf_df.insert(0, 'id', texts['id'].values)

print(tfidf_df.head())

from sklearn.decomposition import TruncatedSVD

# Number of components kept by the SVD projection.
svdsize = 64

# Truncated SVD with a fixed random_state and 7 solver iterations.
svd = TruncatedSVD(n_components=svdsize, n_iter=7, random_state=42)

# Fit on the TF-IDF matrix and project it down to `svdsize` columns.
X_svd = svd.fit_transform(X_tfidf)

# Component columns get zero-padded two-digit labels: '00', '01', ...
svd_columns = [f'{i:02d}' for i in range(svdsize)]
svd_df = pd.DataFrame(X_svd, columns=svd_columns)

# Put the ids in the first column, attached by position.
svd_df.insert(0, 'id', texts['id'].values)

print(svd_df.head())

# Persist the reduced vectors without the row index.
svd_df.to_csv("texts_tfidf_svd_vectors.csv", index=False)

         id              !   !    ! q   ! qq   !!   !!    !! q   !!!  ...  \
0  001519c8  0.126195  0.0  0.0   0.0    0.0  0.0   0.0    0.0   0.0  ...   
1  0022f953  0.160433  0.0  0.0   0.0    0.0  0.0   0.0    0.0   0.0  ...   
2  0042269b  0.114013  0.0  0.0   0.0    0.0  0.0   0.0    0.0   0.0  ...   
3  0059420b  0.135302  0.0  0.0   0.0    0.0  0.0   0.0    0.0   0.0  ...   
4  0075873a  0.140531  0.0  0.0   0.0    0.0  0.0   0.0    0.0   0.0  ...   

    —q  —qq  —qq   —qq q  —qq,  —qq,   —qqq  —qqq   —qqqq  —qqq—  
0  0.0  0.0   0.0    0.0   0.0    0.0   0.0    0.0    0.0    0.0  
1  0.0  0.0   0.0    0.0   0.0    0.0   0.0    0.0    0.0    0.0  
2  0.0  0.0   0.0    0.0   0.0    0.0   0.0    0.0    0.0    0.0  
3  0.0  0.0   0.0    0.0   0.0    0.0   0.0    0.0    0.0    0.0  
4  0.0  0.0   0.0    0.0   0.0    0.0   0.0    0.0    0.0    0.0  

[5 rows x 4236 columns]
         id        00        01        02        03        04        05  \
0  001519c8  0.999581  0.024750 -0.

In [3]:
# Read test_logs_extracted_text.csv and keep only the id and text columns.
# NOTE: this rebinds `df` and `texts`, which the earlier cells bound to the
# training data.
df = pd.read_csv("test_logs_extracted_text.csv")[['id', 'text']]

# read_csv parses empty fields as NaN, and the vectorizer cannot process a NaN
# document. Substitute an empty string so a row with no text still gets a
# vector instead of aborting the cell.
texts = df.fillna({'text': ''})

print(texts.head(3))

# Transform only (no refitting): reuse the vectorizer and SVD objects fitted
# in the previous cell so these vectors live in the same feature space.
test_tfidf = vectorizer.transform(texts['text'])
test_svd = svd.transform(test_tfidf)

# Same zero-padded component labels as the training frame.
test_svd_df = pd.DataFrame(
    test_svd,
    columns=[f'{i:02d}' for i in range(svdsize)]
)
# Ids are attached by position as the first column.
test_svd_df.insert(0, 'id', texts['id'].values)

test_svd_df.to_csv("test_operations_tfidf_svd_vectors.csv", index=False)

print(test_svd_df.head(3))

         id text
0  0000aaaa     
1  2222bbbb   qq
2  4444cccc   q 
         id        00        01        02        03        04        05  \
0  0000aaaa  0.133119 -0.208592 -0.010232 -0.165733  0.111861 -0.038116   
1  2222bbbb  0.763731 -0.033761 -0.009919 -0.187567 -0.040646  0.160399   
2  4444cccc  0.501594 -0.297041 -0.010037 -0.168277  0.161586  0.058945   

         06        07        08  ...        54        55        56        57  \
0  0.060394 -0.106699  0.177698  ... -0.004176 -0.023058 -0.007138 -0.064382   
1  0.130930  0.006696  0.018144  ... -0.012592  0.010105 -0.003868  0.032823   
2  0.079454 -0.196909  0.181865  ...  0.018682 -0.007194  0.022382 -0.047709   

         58        59        60        61        62        63  
0  0.014699  0.013954 -0.020810 -0.008230  0.004583 -0.000717  
1 -0.000545 -0.008911  0.001609  0.001614 -0.003713  0.013509  
2 -0.004116 -0.000400 -0.021395  0.014968  0.003493 -0.007495  

[3 rows x 65 columns]
