In [18]:
import pandas as pd
from pathlib import Path

# Load the cleaned log table; the cells below group it by 'id' and read its
# 'text_change' and 'activity' columns.
df = pd.read_csv(Path("train_logs_clean.csv"))

def rebuild_text(grp):
    """Replay one group's logged events and return the resulting text.

    Parameters
    ----------
    grp : DataFrame or mapping
        Must expose ``'text_change'`` and ``'activity'`` as equally long
        iterables; rows are replayed in the order they appear.

    Returns
    -------
    str
        The concatenation of everything still in the buffer after replay.

    Notes
    -----
    * ``"Input"`` rows append their ``text_change``; the tokens ``"Space"``
      and ``"Enter"`` are translated to a blank and a newline.
    * ``"Remove/Cut"`` rows drop the most recently appended entry and are a
      no-op on an empty buffer.
    * Rows with any other activity are ignored.
    * A missing ``text_change`` (e.g. NaN produced by ``read_csv`` for an
      empty cell) is skipped, and any other non-string value is converted
      with ``str()``; otherwise the final ``join`` would raise ``TypeError``.
    * NOTE(review): a removal always drops exactly one buffer entry from the
      end, regardless of how much text was removed or where -- confirm this
      approximation is acceptable for the data at hand.
    """
    # Tokens that stand for a whitespace character rather than themselves.
    special = {"Space": ' ', "Enter": '\n'}

    buf = []
    for ch, act in zip(grp['text_change'], grp['activity']):
        if act == "Input":
            if pd.isna(ch):
                # Nothing usable was logged for this event.
                continue
            buf.append(special.get(ch, str(ch)))
        elif act == "Remove/Cut" and buf:
            buf.pop()
    return ''.join(buf)

# Rebuild one text per 'id'. Only the two columns rebuild_text reads are
# handed to apply(): this keeps the grouping column out of each group and
# avoids the DeprecationWarning pandas >= 2.2 emits when apply() operates on
# the grouping columns. The result has the same 'id' / 'text' columns.
texts = (
    df.groupby('id')[['text_change', 'activity']]
      .apply(rebuild_text)
      .reset_index(name='text')
)

print(texts.head(3))

         id                                               text
0  001519c8  qqqqqq qqq qqqqq qqqqqq qq qq qqqqq qq qqqq qq...
1  0022f953  qqqq qq qqqqqqqqqqq ? qq qq qqq qqq qqq, qqqqq...
2  0042269b  qqqqqqq qqq qqqq qqqq qqqq qq qqqqqqqq qqqqqqq...


  .apply(rebuild_text)


In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Character-level TF-IDF over the rebuilt texts: 3- to 5-character n-grams,
# at most 30,000 features, float32 values.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    max_features=30000,
    dtype=np.float32,
)

X_tfidf = vectorizer.fit_transform(texts['text'])

# Dense, labelled copy of the TF-IDF matrix (one column per n-gram).
tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out()
)
# Attach ids positionally: row i of X_tfidf was built from row i of `texts`.
# Passing the raw values (as the SVD frame does) avoids index alignment, so
# a non-default index on `texts` cannot silently produce NaN ids.
tfidf_df.insert(0, 'id', texts['id'].values)

print(tfidf_df.head())

from sklearn.decomposition import TruncatedSVD

# Number of SVD components kept from the TF-IDF matrix.
svdsize = 64

# Seeded so repeated runs give the same decomposition.
svd = TruncatedSVD(n_components=svdsize, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_tfidf)

# One zero-padded column name per component: tfidf_svd_00, tfidf_svd_01, ...
svd_df = pd.DataFrame(
    X_svd,
    columns=['tfidf_svd_' + str(k).zfill(2) for k in range(svdsize)],
)
# Ids are attached by position, matching the row order of `texts`.
svd_df.insert(loc=0, column='id', value=texts['id'].values)

print(svd_df.head())

         id  \n""  \n""   \n"" q  \n"q  \n"qq  \n"qqq  \n'q  \n'q   \n'q q  \
0  001519c8   0.0    0.0     0.0   0.0    0.0     0.0   0.0    0.0     0.0   
1  0022f953   0.0    0.0     0.0   0.0    0.0     0.0   0.0    0.0     0.0   
2  0042269b   0.0    0.0     0.0   0.0    0.0     0.0   0.0    0.0     0.0   
3  0059420b   0.0    0.0     0.0   0.0    0.0     0.0   0.0    0.0     0.0   
4  0075873a   0.0    0.0     0.0   0.0    0.0     0.0   0.0    0.0     0.0   

   ...  — qqq  —qq  —qq   —qq q  —qq,  —qq,   —qqq  —qqq   —qqqq  —qqq—  
0  ...    0.0  0.0   0.0    0.0   0.0    0.0   0.0    0.0    0.0    0.0  
1  ...    0.0  0.0   0.0    0.0   0.0    0.0   0.0    0.0    0.0    0.0  
2  ...    0.0  0.0   0.0    0.0   0.0    0.0   0.0    0.0    0.0    0.0  
3  ...    0.0  0.0   0.0    0.0   0.0    0.0   0.0    0.0    0.0    0.0  
4  ...    0.0  0.0   0.0    0.0   0.0    0.0   0.0    0.0    0.0    0.0  

[5 rows x 5107 columns]
         id  tfidf_svd_00  tfidf_svd_01  tfidf_svd_02  tfidf_s