In [25]:
import pandas as pd

# Load the cleaned training log. Only the id and the activity label are used
# below, so restrict parsing to those two columns instead of reading every
# column of the file and discarding the rest afterwards.
df = pd.read_csv("../data/train_logs_clean.csv", usecols=['id', 'activity'])

# usecols keeps the file's own column order; re-select to pin it to id, activity.
df = df[['id', 'activity']]

print(df.head(3))

def rebuild_text(grp):
    """Collapse one group's activity labels into a string of their initials.

    Each value of ``grp['activity']`` contributes its first character, in row
    order (e.g. ``'Input'`` contributes ``'I'``).

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows of a single group; must contain an ``'activity'`` column.

    Returns
    -------
    str
        The concatenated first characters. Missing values (NaN/None) and empty
        labels contribute nothing instead of raising.
    """
    # dropna() guards against NaN/None cells, which are not subscriptable.
    labels = grp['activity'].dropna()
    # Slicing with [:1] (rather than [0]) tolerates empty strings.
    return "".join(str(op)[:1] for op in labels)

# Build one row per id whose 'operation' value is the string returned by
# rebuild_text for that id's rows.
# Selecting [['activity']] before apply() hands rebuild_text only the column it
# reads and keeps the grouping column out of the applied frame; applying over
# the grouping column is deprecated in pandas >= 2.2 and emits a warning.
operations = (
    df.groupby('id')[['activity']]
        .apply(rebuild_text)
        .reset_index(name='operation')
)

print(operations.head(3))

         id activity
0  001519c8    Input
1  001519c8    Input
2  001519c8    Input
         id                                          operation
0  001519c8  IIIIIIIIIIIIIIIRIIIIIIIIIIIIIIRIIIIIIIIIIIIIII...
1  0022f953  IIIIIIIIIIIIIIIIIIIRRRRRRRRRRRIIIIIIIIIIIIIIRR...
2  0042269b  IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...


  .apply(rebuild_text)


In [None]:
# TF-IDF + SVD for operation feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle

from sklearn.decomposition import TruncatedSVD

# Character-level TF-IDF over each id's operation string: n-grams of 1 to 5
# characters, vocabulary capped at 30000 features, stored as float32.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 5),
    max_features=30000,
    dtype=np.float32,
)

X_tfidf = vectorizer.fit_transform(operations['operation'])

# Dense copy of the TF-IDF matrix with the n-grams as column names.
# NOTE(review): nothing in the visible cells reads tfidf_df afterwards; if no
# later cell needs it, dropping it avoids materialising the dense matrix.
tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out()
)
# Attach ids positionally (.values), the same way svd_df does below, so the
# result does not depend on the index of `operations`.
tfidf_df.insert(0, 'id', operations['id'].values)

n_features = X_tfidf.shape[1]
# Use at most 64 components and stay below the vocabulary size, but never
# request fewer than 1 component (a single-feature vocabulary would otherwise
# yield n_components=0).
svdsize = max(1, min(64, n_features - 1))

svd = TruncatedSVD(
    n_components=svdsize,
    random_state=42,
    n_iter=7
)
X_svd = svd.fit_transform(X_tfidf)

# Component columns are named '00', '01', ... with the id as the first column.
svd_df = pd.DataFrame(
    X_svd,
    columns=[f'{i:02d}' for i in range(svdsize)]
)
svd_df.insert(0, 'id', operations['id'].values)

print(svd_df.head())
svd_df.to_csv("../data/train_tfidf_operation.csv", index=False)

# Persist the fitted transformers so the test cell can reuse the exact same
# vocabulary and projection.
with open('../data/operation_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

with open('../data/operation_svd.pkl', 'wb') as f:
    pickle.dump(svd, f)

         id         i        ii       iii      iiii     iiiii     iiiim  \
0  001519c8  0.486461  0.461533  0.437815  0.415065  0.393284  0.002867   
1  0022f953  0.482611  0.462191  0.442518  0.423841  0.406658  0.000000   
2  0042269b  0.468981  0.455906  0.444031  0.432290  0.420949  0.000000   
3  0059420b  0.490705  0.465869  0.442162  0.419959  0.398510  0.000000   
4  0075873a  0.460274  0.446528  0.433492  0.421405  0.409554  0.000000   

      iiiip     iiiir      iiim  ...  rrrmi  rrrp  rrrpi  rrrpp  rrrpr  \
0  0.000000  0.021298  0.002841  ...    0.0   0.0    0.0    0.0    0.0   
1  0.000969  0.016934  0.000000  ...    0.0   0.0    0.0    0.0    0.0   
2  0.000000  0.011208  0.000000  ...    0.0   0.0    0.0    0.0    0.0   
3  0.000000  0.021073  0.000000  ...    0.0   0.0    0.0    0.0    0.0   
4  0.000000  0.011851  0.000000  ...    0.0   0.0    0.0    0.0    0.0   

       rrrr     rrrri  rrrrm  rrrrp     rrrrr  
0  0.051010  0.008019    0.0    0.0  0.043151  
1  0.024

In [28]:
# Load only the two columns used below instead of parsing the whole test log.
test_df = pd.read_csv("../data/test_logs_clean.csv", usecols=['id', 'activity'])
# usecols keeps the file's own column order; re-select to pin it to id, activity.
test_df = test_df[['id', 'activity']]


# One row per test id with its operation string. Selecting [['activity']] keeps
# the grouping column out of the frame passed to rebuild_text; applying over the
# grouping column is deprecated in pandas >= 2.2 and emits a warning.
test_operations = (
    test_df.groupby('id')[['activity']]
        .apply(rebuild_text)
        .reset_index(name='operation')
)

# CAUTION: pickle.load can execute arbitrary code. Only load artifacts that
# were produced by this project's own training cell.
with open('../data/operation_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

# transform (not fit_transform): reuse the vocabulary fitted on the training data.
X_test_tfidf = vectorizer.transform(test_operations['operation'])

with open('../data/operation_svd.pkl', 'rb') as f:
    svd = pickle.load(f)
X_test_svd = svd.transform(X_test_tfidf)
# Number of output columns is taken from the projected matrix itself.
svdsize = X_test_svd.shape[1]

# Same '00', '01', ... column naming as the training feature file.
test_svd_df = pd.DataFrame(
    X_test_svd,
    columns=[f'{i:02d}' for i in range(svdsize)]
)
test_svd_df.insert(0, 'id', test_operations['id'].values)
test_svd_df.to_csv("../data/test_tfidf_operation.csv", index=False)

  .apply(rebuild_text)
