In [51]:
import pandas as pd

# Load the training log and narrow it to the two columns this cell reads.
df = pd.read_csv("train_logs_clean.csv").loc[:, ['id', 'activity']]

print(df.head(3))

def rebuild_text(grp):
    """Collapse one group's activities into a string of their first characters.

    Parameters
    ----------
    grp : mapping or DataFrame
        Anything where ``grp['activity']`` yields an iterable of indexable
        values (strings in this notebook).

    Returns
    -------
    str
        The first character of every activity, concatenated in iteration
        order; ``""`` when there are no activities.
    """
    return "".join(activity[0] for activity in grp['activity'])

# Build one operation string per id.
# The 'activity' column is selected (as a one-column DataFrame) *before*
# apply so the grouping column 'id' is not handed to rebuild_text. Letting
# apply operate on grouping columns is deprecated in recent pandas and emits
# a DeprecationWarning; rebuild_text only reads grp['activity'], so the
# resulting strings are unchanged.
operations = (
    df.groupby('id')[['activity']]
        .apply(rebuild_text)
        .reset_index(name='operation')
)

print(operations.head(3))

         id       activity
0  001519c8  Nonproduction
1  001519c8  Nonproduction
2  001519c8  Nonproduction
         id                                          operation
0  001519c8  NNNIIIIIIIIIIIIIIIRIIIIIIIIIIIIIIRIIIIIIIIIIII...
1  0022f953  NNIIIIIIIINIIIIIIIIIIIRRRRRRRRRRRIIIIIIIIIIIII...
2  0042269b  NNIIIIIIIIIIIIIIIIIIIIIIIIIIIIIINIIIIIIIIIIIII...


  .apply(rebuild_text)


In [52]:
# TF-IDF + SVD for operation feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Character-level TF-IDF over the per-id operation strings: every 3- to
# 5-character n-gram is a candidate feature, with the vocabulary capped at
# 30,000 entries and the matrix stored as float32.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    max_features=30000,
    dtype=np.float32,
)

X_tfidf = vectorizer.fit_transform(operations['operation'])

# Densify into a DataFrame with one column per n-gram.
tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out()
)
# Insert the ids positionally (.values) instead of as a Series. A Series is
# index-aligned against tfidf_df's fresh RangeIndex on insert, which would
# silently produce NaN ids if `operations` ever carried a non-default index.
# This also matches how the SVD frames attach their ids.
tfidf_df.insert(0, 'id', operations['id'].values)

print(tfidf_df.head())

from sklearn.decomposition import TruncatedSVD

# Number of SVD components kept for each id.
svdsize = 64

# Reduce the sparse TF-IDF matrix to `svdsize` dense components; the fixed
# random_state keeps the decomposition repeatable between runs.
svd = TruncatedSVD(n_components=svdsize, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_tfidf)

# Component columns are labelled with zero-padded indices: "00", "01", ...
svd_df = pd.DataFrame(X_svd, columns=[str(i).zfill(2) for i in range(svdsize)])
svd_df.insert(0, 'id', operations['id'].values)

print(svd_df.head())
svd_df.to_csv("operations_tfidf_svd_vectors.csv", index=False)

         id       iii      iiii     iiiii     iiiin  iiiip     iiiir  \
0  001519c8  0.605252  0.567680  0.533865  0.008549    0.0  0.025306   
1  0022f953  0.604434  0.569855  0.537745  0.012012    0.0  0.020137   
2  0042269b  0.596965  0.574090  0.551953  0.009420    0.0  0.012744   
3  0059420b  0.607514  0.572120  0.537782  0.005818    0.0  0.028032   
4  0075873a  0.583251  0.561007  0.539744  0.011136    0.0  0.010153   

       iiin     iiini  iiinm  ...     rrrnr  rrrp  rrrpi  rrrpn  rrrpr  \
0  0.011281  0.002406    0.0  ...  0.000000   0.0    0.0    0.0    0.0   
1  0.012713  0.007100    0.0  ...  0.000000   0.0    0.0    0.0    0.0   
2  0.009601  0.004454    0.0  ...  0.001269   0.0    0.0    0.0    0.0   
3  0.005816  0.003189    0.0  ...  0.000000   0.0    0.0    0.0    0.0   
4  0.011786  0.005595    0.0  ...  0.000000   0.0    0.0    0.0    0.0   

       rrrr     rrrri     rrrrn  rrrrp     rrrrr  
0  0.070962  0.009974  0.001926    0.0  0.059571  
1  0.034351  0.00532

In [56]:
# Load the test log and narrow it to the two columns this cell reads.
df = pd.read_csv("test_logs_clean.csv").loc[:, ['id', 'activity']]

print(df.head(3))

def rebuild_text(grp):
    """Collapse one group's activities into a string of their first characters.

    Parameters
    ----------
    grp : mapping or DataFrame
        Anything where ``grp['activity']`` yields an iterable of indexable
        values (strings in this notebook).

    Returns
    -------
    str
        The first character of every activity, concatenated in iteration
        order; ``""`` when there are no activities.
    """
    return "".join(activity[0] for activity in grp['activity'])

# Build one operation string per id for the test log.
# The 'activity' column is selected (as a one-column DataFrame) *before*
# apply so the grouping column 'id' is not handed to rebuild_text. Letting
# apply operate on grouping columns is deprecated in recent pandas and emits
# a DeprecationWarning; rebuild_text only reads grp['activity'], so the
# resulting strings are unchanged.
operations = (
    df.groupby('id')[['activity']]
        .apply(rebuild_text)
        .reset_index(name='operation')
)

print(operations.head(3))

# Project the test strings through the already-fitted `vectorizer` and `svd`
# (transform only; nothing is re-fitted on the test data).
test_tfidf = vectorizer.transform(operations['operation'])
test_svd = svd.transform(test_tfidf)

# Component columns are labelled with zero-padded indices: "00", "01", ...
test_svd_df = pd.DataFrame(test_svd, columns=[str(i).zfill(2) for i in range(svdsize)])
test_svd_df.insert(0, 'id', operations['id'].values)

test_svd_df.to_csv("test_operations_tfidf_svd_vectors.csv", index=False)

print(test_svd_df.head(3))

         id activity
0  0000aaaa    Input
1  0000aaaa    Input
2  2222bbbb    Input
         id operation
0  0000aaaa        II
1  2222bbbb        II
2  4444cccc        II
         id   00   01   02   03   04   05   06   07   08  ...   54   55   56  \
0  0000aaaa  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
1  2222bbbb  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2  4444cccc  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

    57   58   59   60   61   62   63  
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[3 rows x 65 columns]


  .apply(rebuild_text)
