In [1]:
import pandas as pd

# Load the training logs. Only 'id' and 'activity' are used downstream, so
# ask read_csv for just those two columns instead of parsing the whole file
# and discarding the rest afterwards.
df = pd.read_csv("train_logs_clean.csv", usecols=['id', 'activity'])

# usecols preserves the file's own column order; re-select to pin it to
# (id, activity) exactly as before.
df = df[['id', 'activity']]

print(df.head(3))

def rebuild_text(grp):
    """Collapse a group's activity labels into one compact string.

    Walks ``grp['activity']`` in order, takes the first character of every
    label and concatenates them (e.g. "Nonproduction", "Input" -> "NI").
    Labels that begin with the same character map to the same symbol.

    Parameters
    ----------
    grp : object indexable by ``'activity'``
        For example the per-id DataFrame passed in by ``groupby(...).apply``.

    Returns
    -------
    str
        One character per activity entry; ``""`` when there are none.
    """
    return "".join(label[0] for label in grp['activity'])

# Build one operation string per id.
# Selecting [['activity']] after the groupby keeps the grouping column out of
# the frames handed to rebuild_text. This avoids the DeprecationWarning pandas
# raises when DataFrameGroupBy.apply operates on the grouping columns, and it
# keeps working once pandas excludes them by default. rebuild_text only reads
# 'activity', so the resulting strings are unchanged.
operations = (
    df.groupby('id')[['activity']]
        .apply(rebuild_text)
        .reset_index(name='operation')
)

print(operations.head(3))

         id       activity
0  001519c8  Nonproduction
1  001519c8  Nonproduction
2  001519c8  Nonproduction
         id                                          operation
0  001519c8  NNNIIIIIIIIIIIIIIIRIIIIIIIIIIIIIIRIIIIIIIIIIII...
1  0022f953  NNIIIIIIIINIIIIIIIIIIIRRRRRRRRRRRIIIIIIIIIIIII...
2  0042269b  NNIIIIIIIIIIIIIIIIIIIIIIIIIIIIIINIIIIIIIIIIIII...


  .apply(rebuild_text)


In [2]:
# TF-IDF + SVD for operation feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Character n-gram TF-IDF over the per-id operation strings.
vectorizer = TfidfVectorizer(
    analyzer='char',      # build n-grams from characters
    ngram_range=(1, 5),   # n-gram lengths 1 through 5
    max_features=30000,   # upper bound on the vocabulary size
    dtype=np.float32,
)

X_tfidf = vectorizer.fit_transform(operations['operation'])

# Dense copy of the TF-IDF matrix with one column per n-gram, for inspection.
tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out()
)
# Attach ids positionally (.values) rather than through index alignment, the
# same way the SVD frames in this notebook attach their id column. A
# non-default index on `operations` can then never produce NaN ids.
tfidf_df.insert(0, 'id', operations['id'].values)

print(tfidf_df.head())

from sklearn.decomposition import TruncatedSVD

# Number of SVD components; also used to label the output columns.
svdsize = 64

# Reduce the TF-IDF matrix to `svdsize` dense components (seeded for repeatability).
svd = TruncatedSVD(n_components=svdsize, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_tfidf)

# Columns are the zero-padded component indices: '00', '01', ...
svd_df = pd.DataFrame(X_svd, columns=[f'{i:02d}' for i in range(svdsize)])
svd_df.insert(loc=0, column='id', value=operations['id'].values)

print(svd_df.head())
svd_df.to_csv("operations_tfidf_svd_vectors.csv", index=False)

         id         i        ii       iii      iiii     iiiii     iiiin  \
0  001519c8  0.494963  0.464428  0.436356  0.409268  0.384889  0.006164   
1  0022f953  0.495473  0.464283  0.437949  0.412895  0.389629  0.008703   
2  0042269b  0.480796  0.460415  0.442633  0.425672  0.409258  0.006984   
3  0059420b  0.498737  0.468140  0.439837  0.414212  0.389351  0.004212   
4  0075873a  0.470123  0.450030  0.431632  0.415171  0.399435  0.008241   

   iiiip     iiiir      iiin  ...     rrrnr  rrrp  rrrpi  rrrpn  rrrpr  \
0    0.0  0.018245  0.008133  ...  0.000000   0.0    0.0    0.0    0.0   
1    0.0  0.014590  0.009211  ...  0.000000   0.0    0.0    0.0    0.0   
2    0.0  0.009450  0.007119  ...  0.000941   0.0    0.0    0.0    0.0   
3    0.0  0.020295  0.004211  ...  0.000000   0.0    0.0    0.0    0.0   
4    0.0  0.007514  0.008722  ...  0.000000   0.0    0.0    0.0    0.0   

       rrrr     rrrri     rrrrn  rrrrp     rrrrr  
0  0.051160  0.007191  0.001389    0.0  0.042947  
1 

In [3]:
# Load the test logs. Only 'id' and 'activity' are used below, so read just
# those two columns instead of parsing the whole file and dropping the rest.
df = pd.read_csv("test_logs_clean.csv", usecols=['id', 'activity'])

# usecols preserves the file's own column order; re-select to pin it to
# (id, activity) exactly as before.
df = df[['id', 'activity']]

print(df.head(3))

def rebuild_text(grp):
    """Collapse a group's activity labels into one compact string.

    Walks ``grp['activity']`` in order, takes the first character of every
    label and concatenates them (e.g. "Nonproduction", "Input" -> "NI").
    Labels that begin with the same character map to the same symbol.

    NOTE(review): the same function is also defined in the first cell of this
    notebook; consider keeping a single definition.

    Parameters
    ----------
    grp : object indexable by ``'activity'``
        For example the per-id DataFrame passed in by ``groupby(...).apply``.

    Returns
    -------
    str
        One character per activity entry; ``""`` when there are none.
    """
    return "".join(label[0] for label in grp['activity'])

# Build one operation string per test id.
# Selecting [['activity']] after the groupby keeps the grouping column out of
# the frames handed to rebuild_text. This avoids the DeprecationWarning pandas
# raises when DataFrameGroupBy.apply operates on the grouping columns, and it
# keeps working once pandas excludes them by default. rebuild_text only reads
# 'activity', so the resulting strings are unchanged.
operations = (
    df.groupby('id')[['activity']]
        .apply(rebuild_text)
        .reset_index(name='operation')
)

print(operations.head(3))

# Project the test strings with the vectorizer and SVD fitted earlier in the
# notebook (transform only; nothing is refitted here).
test_tfidf = vectorizer.transform(operations['operation'])
test_svd = svd.transform(test_tfidf)

# Same zero-padded component labels as the training vectors: '00', '01', ...
test_svd_df = pd.DataFrame(test_svd, columns=[f'{i:02d}' for i in range(svdsize)])
test_svd_df.insert(loc=0, column='id', value=operations['id'].values)

test_svd_df.to_csv("test_operations_tfidf_svd_vectors.csv", index=False)

print(test_svd_df.head(3))

         id activity
0  0000aaaa    Input
1  0000aaaa    Input
2  2222bbbb    Input
         id operation
0  0000aaaa        II
1  2222bbbb        II
2  4444cccc        II
         id       00        01        02        03        04        05  \
0  0000aaaa  0.64088 -0.042732  0.037049  0.425723  0.218728  0.034862   
1  2222bbbb  0.64088 -0.042732  0.037049  0.425723  0.218728  0.034862   
2  4444cccc  0.64088 -0.042732  0.037049  0.425723  0.218728  0.034862   

         06       07        08  ...       54        55        56        57  \
0  0.129818  0.09343 -0.049924  ...  0.01479  0.009482  0.003007 -0.002232   
1  0.129818  0.09343 -0.049924  ...  0.01479  0.009482  0.003007 -0.002232   
2  0.129818  0.09343 -0.049924  ...  0.01479  0.009482  0.003007 -0.002232   

         58        59        60        61        62        63  
0 -0.004541 -0.004171  0.010524 -0.002097  0.002967  0.010363  
1 -0.004541 -0.004171  0.010524 -0.002097  0.002967  0.010363  
2 -0.004541 -0.004171  0.0

  .apply(rebuild_text)
