In [1]:
from keystrokes.utils.path_utils import DATA_PREPROCESSED_FOLDER, ARTIFACTS_FOLDER
import pandas as pd
import numpy as np
from keystrokes.transformers.transpose_transformer import TransposeTransformer
from sklearn.utils import shuffle

In [2]:
def load_examples(example_type, set_type="train_set"):
    """Load every feature CSV of one example class into a list of DataFrames.

    Parameters
    ----------
    example_type : str
        Sub-folder name under ``<set_type>/features`` (this notebook passes
        ``'positive'`` and ``'negative'``).
    set_type : str, default "train_set"
        Folder directly under ``DATA_PREPROCESSED_FOLDER``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per ``*.csv`` file, ordered by file path. Empty when no CSV
        file is found.
    """
    folder_path = DATA_PREPROCESSED_FOLDER / set_type / "features" / example_type
    # glob yields entries in arbitrary, filesystem-dependent order; sorting
    # makes the returned list (and any seeded shuffle of it) reproducible.
    csv_files = sorted(folder_path.glob('*.csv'))
    return [pd.read_csv(csv_file) for csv_file in csv_files]

In [3]:
def get_Xy():
    """Return all training examples with their binary labels.

    Returns
    -------
    tuple
        ``(frames, targets)``: the negative examples followed by the positive
        ones, and a parallel list holding 0 for each negative and 1 for each
        positive example.
    """
    pos_frames = load_examples('positive')
    neg_frames = load_examples('negative')

    # Negatives come first in both lists so that index i of one matches index i of the other.
    frames = [*neg_frames, *pos_frames]
    targets = [0 for _ in neg_frames] + [1 for _ in pos_frames]
    return frames, targets

In [4]:
# Load the labelled examples, then shuffle examples and labels in unison
# (get_Xy returns all negatives before all positives). The fixed seed keeps
# the shuffled order identical across kernel restarts.
SHUFFLE_SEED = 42
dfs, labels = get_Xy()
dfs, labels = shuffle(dfs, labels, random_state=SHUFFLE_SEED)

In [22]:
batch_size = 1000
matrices_folder = DATA_PREPROCESSED_FOLDER / "train_set" / "matrices"
# Create the output folder up front so the first to_csv call cannot fail
# because the directory is missing.
matrices_folder.mkdir(parents=True, exist_ok=True)

# Stop at len(labels), not len(labels) + 1: with the extra +1, an example
# count that is an exact multiple of batch_size produced one final empty batch.
for batch_id in range(0, len(labels), batch_size):
    # A fresh transformer is fitted on every batch.
    # NOTE(review): presumably each fit picks its own top columns, so column
    # sets can differ between batch files - confirm in TransposeTransformer.
    transpose_transformer = TransposeTransformer(top_columns=2000)
    print(batch_id, batch_id + batch_size)
    X = transpose_transformer.fit_transform(dfs[batch_id : batch_id + batch_size])
    y = labels[batch_id : batch_id + batch_size]
    # Store the labels next to the features so each batch file is self-contained.
    X = X.assign(labels=y)
    X.to_csv(matrices_folder / f"batch_{batch_id}", index=False)

0 1000
1000 2000
2000 3000
3000 4000
4000 5000
5000 6000
6000 7000
7000 8000
8000 9000
9000 10000
10000 11000
11000 12000
12000 13000
13000 14000
14000 15000
15000 16000
16000 17000
17000 18000
18000 19000
19000 20000
20000 21000
21000 22000
22000 23000
23000 24000
24000 25000
25000 26000
26000 27000
27000 28000
28000 29000
29000 30000
30000 31000
31000 32000
32000 33000
33000 34000
34000 35000
35000 36000
36000 37000
37000 38000
38000 39000
39000 40000
40000 41000
41000 42000
42000 43000
43000 44000
44000 45000
45000 46000
46000 47000
47000 48000
48000 49000
49000 50000
50000 51000
51000 52000
52000 53000
53000 54000
54000 55000
55000 56000
56000 57000
57000 58000
58000 59000
59000 60000
60000 61000


In [2]:
# Read every batch file from train_set/matrices. The folder is listed in
# sorted order so the row order of the combined matrix is reproducible, and
# non-file entries (e.g. sub-folders) are skipped because read_csv cannot
# open them.
matrices_folder = DATA_PREPROCESSED_FOLDER / "train_set" / "matrices"
all_dfs = [pd.read_csv(f) for f in sorted(matrices_folder.glob('*')) if f.is_file()]

In [3]:
# Stack the per-batch frames row-wise. Each frame was read with a default
# 0-based index, so a plain concat would repeat index labels across batches;
# ignore_index=True gives the combined frame a unique 0..n-1 index.
# Columns missing from a batch are filled with NaN by concat's outer join.
all_dfs = pd.concat(all_dfs, axis=0, ignore_index=True)

In [4]:
# Number of rows in the combined frame (one row per example across all batch files).
len(all_dfs)

60024

In [5]:
# Separate the target column from the feature columns.
label_column = 'labels'
labels = all_dfs[label_column]
all_dfs = all_dfs.drop(label_column, axis=1)

In [6]:
# Rank the feature columns by their total over all rows and keep the names of
# the 1000 columns with the largest totals.
column_totals = all_dfs.sum()
top_1000_features = column_totals.nlargest(1000).index.tolist()

In [7]:
# Persist the selected feature names, one per row under the FeatName header.
# The file is sorted alphabetically, so its order differs from the order of
# top_1000_features itself.
top_features_frame = pd.DataFrame(top_1000_features, columns=["FeatName"])
top_features_frame = top_features_frame.sort_values(by="FeatName").reset_index(drop=True)
top_features_frame.to_csv(
    DATA_PREPROCESSED_FOLDER / "train_set" / 'top_features.csv', index=False
)

In [8]:
# Training matrix: all rows, restricted to the selected columns in the order
# of top_1000_features.
X = all_dfs.loc[:, top_1000_features]

In [9]:
# (rows, columns) of the training matrix; the column count should equal len(top_1000_features).
X.shape

(60024, 1000)

In [10]:
# Length of the label vector; it must match the number of rows in X.
labels.shape

(60024,)

In [11]:
from xgboost import XGBClassifier

In [12]:
# Fit an XGBoost classifier on the selected features, evaluated with log loss.
xgb_params = {"use_label_encoder": False, "eval_metric": "logloss"}
model = XGBClassifier(**xgb_params)
model.fit(X, labels)



In [13]:
# Build the inference-time transformer without calling fit: the column list
# the model was trained on is assigned directly to selected_columns_.
# NOTE(review): presumably transform() reads selected_columns_ to choose and
# order its output columns - confirm in TransposeTransformer.
transpose_transformer = TransposeTransformer(top_columns=1000)
transpose_transformer.selected_columns_ = top_1000_features

In [14]:
from sklearn.pipeline import Pipeline

In [15]:

# Chain the pre-configured transformer and the trained classifier so that one
# predict call goes from per-example frames to labels.
inference_steps = [
    ('transpose', transpose_transformer),
    ('xgboost', model),
]
transpose_and_predict_pipeline = Pipeline(inference_steps)

In [16]:
from keystrokes.pipelines.feature_pipeline import preprocessing_pipeline
# End-to-end pipeline: the project's preprocessing stage followed by the
# transpose-and-predict stage.
full_steps = [
    ('preprocessing', preprocessing_pipeline),
    ('transpose_and_predict', transpose_and_predict_pipeline),
]
pipeline = Pipeline(steps=full_steps)

In [17]:
from joblib import dump, load

# Persist the end-to-end pipeline (preprocessing + transpose + classifier).
pipeline_path = ARTIFACTS_FOLDER / 'pipeline.joblib'
dump(pipeline, pipeline_path)


['/home/tarek/.keystrokes/data/artifacts/pipeline.joblib']

In [18]:
# Persist only the transpose + classifier stage, without the preprocessing step.
transpose_and_predict_path = ARTIFACTS_FOLDER / 'transpose_and_predict_pipeline.joblib'
dump(transpose_and_predict_pipeline, transpose_and_predict_path)

['/home/tarek/.keystrokes/data/artifacts/transpose_and_predict_pipeline.joblib']

# Test pipeline

In [19]:
from keystrokes.features.example_creation import ExampleCreator

In [23]:
# Example generator used to build the test pairs.
# NOTE(review): the exact meaning of the two sampling indices is defined by
# ExampleCreator - confirm there before changing them.
sampling_window = {"sampling_start_index": 10000, "sampling_end_index": 20000}
ec = ExampleCreator(**sampling_window)

In [30]:
from joblib import Parallel, delayed

def process_user(user_id):
    """Create the positive and negative examples for one user.

    Uses the module-level ``ec`` example creator.

    Returns
    -------
    tuple
        ``(positives, negatives)`` exactly as returned by
        ``ec.create_positive_examples`` and ``ec.create_negative_examples``.
    """
    # Positives are generated before negatives, in that order.
    positives = ec.create_positive_examples(user_id)
    negatives = ec.create_negative_examples(user_id)
    return (positives, negatives)

# Generate examples for users 10000-10999 in parallel on all available cores.
user_ids = range(10000, 11000)
jobs = (delayed(process_user)(user_id) for user_id in user_ids)
results = Parallel(n_jobs=-1, verbose=1)(jobs)

# Flatten the per-user (positives, negatives) tuples into two flat lists.
all_p = [example for positives, _ in results for example in positives]
all_n = [example for _, negatives in results for example in negatives]


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   25.6s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 12.7min finished


In [33]:
# Label positives 1 and negatives 0, then shuffle pairs and labels in unison.
# The fixed random_state keeps the evaluation order reproducible across runs.
all_pairs, y_test = shuffle(
    all_p + all_n,
    [1] * len(all_p) + [0] * len(all_n),
    random_state=42,
)

In [36]:
# Run the end-to-end pipeline on the test pairs to obtain predicted labels.
# NOTE: this cell does not compare y_pred with y_test, so no accuracy is computed here.
y_pred = pipeline.predict(all_pairs)
# Disabled: calling model directly would feed it the raw pairs, skipping the
# pipeline's preprocessing and transpose steps that produce its feature columns.
# y_pred_proba = model.predict_proba(all_pairs)[:,1]

KeyboardInterrupt: 

In [25]:
# Scratch value for the list-append check in the following cells.
test2 = list((5, 6))

In [26]:
# `test` is not defined in any visible cell, so this cell raised NameError on
# a fresh kernel. Recreate the starting list first; the recorded output
# [1, 2, 3, [5, 6]] implies it was [1, 2, 3] before the append.
test = [1, 2, 3]
# append adds test2 as one nested element (extend would splice its items in).
test.append(test2)

In [27]:
# Display the list: append added test2 as a single nested element.
test

[1, 2, 3, [5, 6]]