# Model Training

We will train a small XGBoost model to demonstrate the training and prediction steps.

### Generate train and test data sets


In [None]:
from keystrokes.features.example_creation import ExampleCreator
from keystrokes.features.example_generation import ExampleGenerator

# Training set: the creator is configured with sampling indices 10000-20000
# and the generator is asked for num_users=50 with first_user_id=10000.
# NOTE(review): the exact meaning of the sampling indices and user ids is
# defined inside the keystrokes package, not in this notebook — confirm there
# before changing these values.
ec = ExampleCreator(sampling_start_index=10000, sampling_end_index=20000)
eg = ExampleGenerator(creator=ec, num_users=50, first_user_id=10000)
keystrokes_train, labels_train = eg.generate()
# Test set: the SAME creator object is re-pointed at indices 0-10000 and
# wrapped in a new generator (num_users=50, first_user_id=0). Statement order
# matters here: `ec` is reused, so presumably set_sampling_indices() changes
# it in place and must only be called once the training examples exist.
ec.set_sampling_indices(0, 10000)
eg = ExampleGenerator(creator=ec, num_users=50, first_user_id=0)
keystrokes_test, labels_test = eg.generate()

In [8]:
# Take a peek at the first example of the train set.
# NOTE(review): judging by the saved output, one example is a list of
# DataFrames of keystroke events (PRESS_TIME, RELEASE_TIME, KEYCODE, ...)
# rather than a single row.
keystrokes_train[0]

[     PARTICIPANT_ID  TEST_SECTION_ID     PRESS_TIME   RELEASE_TIME  KEYCODE
 0             30536           327122  1472282754185  1472282754320       16
 1             30536           327122  1472282754313  1472282754353       82
 2             30536           327122  1472282754649  1472282754720        8
 3             30536           327122  1472282754768  1472282754849        8
 4             30536           327122  1472282755048  1472282755176       16
 ..              ...              ...            ...            ...      ...
 728           30536           327263  1472282977448  1472282977537       79
 729           30536           327263  1472282977593  1472282977721       82
 730           30536           327263  1472282977696  1472282977760       69
 731           30536           327263  1472282977921  1472282978000       83
 732           30536           327263  1472282978121  1472282978193      190
 
 [230 rows x 5 columns],
      PARTICIPANT_ID  TEST_SECTION_ID     PRESS_T

## Train a model


In [9]:
from keystrokes.pipelines.feature_pipeline import FeaturePipeline
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

# Two-stage estimator: the project's feature pipeline feeds an XGBoost
# classifier, and both stages are fitted together on the training examples.
feature_pipeline = FeaturePipeline(top_columns=50)
model = XGBClassifier(eval_metric="logloss")
pipeline = Pipeline(
    steps=[
        ("feature_pipeline", feature_pipeline),
        ("model", model),
    ]
)
pipeline.fit(keystrokes_train, labels_train)

### Keycodes we care about

In [21]:
# Show the first three entries of `selected_columns_` on the feature
# pipeline's 'transpose' step.
# NOTE(review): presumably these are the feature columns retained by
# top_columns=50 during fitting — confirm against FeaturePipeline.
feature_pipeline.named_steps['transpose'].selected_columns_[:3]

['KEYCODES_190_16_ABS_DIFF_PRESS_PRESS_TIME',
 'KEYCODES_32_87_ABS_DIFF_PRESS_PRESS_TIME',
 'KEYCODES_32_70_ABS_DIFF_PRESS_PRESS_TIME']

### Demo Prediction

In [27]:
# Pick one test example, look up its ground-truth label, and compare it with
# the label predicted by the fitted pipeline.
ind = 1
true_label = labels_test[ind]
# Slice instead of indexing so predict() receives a sequence holding a single
# example; it returns one prediction per example, hence the [0] below.
predicted_label = pipeline.predict(keystrokes_test[ind:ind + 1])
print(f"Predicted Label: {predicted_label[0]}\n True Label:{true_label}")

Predicted Label: 0
 True Label:0
