In [6]:
# Sanity check: print the column index of the imputed dataset.
# NOTE(review): relies on `imputed_data` already existing in the kernel from an
# earlier cell; the next cell re-reads it from CSV — confirm the run order.
print(imputed_data.columns)


Index(['id', 'key', 'action', 'rhythm', 'dwell_time', 'flight_time',
       'up_down_time', 'session_duration', 'user_id', 'timestamp'],
      dtype='object')


In [7]:
import pandas as pd
import numpy as np

# Steps 1-2. Read the previously saved imputed dataset and tag every row as
# genuine: all loaded samples are assumed legitimate, so `label` is 0 throughout.
imputed_data = (
    pd.read_csv('Dataset_TZ_KM/imputed_keystroke_data.csv')
    .assign(label=0)
)

# 3. Generate keystroke attack data
def generate_keystroke_attack_data(genuine_data, seed=None, scale_range=(0.5, 1.5),
                                   columns=('dwell_time', 'flight_time')):
    """Simulate attack keystrokes by randomly rescaling timing columns of genuine data.

    Every value in each of ``columns`` is multiplied by an independent factor drawn
    uniformly from ``scale_range``; all other columns are copied unchanged and the
    ``label`` column is set to 1 on every row. The input frame is never modified.

    Parameters
    ----------
    genuine_data : pandas.DataFrame
        Genuine keystroke rows; must contain every column named in ``columns``.
    seed : int, numpy.random.Generator or None, default None
        Source of randomness. ``None`` draws from NumPy's global RNG (the original
        behaviour, so ``np.random.seed`` still controls it); an int or Generator
        makes the call reproducible on its own without touching global state.
    scale_range : tuple of (float, float), default (0.5, 1.5)
        Lower and upper bound of the multiplicative noise.
    columns : iterable of str, default ('dwell_time', 'flight_time')
        Timing columns to perturb, processed in the given order.

    Returns
    -------
    pandas.DataFrame
        A perturbed copy of ``genuine_data`` with ``label`` equal to 1.

    Raises
    ------
    KeyError
        If a column named in ``columns`` is missing from ``genuine_data``.
    """
    # Work on a copy so the caller's genuine data keeps its values and label.
    attack_data = genuine_data.copy()

    low, high = scale_range
    # The legacy module and a Generator both expose uniform(low, high, size=...).
    rng = np.random if seed is None else np.random.default_rng(seed)

    # One independent factor per row and per column, drawn in column order so the
    # default call consumes the global RNG exactly as the two original lines did.
    for column in columns:
        factors = rng.uniform(low, high, size=len(attack_data))
        attack_data[column] = attack_data[column] * factors

    # Label the attack data as '1' (attack)
    attack_data['label'] = 1

    return attack_data

# Generate attack data.
# Seed NumPy's global RNG first so the simulated attack rows (and the CSV written
# below) come out the same on every Restart & Run All; 42 matches the random_state
# used when the combined dataset is shuffled.
np.random.seed(42)
attack_data = generate_keystroke_attack_data(imputed_data)

# 4. Display the first few rows of the generated attack data
print("Generated Attack Data (First 5 rows):")
print(attack_data.head())

# Save the generated attack data
attack_data.to_csv('keystroke_attack_data.csv', index=False)


Generated Attack Data (First 5 rows):
       id        key   action  rhythm  dwell_time  flight_time  up_down_time  \
0  3067.0          1  release     0.0    0.105190     2.532167      1.788202   
1  3068.0  Key.enter  release     0.0    0.093136     0.000000      0.000000   
2  3069.0          b  release     0.0    0.048118     0.085511      0.108629   
3  3070.0          z  release     0.0    0.105374     0.022252      0.019156   
4  3071.0          t  release     0.0    0.084559     0.141724      0.154120   

   session_duration  user_id                      timestamp  label  
0         13.830218      1.0  2025-01-15 10:08:05.783485+00      1  
1         13.830218      1.0  2025-01-15 10:08:05.793162+00      1  
2         13.830218      1.0  2025-01-15 10:25:53.170786+00      1  
3         13.830218      1.0  2025-01-15 10:25:53.182156+00      1  
4         13.830218      1.0   2025-01-15 10:25:53.18817+00      1  


In [8]:
# Stack genuine and attack rows, then shuffle them with a fixed seed so the two
# classes are interleaved; the old index is discarded in favour of a fresh 0..n-1 one.
combined_data = (
    pd.concat([imputed_data, attack_data])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Persist the mixed dataset for later steps
combined_data.to_csv('combined_keystroke_data.csv', index=False)

# Preview the first rows of the mixed dataset
print("Combined Data (First 5 rows):")
print(combined_data.head())


Combined Data (First 5 rows):
        id        key   action  rhythm  dwell_time  flight_time  up_down_time  \
0  13533.0          6    press   0.088    0.088000     0.000000         0.000   
1  17984.0  Backspace    press   0.000    0.000000     3.632000         0.037   
2  22451.0          0  release   0.085    0.085000     0.000000         0.000   
3  22723.0          7    press   0.263    0.162087     0.249440         0.388   
4   6420.0          3    press   0.084    0.111101     0.020516         0.006   

   session_duration  user_id                   timestamp  label  
0             9.192     57.0  2025-05-14 09:55:25.411+00      0  
1            14.293     69.0  2025-05-14 14:38:49.919+00      0  
2             4.246     83.0  2025-05-17 13:43:02.928+00      0  
3            13.432     83.0   2025-05-17 14:52:12.34+00      1  
4             3.081     19.0  2025-04-30 08:23:53.339+00      1  
