In [14]:
import pandas as pd

# Read the mouse-event recording from the working directory
mouse_data = pd.read_csv("mouse_modified_trimmed_clean_imputed.csv")

# Report the dtype pandas inferred for every column
print("Data Types:\n", mouse_data.dtypes)

# Columns stored with the generic 'object' dtype (typically strings) are the
# ones that may need parsing or encoding before modelling
categorical_columns = mouse_data.select_dtypes(include=["object"]).columns
print("\nCategorical Columns:", categorical_columns)

# Show the distinct values held by each object-typed column
for col in categorical_columns:
    distinct_values = mouse_data[col].unique()
    print(f"\nUnique values in {col}:\n", distinct_values)

# Flag object-typed columns whose cells hold more than one Python type
# (e.g. a mix of int and str)
for col in categorical_columns:
    python_types = mouse_data[col].apply(type)
    if python_types.nunique() <= 1:
        continue
    print(f"\nColumn {col} has mixed types!")


Data Types:
 id             float64
action          object
coordinates     object
button          object
delta           object
distance       float64
speed          float64
user_id        float64
timestamp       object
dtype: object

Categorical Columns: Index(['action', 'coordinates', 'button', 'delta', 'timestamp'], dtype='object')

Unique values in action:
 ['pressed' 'released' 'move' 'click']

Unique values in coordinates:
 ['(144, 403)' '(27, 541)' '(32, 277)' ... '391,163' '389,164' '389,165']

Unique values in button:
 ['Button.left' 'Button.right' 'left' 'middle' 'right']

Unique values in delta:
 ['0,-1' '0,0' '180,-34' ... '15,77' '-19,-186' '-128,-2']

Unique values in timestamp:
 ['2025-02-26 06:56:05.412578+00' '2025-02-26 06:56:05.418595+00'
 '2025-02-26 06:56:05.424294+00' ... '2025-05-18 08:35:44.301+00'
 '2025-05-18 08:35:44.535+00' '2025-05-18 08:35:48.959+00']


In [15]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load the benign (genuine user) dataset
mouse_data = pd.read_csv("mouse_modified_trimmed_clean_imputed.csv")

# 1. Convert 'coordinates' and 'delta' to two separate columns for x and y.
# The file stores pairs both as "(144, 403)" and as bare "391,163", so the
# parentheses are optional in the pattern and each component may carry a minus
# sign. A pattern that requires the parentheses silently turns every bare
# "x,y" coordinate into NaN.
pair_pattern = r'\(?\s*(-?\d+)\s*,\s*(-?\d+)\s*\)?'
mouse_data[['coord_x', 'coord_y']] = mouse_data['coordinates'].str.extract(pair_pattern).astype(float)
mouse_data[['delta_x', 'delta_y']] = mouse_data['delta'].str.extract(pair_pattern).astype(float)

# Drop the original 'coordinates' and 'delta' columns as we now have x and y
mouse_data = mouse_data.drop(columns=['coordinates', 'delta'])

# 2. Encode categorical columns ('action', 'button', 'timestamp').
# One encoder is fitted per column and kept in `label_encoders`, so every
# integer code can still be mapped back with inverse_transform(). After the
# loop `label_encoder` is the encoder fitted on 'timestamp'.
# NOTE(review): 'button' contains both 'Button.left' and 'left' style values;
# they receive different codes here - confirm whether they should be merged.
# NOTE(review): 'timestamp' codes are ranks of the sorted timestamp strings,
# not elapsed time - confirm this is what downstream steps expect.
label_encoders = {}
for column_name in ['action', 'button', 'timestamp']:
    label_encoder = LabelEncoder()
    mouse_data[column_name] = label_encoder.fit_transform(mouse_data[column_name])
    label_encoders[column_name] = label_encoder

# Check data types and the first few rows
print(mouse_data.dtypes)
print(mouse_data.head())


id           float64
action         int64
button         int64
distance     float64
speed        float64
user_id      float64
timestamp      int64
coord_x      float64
coord_y      float64
delta_x      float64
delta_y      float64
dtype: object
        id  action  button  distance  speed    user_id  timestamp  coord_x  \
0  63962.0       2       0       0.0    0.0  52.270322          0    144.0   
1  63963.0       3       0       0.0    0.0  52.270322          1    144.0   
2  63964.0       2       0       0.0    0.0  52.270322          2     27.0   
3  63965.0       3       0       0.0    0.0  52.270322          3     27.0   
4  63966.0       2       0       0.0    0.0  52.270322          4     32.0   

   coord_y  delta_x  delta_y  
0    403.0      0.0     -1.0  
1    403.0      0.0     -1.0  
2    541.0      0.0     -1.0  
3    541.0      0.0     -1.0  
4    277.0      0.0     -1.0  


In [19]:
import pandas as pd

# Read the recording again so this cell does not depend on earlier cells
mouse_data = pd.read_csv("mouse_modified_trimmed_clean_imputed.csv")

# List the column names of the freshly loaded frame under a heading line
print("Columns in the dataset:", mouse_data.columns, sep="\n")


Columns in the dataset:
Index(['id', 'action', 'coordinates', 'button', 'delta', 'distance', 'speed',
       'user_id', 'timestamp'],
      dtype='object')


In [20]:
import pandas as pd
import numpy as np

# Load the benign (genuine user) dataset from the CSV in the working directory.
# This rebinds `mouse_data` to a freshly read frame, so the attack generator
# defined in this cell is applied to the data exactly as stored in the file.
mouse_data = pd.read_csv("mouse_modified_trimmed_clean_imputed.csv")

# Define a function to generate attack data with a given ratio
def generate_attack_data(mouse_data, attack_ratio, seed=42):
    """Build synthetic "attack" rows by perturbing the leading benign rows.

    The first ``int(len(mouse_data) * attack_ratio)`` rows are taken by
    position, copied, and modified; ``mouse_data`` itself is never changed.

    Parameters
    ----------
    mouse_data : pandas.DataFrame
        Benign recording. Needs 'speed' and 'distance' columns that support
        numeric addition. 'delta' and 'button' are overwritten.
    attack_ratio : float
        Number of attack rows to create, as a fraction (0 to 1) of the benign
        row count.
    seed : int, optional
        Seed for the private random generator. The default of 42 keeps the
        speed and distance noise identical to seeding numpy's global
        generator with 42.

    Returns
    -------
    pandas.DataFrame
        Only the perturbed rows, carrying the index labels of the rows they
        were copied from.

    Raises
    ------
    ValueError
        If ``attack_ratio`` is not between 0 and 1.
    """
    if not 0 <= attack_ratio <= 1:
        raise ValueError(f"attack_ratio must be between 0 and 1, got {attack_ratio!r}")

    num_attack_samples = int(len(mouse_data) * attack_ratio)  # Number of attack samples to generate

    # Select by position rather than by index label, so frames whose index is
    # not 0..n-1 (filtered, shuffled, re-indexed) are handled correctly.
    attack_data = mouse_data.head(num_attack_samples).copy()

    # A private legacy generator yields the same stream as np.random.seed(seed)
    # followed by np.random.* calls, without reseeding the global generator.
    rng = np.random.RandomState(seed)

    # Draw order (speed, distance, delta, button) is kept fixed for reproducibility.
    speed_noise = rng.normal(loc=50, scale=15, size=num_attack_samples)  # Simulating high speed
    distance_noise = rng.normal(loc=30, scale=10, size=num_attack_samples)  # Simulating high distance
    delta_noise = rng.normal(loc=5, scale=2, size=num_attack_samples)  # Simulating erratic movement
    button_codes = rng.choice([0, 1], size=num_attack_samples, p=[0.1, 0.9])  # Mostly code 1

    attack_data['speed'] = attack_data['speed'] + speed_noise
    attack_data['distance'] = attack_data['distance'] + distance_noise

    # 'delta': keep the representation the benign column already uses, so the
    # attack rows cannot be told apart by format alone.
    delta_is_text = (
        'delta' in mouse_data.columns
        and not pd.api.types.is_numeric_dtype(mouse_data['delta'])
    )
    if delta_is_text:
        # Text deltas are "dx,dy" pairs of integers; the y component needs its
        # own draw, taken last so the earlier draws are unaffected.
        delta_y_noise = rng.normal(loc=5, scale=2, size=num_attack_samples)
        delta_x = np.rint(delta_noise).astype(int)
        delta_y = np.rint(delta_y_noise).astype(int)
        attack_data['delta'] = [f"{dx},{dy}" for dx, dy in zip(delta_x, delta_y)]
    else:
        attack_data['delta'] = delta_noise

    # 'button': same idea. For a text column the 0/1 codes are translated to
    # the column's own labels in sorted order.
    # NOTE(review): sorted order mirrors how a label encoder would number the
    # classes - confirm this is the intended meaning of codes 0 and 1.
    has_button = 'button' in mouse_data.columns
    button_is_text = has_button and not pd.api.types.is_numeric_dtype(mouse_data['button'])
    button_labels = (
        sorted(mouse_data['button'].dropna().astype(str).unique())
        if button_is_text else []
    )
    if button_labels:
        last_label = len(button_labels) - 1
        attack_data['button'] = [button_labels[min(code, last_label)] for code in button_codes]
    elif has_button and not button_is_text:
        # Numeric column: keep its dtype (e.g. float codes stay float).
        attack_data['button'] = pd.Series(
            button_codes, index=attack_data.index
        ).astype(attack_data['button'].dtype)
    else:
        attack_data['button'] = button_codes

    return attack_data

# Fractions of the benign row count to synthesise: 5%, 10% and 30%
attack_ratios = [0.05, 0.10, 0.30]

# For each fraction, build the attack rows and write them to their own CSV
for attack_ratio in attack_ratios:
    # Output file is named after the whole-number percentage
    attack_data_filename = f"synthetic_attack_data_{int(attack_ratio * 100)}.csv"

    print(f"\nGenerating {int(attack_ratio * 100)}% Attack Data")

    # Perturb the leading share of the benign rows
    attack_data = generate_attack_data(mouse_data, attack_ratio)

    # Persist without the index column
    attack_data.to_csv(attack_data_filename, index=False)
    print(f"Attack data saved to: {attack_data_filename}")



Generating 5% Attack Data
Attack data saved to: synthetic_attack_data_5.csv

Generating 10% Attack Data
Attack data saved to: synthetic_attack_data_10.csv

Generating 30% Attack Data
Attack data saved to: synthetic_attack_data_30.csv


In [21]:
import pandas as pd

# Load the benign (genuine user) dataset
mouse_data = pd.read_csv("mouse_modified_trimmed_clean_imputed.csv")

# Load the 5% attack data
attack_data_5 = pd.read_csv("synthetic_attack_data_5.csv")

# Add a 'label' column to the attack data (1 for attack)
attack_data_5['label'] = 1  # Attack data is labeled as 1

# Add a 'label' column to the benign data (0 for benign).
# This is done in place on purpose: `mouse_data` keeps the column afterwards.
mouse_data['label'] = 0  # Benign data is labeled as 0

# Combine the benign and attack datasets
combined_data_5 = pd.concat([mouse_data, attack_data_5], ignore_index=True)

# Shuffle the combined data to randomize the order of samples.
# A fixed random_state makes the saved file identical on every run.
combined_data_5 = combined_data_5.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the combined data to a new CSV file
combined_data_5.to_csv("combined_data_5.csv", index=False)

print("5% Attack Data Combined and Saved to: combined_data_5.csv")


5% Attack Data Combined and Saved to: combined_data_5.csv


In [22]:
# Load the 10% attack data
attack_data_10 = pd.read_csv("synthetic_attack_data_10.csv")

# Add a 'label' column to the 10% attack data (1 for attack)
attack_data_10['label'] = 1  # Attack data is labeled as 1

# Combine the benign and attack datasets.
# assign(label=0) labels the benign rows here, on a copy, so this cell gives
# correctly labelled output whether or not `mouse_data` already has the column.
combined_data_10 = pd.concat([mouse_data.assign(label=0), attack_data_10], ignore_index=True)

# Shuffle the combined data to randomize the order of samples.
# A fixed random_state makes the saved file identical on every run.
combined_data_10 = combined_data_10.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the combined data to a new CSV file
combined_data_10.to_csv("combined_data_10.csv", index=False)

print("10% Attack Data Combined and Saved to: combined_data_10.csv")


10% Attack Data Combined and Saved to: combined_data_10.csv


In [23]:
# Load the 30% attack data
attack_data_30 = pd.read_csv("synthetic_attack_data_30.csv")

# Add a 'label' column to the 30% attack data (1 for attack)
attack_data_30['label'] = 1  # Attack data is labeled as 1

# Combine the benign and attack datasets.
# assign(label=0) labels the benign rows here, on a copy, so this cell gives
# correctly labelled output whether or not `mouse_data` already has the column.
combined_data_30 = pd.concat([mouse_data.assign(label=0), attack_data_30], ignore_index=True)

# Shuffle the combined data to randomize the order of samples.
# A fixed random_state makes the saved file identical on every run.
combined_data_30 = combined_data_30.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the combined data to a new CSV file
combined_data_30.to_csv("combined_data_30.csv", index=False)

print("30% Attack Data Combined and Saved to: combined_data_30.csv")


30% Attack Data Combined and Saved to: combined_data_30.csv
