## 1. Import Libraries


In [25]:
import numpy as np
import pandas as pd
import warnings
from pathlib import Path

warnings.filterwarnings('ignore')

## 2. Feature Extraction Functions

Extract different behavioural features from keystroke logs.
We want to capture:

- pauses (when people are thinking)
- bursts (when they're typing continuously)
- editing behaviour (how much they revise)
- cursor movement (planning vs going back to edit)

### 2.1 Base Features


In [26]:
def extract_features(df):
    """Aggregate the raw event log into one row of summary statistics per id.

    Parameters
    ----------
    df : DataFrame
        Event log with columns id, event_id, up_time, word_count,
        action_time, activity and cursor_position.

    Returns
    -------
    DataFrame
        One row per id: event counts, action-time statistics, per-activity
        counts, cursor / word-count statistics and derived rates and ratios.
        Note that ``total_chars`` is the maximum of ``word_count``, so
        ``chars_per_min`` and ``revision_ratio`` are word-count based.
    """

    def tally(label):
        # Counter for events whose activity is exactly `label`
        return lambda acts: (acts == label).sum()

    summary_spec = {
        # Volume and action-time statistics
        "events_count": ("event_id", "count"),
        "total_time": ("up_time", "max"),
        "total_chars": ("word_count", "max"),
        "mean_action_time": ("action_time", "mean"),
        "std_action_time": ("action_time", "std"),
        "max_action_time": ("action_time", "max"),
        "min_action_time": ("action_time", "min"),
        # How often each kind of activity occurs
        "backspace_count": ("activity", tally("Remove/Cut")),
        "paste_count": ("activity", tally("Paste")),
        "input_count": ("activity", tally("Input")),
        # Substring match: any activity label containing "Move" is counted
        "move_count": ("activity", lambda acts: acts.str.contains("Move", na=False).sum()),
        "replace_count": ("activity", tally("Replace")),
        "nonproduction_count": ("activity", tally("Nonproduction")),
        # Cursor location statistics
        "cursor_pos_mean": ("cursor_position", "mean"),
        "cursor_pos_std": ("cursor_position", "std"),
        "cursor_pos_max": ("cursor_position", "max"),
        # Word-count statistics
        "word_count_mean": ("word_count", "mean"),
        "word_count_std": ("word_count", "std"),
        "word_count_diff": ("word_count", lambda wc: wc.max() - wc.min()),
    }
    per_id = df.groupby("id").agg(**summary_spec).reset_index()

    # Undefined statistics (e.g. the std of a single event) become 0
    per_id = per_id.fillna(0)

    # total_time / 60000 is presumably milliseconds -> minutes (confirm the
    # unit of up_time); the epsilon keeps the division finite when it is 0.
    minutes = per_id["total_time"] / 60000 + 1e-6
    # The +1 in each denominator below avoids division by zero
    events_plus_one = per_id["events_count"] + 1
    revisions = per_id["backspace_count"] + per_id["replace_count"]

    per_id["chars_per_min"] = per_id["total_chars"] / minutes
    per_id["events_per_min"] = per_id["events_count"] / minutes
    per_id["backspace_ratio"] = per_id["backspace_count"] / (per_id["input_count"] + 1)
    per_id["paste_ratio"] = per_id["paste_count"] / events_plus_one
    per_id["replace_ratio"] = per_id["replace_count"] / events_plus_one
    per_id["nonproduction_ratio"] = per_id["nonproduction_count"] / events_plus_one
    per_id["revision_ratio"] = revisions / (per_id["total_chars"] + 1)

    return per_id

### 2.2 Pause Features


In [27]:
def pause_features(df):
    """Summarise the gaps between consecutive key-down events for each id.

    The gap ("iki") is the difference between successive ``down_time``
    values within an id after ordering by time; the first event of every id
    has no gap.

    Returns
    -------
    tuple of 8 Series indexed by id, in this order: pause_2s_count,
    pause_5s_count, pause_10s_count, mean_pause, median_pause, std_pause,
    max_pause, min_pause.
    """
    ordered = df.sort_values(["id", "down_time"]).copy()
    ordered["iki"] = ordered.groupby("id")["down_time"].diff()

    # Every statistic below is taken over the same per-id gap groups
    gaps = ordered.groupby("id")["iki"]

    def longer_than(threshold, label):
        # Number of gaps strictly greater than `threshold`
        return gaps.apply(lambda g: (g > threshold).sum()).rename(label)

    return (
        longer_than(2000, "pause_2s_count"),
        longer_than(5000, "pause_5s_count"),
        longer_than(10000, "pause_10s_count"),
        gaps.mean().rename("mean_pause"),
        gaps.median().rename("median_pause"),
        gaps.std().rename("std_pause"),
        gaps.max().rename("max_pause"),
        gaps.min().rename("min_pause"),
    )


def burst_features(df):
    """Measure how many events fall into each uninterrupted typing burst.

    A new burst begins at every event whose gap from the previous key-down
    exceeds 2000; that event is counted as the first one of the new burst.

    Returns
    -------
    tuple of 3 Series indexed by id: avg_burst, max_burst and std_burst
    (mean, maximum and standard deviation of the burst sizes).
    """
    ordered = df.sort_values(["id", "down_time"]).copy()
    ordered["iki"] = ordered.groupby("id")["down_time"].diff()

    # 1 marks the first event after a long gap; the running total of those
    # marks numbers the bursts within each id.
    ordered["burst"] = (ordered["iki"] > 2000).astype(int)
    ordered["burst_id"] = ordered.groupby("id")["burst"].cumsum()

    events_per_burst = ordered.groupby(["id", "burst_id"]).size()
    by_id = events_per_burst.groupby("id")

    return (
        by_id.mean().rename("avg_burst"),
        by_id.max().rename("max_burst"),
        by_id.std().rename("std_burst"),
    )


def p_burst_features(df):
    """Average word-count growth per P-burst for each id.

    P-bursts are the stretches of events separated by gaps of more than
    2000 between key-downs. The growth of one burst is the spread
    (max - min) of ``word_count`` inside it.

    Returns
    -------
    Series indexed by id, named ``avg_words_per_p_burst``.
    """
    ordered = df.sort_values(["id", "down_time"]).copy()
    ordered["iki"] = ordered.groupby("id")["down_time"].diff()

    # Each long gap opens a new burst; cumulative sum labels the bursts
    ordered["p_burst"] = (ordered["iki"] > 2000).astype(int)
    ordered["p_burst_id"] = ordered.groupby("id")["p_burst"].cumsum()

    # Word-count spread inside every (id, burst) pair
    words_in_burst = ordered.groupby(["id", "p_burst_id"])["word_count"]
    spread = words_in_burst.max() - words_in_burst.min()

    return spread.groupby("id").mean().rename("avg_words_per_p_burst")

### 2.3 Activity Sequence & Text Change Features


In [28]:
# Output schema of activity_sequence_features; also used for empty input so
# the result can always be merged on "id".
_ACTIVITY_SEQUENCE_COLUMNS = [
    'id',
    'input_to_remove_trans',
    'remove_to_input_trans',
    'input_to_input_trans',
    'paste_to_input_trans',
    'max_input_streak',
    'max_remove_streak',
    'unique_activities',
    'activity_switches',
    'activity_switch_rate',
]


def _longest_run(values, target):
    """Return the length of the longest run of consecutive `target` items."""
    import itertools

    return max(
        (sum(1 for _ in run)
         for value, run in itertools.groupby(values)
         if value == target),
        default=0,
    )


def activity_sequence_features(df):
    """Get features from activity patterns and transitions.

    Each id's events are ordered by ``down_time`` and the sequence of
    ``activity`` labels is summarised.

    Parameters
    ----------
    df : DataFrame
        Event log with columns id, down_time and activity.

    Returns
    -------
    DataFrame
        One row per id (in order of first appearance) with the counts of
        four specific transitions, the longest 'Input' and 'Remove/Cut'
        streaks, the number of distinct activities, the number of changes
        between consecutive activities and that number divided by the event
        count. An empty log yields an empty frame with the same columns.

    Notes
    -----
    The log is split with a single ``groupby`` pass rather than filtering
    the whole frame once per id. Rows whose id is missing are skipped by
    ``groupby``.
    """
    from collections import Counter

    rows = []
    for id_val, id_df in df.groupby('id', sort=False):
        activities = id_df.sort_values('down_time')['activity'].values

        # Count every (previous, next) pair of consecutive activities
        transitions = Counter(zip(activities[:-1], activities[1:]))

        # A switch is any consecutive pair whose two labels differ
        activity_switches = sum(
            1 for prev, nxt in zip(activities[:-1], activities[1:]) if prev != nxt
        )

        rows.append({
            'id': id_val,
            'input_to_remove_trans': transitions[('Input', 'Remove/Cut')],
            'remove_to_input_trans': transitions[('Remove/Cut', 'Input')],
            'input_to_input_trans': transitions[('Input', 'Input')],
            'paste_to_input_trans': transitions[('Paste', 'Input')],
            'max_input_streak': _longest_run(activities, 'Input'),
            'max_remove_streak': _longest_run(activities, 'Remove/Cut'),
            'unique_activities': len(set(activities)),
            'activity_switches': activity_switches,
            # groupby never yields an empty group, so the length is >= 1
            'activity_switch_rate': activity_switches / len(activities),
        })

    return pd.DataFrame(rows, columns=_ACTIVITY_SEQUENCE_COLUMNS)


def text_change_features(df):
    """Describe how the word count rises and falls from event to event.

    The per-event change is the difference between consecutive
    ``word_count`` values within an id (time-ordered); the first event of
    each id is given a change of 0. Any existing ``text_change`` column is
    overwritten on an internal copy only.

    Returns
    -------
    DataFrame
        One row per id with totals, means, extremes, spread and counts of
        the positive and negative changes, plus three derived ratios.
    """
    ordered = df.sort_values(['id', 'down_time']).copy()
    ordered['text_change'] = ordered.groupby('id')['word_count'].diff().fillna(0)

    change_spec = {
        'total_text_produced': ('text_change', lambda d: d[d > 0].sum()),
        'total_text_removed': ('text_change', lambda d: abs(d[d < 0].sum())),
        # Means over only the positive / only the negative changes;
        # NaN when an id has none of that sign.
        'text_production_rate': ('text_change', lambda d: d[d > 0].mean()),
        'text_removal_rate': ('text_change', lambda d: d[d < 0].mean()),
        'max_text_addition': ('text_change', 'max'),
        'max_text_removal': ('text_change', 'min'),
        'text_volatility': ('text_change', 'std'),
        'positive_text_changes': ('text_change', lambda d: (d > 0).sum()),
        'negative_text_changes': ('text_change', lambda d: (d < 0).sum()),
    }
    summary = ordered.groupby('id').agg(**change_spec).reset_index()

    produced = summary['total_text_produced']
    removed = summary['total_text_removed']

    # The +1 in the denominators guards against division by zero
    summary['text_removal_ratio'] = removed / (produced + 1)
    summary['net_text_production'] = produced - removed
    summary['text_efficiency'] = produced / (summary['positive_text_changes'] + 1)

    return summary

### 2.4 Temporal & Velocity Features


In [29]:
# Output schema of time_based_features (fixes the column order)
_TIME_BASED_COLUMNS = [
    'id',
    'early_events',
    'middle_events',
    'late_events',
    'early_input_ratio',
    'middle_input_ratio',
    'late_input_ratio',
    'early_remove_ratio',
    'late_remove_ratio',
    'middle_paste_ratio',
    'late_phase_activity',
]


def time_based_features(df):
    """Features describing behaviour in the early, middle and late phases.

    Phases are defined by each event's percentile *rank* of ``down_time``
    within its id (``rank(pct=True)``), i.e. by event order rather than by
    elapsed time: rank <= 0.33 is early, 0.33 < rank <= 0.67 is middle and
    rank > 0.67 is late.

    Parameters
    ----------
    df : DataFrame
        Event log with columns id, down_time and activity.

    Returns
    -------
    DataFrame
        One row per id with the number of events in each phase, the share
        of selected activities per phase (denominator is phase size + 1 to
        avoid division by zero) and the late-phase share of all events.

    Notes
    -----
    The log is split with a single ``groupby`` pass rather than filtering
    the whole frame once per id.
    """
    df = df.sort_values(['id', 'down_time']).copy()
    df['time_percentile'] = df.groupby('id')['down_time'].rank(pct=True)

    rows = []
    for id_val, id_df in df.groupby('id', sort=False):
        pct = id_df['time_percentile']
        activity = id_df['activity']

        # Boolean masks for the three phases (a missing rank is in none)
        early = pct <= 0.33
        middle = (pct > 0.33) & (pct <= 0.67)
        late = pct > 0.67

        def share(label, phase):
            # Events of `label` inside `phase`, relative to phase size + 1
            return ((activity == label) & phase).sum() / (phase.sum() + 1)

        rows.append({
            'id': id_val,
            'early_events': int(early.sum()),
            'middle_events': int(middle.sum()),
            'late_events': int(late.sum()),
            'early_input_ratio': share('Input', early),
            'middle_input_ratio': share('Input', middle),
            'late_input_ratio': share('Input', late),
            'early_remove_ratio': share('Remove/Cut', early),
            'late_remove_ratio': share('Remove/Cut', late),
            'middle_paste_ratio': share('Paste', middle),
            'late_phase_activity': late.sum() / (len(id_df) + 1),
        })

    return pd.DataFrame(rows, columns=_TIME_BASED_COLUMNS)


def keystroke_velocity_features(df):
    """Typing-speed statistics taken over 'Input' events only.

    The gap ("iki") is computed between consecutive key-downs of *all*
    events within an id; afterwards only the rows whose activity is
    'Input' are kept and summarised.

    Returns
    -------
    DataFrame
        One row per id that has at least one 'Input' event. When the log
        contains no 'Input' event at all, a completely empty DataFrame
        (no columns) is returned.
    """
    ordered = df.sort_values(['id', 'down_time']).copy()
    ordered['iki'] = ordered.groupby('id')['down_time'].diff()

    typed = ordered[ordered['activity'] == 'Input'].copy()
    if typed.empty:
        return pd.DataFrame()

    gap_spec = {
        'input_iki_mean': ('iki', 'mean'),
        'input_iki_std': ('iki', 'std'),
        'input_iki_median': ('iki', 'median'),
        'input_iki_min': ('iki', 'min'),
        'input_iki_max': ('iki', 'max'),
        # Speed buckets: below 100, 100..1000 inclusive, above 1000
        'fast_keystrokes': ('iki', lambda g: (g < 100).sum()),
        'moderate_keystrokes': ('iki', lambda g: ((g >= 100) & (g <= 1000)).sum()),
        'slow_keystrokes': ('iki', lambda g: (g > 1000).sum()),
    }
    speed = typed.groupby('id').agg(**gap_spec).reset_index()

    # Number of bucketed keystrokes, +1 to avoid division by zero
    bucketed_plus_one = (
        speed['fast_keystrokes'] + speed['moderate_keystrokes'] + speed['slow_keystrokes'] + 1
    )

    # Spread of the gaps relative to their mean
    speed['keystroke_consistency'] = speed['input_iki_std'] / (speed['input_iki_mean'] + 1)
    speed['fast_keystroke_ratio'] = speed['fast_keystrokes'] / bucketed_plus_one
    speed['typing_rhythm_score'] = speed['moderate_keystrokes'] / bucketed_plus_one

    return speed


# Output schema of word_count_velocity_features; also used for empty input
# so the result can always be merged on "id".
_WORD_VELOCITY_COLUMNS = [
    'id',
    'avg_word_velocity',
    'max_word_velocity',
    'min_word_velocity',
    'std_word_velocity',
    'positive_velocity_ratio',
]


def word_count_velocity_features(df):
    """Features about how fast the word count changes.

    For each id the events are ordered by ``down_time`` and the velocity
    between consecutive events is
    ``diff(word_count) / (diff(down_time) + 1)``; the +1 keeps the division
    finite for simultaneous events.

    Parameters
    ----------
    df : DataFrame
        Event log with columns id, down_time and word_count.

    Returns
    -------
    DataFrame
        One row per id with the mean, max, min and (population, ``np.std``)
        standard deviation of the velocity and the share of positive
        velocities. Ids with a single event get zeros. An empty log yields
        an empty frame with the same columns.

    Notes
    -----
    The log is split with a single ``groupby`` pass rather than filtering
    the whole frame once per id.
    """
    df = df.sort_values(['id', 'down_time']).copy()

    rows = []
    for id_val, id_df in df.groupby('id', sort=False):
        word_counts = id_df['word_count'].values
        time_stamps = id_df['down_time'].values

        if len(word_counts) > 1:
            word_velocity = np.diff(word_counts) / (np.diff(time_stamps) + 1)
            rows.append({
                'id': id_val,
                'avg_word_velocity': np.mean(word_velocity),
                'max_word_velocity': np.max(word_velocity),
                'min_word_velocity': np.min(word_velocity),
                'std_word_velocity': np.std(word_velocity),
                'positive_velocity_ratio': (word_velocity > 0).sum() / len(word_velocity),
            })
        else:
            # A single event has no consecutive pair to measure
            rows.append({
                'id': id_val,
                'avg_word_velocity': 0,
                'max_word_velocity': 0,
                'min_word_velocity': 0,
                'std_word_velocity': 0,
                'positive_velocity_ratio': 0,
            })

    return pd.DataFrame(rows, columns=_WORD_VELOCITY_COLUMNS)


# Output schema of activity_timing_features; also used for empty input so
# the result can always be merged on "id".
_ACTIVITY_TIMING_COLUMNS = [
    'id',
    'input_time_total',
    'remove_time_total',
    'paste_time_total',
    'nonprod_time_total',
    'input_time_ratio',
    'remove_time_ratio',
    'productive_time_ratio',
]


def activity_timing_features(df):
    """How much ``action_time`` is spent on each type of activity.

    Parameters
    ----------
    df : DataFrame
        Event log with columns id, down_time, activity and action_time.

    Returns
    -------
    DataFrame
        One row per id with the summed ``action_time`` of the 'Input',
        'Remove/Cut', 'Paste' and 'Nonproduction' events and three ratios
        relative to the id's total ``action_time`` (+1 in the denominator
        to avoid division by zero). An empty log yields an empty frame with
        the same columns.

    Notes
    -----
    The log is split with a single ``groupby`` pass rather than filtering
    the whole frame once per id.
    """
    df = df.sort_values(['id', 'down_time']).copy()

    rows = []
    for id_val, id_df in df.groupby('id', sort=False):
        activity = id_df['activity']
        action_time = id_df['action_time']

        input_time = action_time[activity == 'Input'].sum()
        remove_time = action_time[activity == 'Remove/Cut'].sum()
        paste_time = action_time[activity == 'Paste'].sum()
        nonprod_time = action_time[activity == 'Nonproduction'].sum()

        # Denominator shared by the three ratios
        total_plus_one = action_time.sum() + 1

        rows.append({
            'id': id_val,
            'input_time_total': input_time,
            'remove_time_total': remove_time,
            'paste_time_total': paste_time,
            'nonprod_time_total': nonprod_time,
            'input_time_ratio': input_time / total_plus_one,
            'remove_time_ratio': remove_time / total_plus_one,
            # "Productive" time is typing plus pasting
            'productive_time_ratio': (input_time + paste_time) / total_plus_one,
        })

    return pd.DataFrame(rows, columns=_ACTIVITY_TIMING_COLUMNS)

### 2.5 Revision & Cursor Movement Features


In [30]:
# Output schema of revision_pattern_features; also used for empty input so
# the result can always be merged on "id".
_REVISION_PATTERN_COLUMNS = [
    'id',
    'revisions_at_start',
    'revisions_at_middle',
    'revisions_at_end',
    'total_revisions',
    'review_cycles',
    'backward_movements',
    'early_revision_ratio',
    'end_revision_ratio',
    'revision_density',
]


def revision_pattern_features(df):
    """Features about revision behaviour and editing patterns.

    A revision is a 'Remove/Cut' or 'Replace' event whose ``word_count`` is
    positive. Each revision is placed at the start (< 0.33), middle
    (< 0.67) or end according to ``cursor_position / (word_count + 1)``.

    Parameters
    ----------
    df : DataFrame
        Event log with columns id, down_time, activity, cursor_position and
        word_count.

    Returns
    -------
    DataFrame
        One row per id with the revision counts per location, the number
        of write-then-edit cycles, the number of times the cursor position
        decreased between consecutive events, and three ratios (+1 in each
        denominator to avoid division by zero). An empty log yields an
        empty frame with the same columns.

    Notes
    -----
    The log is split with a single ``groupby`` pass rather than filtering
    the whole frame once per id.

    NOTE(review): the location ratio divides ``cursor_position`` by
    ``word_count``. If the cursor is measured in characters while
    ``word_count`` counts words, most revisions will land in the "end"
    bucket. The formula is kept as-is because downstream features depend on
    its current values -- confirm the intended units.
    """
    df = df.sort_values(['id', 'down_time']).copy()

    rows = []
    for id_val, id_df in df.groupby('id', sort=False):
        cursor = id_df['cursor_position']
        words = id_df['word_count']
        activity = id_df['activity']

        # Where in the text the revisions happen
        is_revision = activity.isin(['Remove/Cut', 'Replace']) & (words > 0)
        relative_pos = cursor[is_revision] / (words[is_revision] + 1)
        total_revisions = int(is_revision.sum())
        revisions_start = int((relative_pos < 0.33).sum())
        revisions_middle = int(((relative_pos >= 0.33) & (relative_pos < 0.67)).sum())
        # Everything else, including an undefined ratio, counts as "end"
        revisions_end = total_revisions - revisions_start - revisions_middle

        # Write-then-edit cycles: an edit counts only if at least one
        # 'Input' happened since the previously counted edit.
        review_cycles = 0
        in_writing = False
        for act in activity.values:
            if act == 'Input':
                in_writing = True
            elif act in ('Remove/Cut', 'Replace') and in_writing:
                review_cycles += 1
                in_writing = False

        # Consecutive events where the cursor position went down
        backward_movements = int((np.diff(cursor.values) < 0).sum())

        rows.append({
            'id': id_val,
            'revisions_at_start': revisions_start,
            'revisions_at_middle': revisions_middle,
            'revisions_at_end': revisions_end,
            'total_revisions': total_revisions,
            'review_cycles': review_cycles,
            'backward_movements': backward_movements,
            'early_revision_ratio': revisions_start / (total_revisions + 1),
            'end_revision_ratio': revisions_end / (total_revisions + 1),
            'revision_density': total_revisions / (len(id_df) + 1),
        })

    return pd.DataFrame(rows, columns=_REVISION_PATTERN_COLUMNS)


def cursor_movement_features(df):
    """Features about how the cursor moves around.

    The cursor jump is the absolute difference between the
    ``cursor_position`` of consecutive events within an id (time-ordered).

    Parameters
    ----------
    df : DataFrame
        Event log with columns id, down_time, cursor_position and
        word_count.

    Returns
    -------
    DataFrame
        One row per id with jump statistics, counts of small (<= 5),
        medium (<= 50) and large (> 50) jumps, the share of events where
        ``cursor_position == word_count`` and where the cursor is at 0, and
        two derived columns.

    Notes
    -----
    The two share columns are computed as the per-id mean of a 0/1 flag
    inside the main aggregation. This gives the same values as counting and
    dividing by the group size, without ``DataFrameGroupBy.apply`` on frames
    that include the grouping column (deprecated in pandas 2.2).

    NOTE(review): "cursor at end" compares ``cursor_position`` with
    ``word_count``. If the cursor is measured in characters while
    ``word_count`` counts words, equality does not mean "end of text" --
    confirm the intended definition. The comparison is kept unchanged.
    """
    df = df.sort_values(['id', 'down_time']).copy()
    df['cursor_jump'] = df.groupby('id')['cursor_position'].diff().abs()

    # 0/1 flags whose per-id mean is the share of events satisfying them
    df['_cursor_at_end'] = (df['cursor_position'] == df['word_count']).astype(float)
    df['_cursor_at_start'] = (df['cursor_position'] == 0).astype(float)

    features = df.groupby('id').agg(
        avg_cursor_jump=('cursor_jump', 'mean'),
        max_cursor_jump=('cursor_jump', 'max'),
        total_cursor_movement=('cursor_jump', 'sum'),
        small_cursor_jumps=('cursor_jump', lambda x: (x <= 5).sum()),
        medium_cursor_jumps=('cursor_jump', lambda x: ((x > 5) & (x <= 50)).sum()),
        large_cursor_jumps=('cursor_jump', lambda x: (x > 50).sum()),
        cursor_jump_std=('cursor_jump', 'std'),
        cursor_at_end_ratio=('_cursor_at_end', 'mean'),
        cursor_at_start_ratio=('_cursor_at_start', 'mean'),
    ).reset_index()

    # forward_writing_tendency is a plain copy of cursor_at_end_ratio
    features['forward_writing_tendency'] = features['cursor_at_end_ratio']
    # Large jumps relative to the total distance travelled (+1 avoids /0)
    features['navigation_complexity'] = features['large_cursor_jumps'] / (features['total_cursor_movement'] + 1)

    return features

### 2.6 Rolling Window & Distribution Features


In [31]:
# Output schema of rolling_features; also used for empty input so the
# result can always be merged on "id".
_ROLLING_COLUMNS = [
    'id',
    'action_time_rolling_mean',
    'action_time_rolling_std',
    'word_count_rolling_trend',
    'action_time_trend',
    'action_time_acceleration',
]


def rolling_features(df, window=10):
    """Look at trends over time using a sliding window.

    Parameters
    ----------
    df : DataFrame
        Event log with columns id, down_time, action_time and word_count.
    window : int, default 10
        Number of consecutive events per rolling window.

    Returns
    -------
    DataFrame
        One row per id. For ids with at least ``window`` events: the mean
        of the rolling mean and of the rolling std of ``action_time``, the
        last-minus-first rolling mean of ``word_count`` and of
        ``action_time``, and the mean step of the rolling ``action_time``
        mean. Ids with fewer events get the plain mean / std of
        ``action_time`` and zeros for the three trend columns. An empty log
        yields an empty frame with the same columns.

    Notes
    -----
    The log is split with a single ``groupby`` pass rather than filtering
    the whole frame once per id, and each rolling series is computed once.
    """
    df = df.sort_values(['id', 'down_time']).copy()

    rows = []
    for id_val, id_df in df.groupby('id', sort=False):
        action_time = id_df['action_time']

        # Too few events for a full window: fall back to plain statistics
        if len(id_df) < window:
            rows.append({
                'id': id_val,
                'action_time_rolling_mean': action_time.mean(),
                'action_time_rolling_std': action_time.std(),
                'word_count_rolling_trend': 0,
                'action_time_trend': 0,
                'action_time_acceleration': 0,
            })
            continue

        # min_periods=1 lets the first rows use a partially filled window
        action_rolling = action_time.rolling(window=window, min_periods=1)
        action_mean = action_rolling.mean()
        word_mean = id_df['word_count'].rolling(window=window, min_periods=1).mean()

        rows.append({
            'id': id_val,
            'action_time_rolling_mean': action_mean.mean(),
            'action_time_rolling_std': action_rolling.std().mean(),
            # Trend = last rolling mean minus first rolling mean
            'word_count_rolling_trend': word_mean.iloc[-1] - word_mean.iloc[0],
            'action_time_trend': action_mean.iloc[-1] - action_mean.iloc[0],
            # Average step between consecutive rolling means
            'action_time_acceleration': action_mean.diff().mean(),
        })

    return pd.DataFrame(rows, columns=_ROLLING_COLUMNS)


def action_time_distribution_features(df):
    """Describe the shape of each id's ``action_time`` distribution.

    Returns
    -------
    DataFrame
        One row per id with the 25th and 75th percentiles, their
        difference (IQR), the skewness and the kurtosis of ``action_time``.
    """
    action_times = df.groupby('id')['action_time']

    # Keyword names become the output column names, in this order
    shape = action_times.agg(
        action_time_q25=lambda t: t.quantile(0.25),
        action_time_q75=lambda t: t.quantile(0.75),
        action_time_iqr=lambda t: t.quantile(0.75) - t.quantile(0.25),
        action_time_skew=lambda t: t.skew(),
        action_time_kurtosis=lambda t: t.kurtosis(),
    )

    return shape.reset_index()

### 2.7 Advanced Event Timing Features


## 3. Main Feature Builder


In [32]:
def build_all_features(df):
    """
    Build the complete behavioural feature table from the log data.

    Starts from the base features and left-joins every other feature group
    onto them by id, printing a progress line before each group.

    Parameters:
    -----------
    df : DataFrame
        Input log data with columns: id, event_id, down_time, up_time,
        action_time, activity, cursor_position, word_count

    Returns:
    --------
    DataFrame with one row per id and all extracted features; missing and
    infinite values are replaced by 0.
    """
    print("Building all features...")

    print("  - Base features")
    features = extract_features(df)

    # (progress message, builder) pairs in merge order; the order fixes the
    # column order of the result.
    steps = [
        ("  - Pause features", pause_features),
        ("  - Burst features", burst_features),
        ("  - P-burst features", p_burst_features),
        ("  - Activity sequence features", activity_sequence_features),
        ("  - Text change features", text_change_features),
        ("  - Time-based features", time_based_features),
        ("  - Keystroke velocity features", keystroke_velocity_features),
        ("  - Revision pattern features", revision_pattern_features),
        ("  - Cursor movement features", cursor_movement_features),
        ("  - Rolling window features", lambda logs: rolling_features(logs, window=10)),
        ("  - Action time distribution features", action_time_distribution_features),
        ("  - Word count velocity features", word_count_velocity_features),
        ("  - Activity timing features", activity_timing_features),
    ]

    for message, builder in steps:
        print(message)
        produced = builder(df)

        # The keystroke builder returns an empty frame when the log has no
        # 'Input' events; there is nothing to join in that case.
        if builder is keystroke_velocity_features and produced.empty:
            continue

        # Some builders return a tuple of per-id Series, the others return
        # a single Series or DataFrame.
        pieces = produced if isinstance(produced, tuple) else (produced,)
        for piece in pieces:
            features = features.merge(piece, on="id", how="left")

    # Fill NaN and inf values
    features = features.fillna(0)
    features = features.replace([np.inf, -np.inf], 0)

    feature_total = features.shape[1] - 1  # the id column is not a feature
    print(f"\nTotal features extracted: {feature_total}")
    print(f"Total samples: {features.shape[0]}")

    return features

## 4. Load Data & Extract Features


In [33]:
# Load cleaned training logs
# The path is relative, so it is resolved against the current working directory.
train_logs_path = Path('data') / 'train_logs_clean.csv'

# Fail early, with a pointer to the preprocessing step, if the file is absent
if not train_logs_path.exists():
    raise FileNotFoundError(
        f"{train_logs_path} not found. "
        "Please run Preprocessing.ipynb or preprocess/preprocess.py first to generate the cleaned data."
    )

train_logs = pd.read_csv(train_logs_path)
# Quick sanity report: row count, number of distinct ids and available columns
print(f"Loaded {len(train_logs)} rows from {train_logs_path}")
print(f"Unique IDs: {train_logs['id'].nunique()}")
print(f"\nColumns: {list(train_logs.columns)}")
# Last expression of the cell: previews the first rows when run in a notebook
train_logs.head()

Loaded 8399747 rows from data/train_logs_clean.csv
Unique IDs: 2471

Columns: ['id', 'event_id', 'down_time', 'up_time', 'action_time', 'activity', 'down_event', 'up_event', 'text_change', 'cursor_position', 'word_count', 'id_encoded']


Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,id_encoded
0,001519c8,0,0,31,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0,0
1,001519c8,1,32,436,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0,0
2,001519c8,2,60032,60032,0,Nonproduction,Shift,Shift,NoChange,0,0,0
3,001519c8,3,60147,60238,91,Input,q,q,q,1,1,0
4,001519c8,4,60657,60784,127,Input,q,q,q,2,1,0


In [34]:
# Extract all behavioural features (one row per id) from the loaded logs;
# build_all_features prints a progress line for each feature group.
train_features = build_all_features(train_logs)

Building all features...
  - Base features
  - Pause features
  - Pause features
  - Burst features
  - Burst features
  - P-burst features
  - P-burst features
  - Activity sequence features
  - Activity sequence features
  - Text change features
  - Text change features
  - Time-based features
  - Time-based features
  - Keystroke velocity features
  - Keystroke velocity features
  - Revision pattern features
  - Revision pattern features
  - Cursor movement features
  - Cursor movement features
  - Rolling window features
  - Rolling window features
  - Action time distribution features
  - Action time distribution features
  - Word count velocity features
  - Word count velocity features
  - Activity timing features
  - Activity timing features

Total features extracted: 122
Total samples: 2471

Total features extracted: 122
Total samples: 2471


## 5. Inspect Results


In [35]:
# Display first few rows
# The shape and the column count both include the 'id' column.
print(f"Feature matrix shape: {train_features.shape}")
print(f"\nFeature names ({len(train_features.columns)} total):")
print(list(train_features.columns))
# Last expression of the cell: previews the first rows when run in a notebook
train_features.head()

Feature matrix shape: (2471, 123)

Feature names (123 total):
['id', 'events_count', 'total_time', 'total_chars', 'mean_action_time', 'std_action_time', 'max_action_time', 'min_action_time', 'backspace_count', 'paste_count', 'input_count', 'move_count', 'replace_count', 'nonproduction_count', 'cursor_pos_mean', 'cursor_pos_std', 'cursor_pos_max', 'word_count_mean', 'word_count_std', 'word_count_diff', 'chars_per_min', 'events_per_min', 'backspace_ratio', 'paste_ratio', 'replace_ratio', 'nonproduction_ratio', 'revision_ratio', 'pause_2s_count', 'pause_5s_count', 'pause_10s_count', 'mean_pause', 'median_pause', 'std_pause', 'max_pause', 'min_pause', 'avg_burst', 'max_burst', 'std_burst', 'avg_words_per_p_burst', 'input_to_remove_trans', 'remove_to_input_trans', 'input_to_input_trans', 'paste_to_input_trans', 'max_input_streak', 'max_remove_streak', 'unique_activities', 'activity_switches', 'activity_switch_rate', 'total_text_produced', 'total_text_removed', 'text_production_rate', 'text_

Unnamed: 0,id,events_count,total_time,total_chars,mean_action_time,std_action_time,max_action_time,min_action_time,backspace_count,paste_count,...,min_word_velocity,std_word_velocity,positive_velocity_ratio,input_time_total,remove_time_total,paste_time_total,nonprod_time_total,input_time_ratio,remove_time_ratio,productive_time_ratio
0,001519c8,2557,1661257,256,116.246774,91.797374,2259,0,417,0,...,-0.037037,0.003265,0.13615,243731,34130,0,18506,0.819969,0.114821,0.819969
1,0022f953,2454,1597474,323,112.221271,55.431189,1758,0,260,1,...,-0.032258,0.002967,0.150428,237891,23550,71,13781,0.863827,0.085514,0.864085
2,0042269b,4136,1609411,404,101.837766,82.383766,3005,0,439,0,...,-0.113636,0.009244,0.132769,353718,32905,0,33951,0.839782,0.078122,0.839782
3,0059420b,1556,1282520,206,121.848329,113.768226,806,0,151,1,...,-0.333333,0.019826,0.156913,167790,18410,160,3062,0.884982,0.097101,0.885826
4,0075873a,2531,1501522,252,123.943896,62.082013,701,0,517,0,...,-0.041667,0.005011,0.133992,266515,40199,0,6988,0.849577,0.128143,0.849577


In [36]:
# Check for any issues
# NOTE: despite the "per column" labels, `.sum().sum()` collapses the
# per-column counts into one grand total, so each check prints a single number.
print("Missing values per column:")
print(train_features.isnull().sum().sum())
print("\nInfinite values per column:")
# Only numeric columns are tested for infinities (np.isinf does not accept strings such as 'id')
print(np.isinf(train_features.select_dtypes(include=[np.number])).sum().sum())
print("\nBasic statistics:")
# Transposed so that every feature becomes one row of the summary table
train_features.describe().T

Missing values per column:
0

Infinite values per column:
0

Basic statistics:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
events_count,2471.0,3.399331e+03,1576.264508,262.000000,2.193500e+03,3.081000e+03,4.293000e+03,1.287600e+04
total_time,2471.0,1.544667e+06,299372.409828,174907.000000,1.444728e+06,1.622665e+06,1.719150e+06,4.854776e+06
total_chars,2471.0,3.899664e+02,172.455317,35.000000,2.550000e+02,3.510000e+02,4.800000e+02,1.326000e+03
mean_action_time,2471.0,9.996737e+01,24.110130,8.423294,8.366643e+01,9.705898e+01,1.145210e+02,2.826124e+02
std_action_time,2471.0,8.870110e+01,227.013591,14.770675,4.439469e+01,5.789011e+01,8.150258e+01,6.854329e+03
...,...,...,...,...,...,...,...,...
paste_time_total,2471.0,2.473695e+01,102.280410,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,1.931000e+03
nonprod_time_total,2471.0,1.775888e+04,32038.878116,6.000000,3.993000e+03,9.271000e+03,1.966500e+04,4.821150e+05
input_time_ratio,2471.0,8.596604e-01,0.082084,0.188857,8.240689e-01,8.746794e-01,9.136391e-01,9.992014e-01
remove_time_ratio,2471.0,8.676406e-02,0.047348,0.000000,5.388254e-02,7.856729e-02,1.113687e-01,5.684344e-01


## 6. Save Features


In [37]:
# Save behavioural features to CSV
output_path = Path('data') / 'train_behaviour_features.csv'
# index=False keeps the row index out of the file; 'id' is an ordinary column
train_features.to_csv(output_path, index=False)
# The reported "features" number is the column count, which includes 'id'
print(f"Saved {train_features.shape[0]} samples with {train_features.shape[1]} features to {output_path}")

Saved 2471 samples with 123 features to data/train_behaviour_features.csv


## Summary

This notebook extracts **comprehensive behavioural features** from keystroke logging data. The features capture:

### Feature Categories (122 features total):

1. **Base Features**: Event counts, total time, typing speed, activity ratios
2. **Pause Features**: Gaps between keystrokes at different thresholds (2s, 5s, 10s)
3. **Burst Features**: When they're typing continuously and how fluently
4. **Activity Sequence**: How activities transition from one to another, streaks, variety
5. **Text Change**: How fast they produce/remove text, editing efficiency
6. **Temporal Patterns**: What they do in early/middle/late stages
7. **Keystroke Velocity**: Typing speed variations, rhythm, consistency
8. **Word Count Velocity**: How the word count changes over time
9. **Activity Timing**: How much time on each type of activity
10. **Revision Patterns**: Where they edit, review cycles, going backwards
11. **Cursor Movement**: How they navigate around, jump distances
12. **Rolling Window**: Trends and changes in typing behaviour
13. **Distribution Features**: Statistical properties (skew, kurtosis, IQR)

### Output:

- `data/train_behaviour_features.csv` - One row per essay ID with all behavioural features

### Next Steps:

- Combine with text features from `FeatureExtraction_Essay.ipynb`
- Merge with TF-IDF/SVD features from `tfidf/tfidf.ipynb`
- Build predictive models using these features
