# Integrated Notebook for Task WritingProcess
## Data Preprocessing

---------
### Idle-Time Removal and Time Regularization from `preprocess.py`

In [34]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import ftfy
import warnings
from pathlib import Path
import re

warnings.filterwarnings('ignore')

In [35]:
# ## Temporary code to fix Unicode once offline, so it doesn't need to be done during the live run
# train = pd.read_csv('data/train_logs.csv')
# cols = ['down_event', 'up_event', 'text_change']
# train.loc[:, cols] = train.loc[:, cols].apply(
#             lambda s: s.astype('string').map(lambda x: ftfy.fix_text(x) if x is not pd.NA else x)
#         )
# train.to_csv('data/train_logs_raw_unicode_fixed.csv', index=False)

In [36]:
class Preprocess:
    """Preprocessing steps for keystroke logs.

    Provides id label-encoding, trimming of the margins before/after the
    actual writing activity, and capping of long gaps between events.
    """

    def label_encoding(self, df, col="id"):
        """Add an integer-encoded copy of ``col`` named ``<col>_encoded``.

        The new column is written onto ``df`` itself (the caller's frame is
        modified) and the same frame is returned.
        """
        label_encoder = LabelEncoder()
        label_encoder.fit(df[col])
        df[col + "_encoded"] = label_encoder.transform(df[col])
        return df

    # Remove the time before the author has started writing and after they
    # have finished.
    # reference: remove_margin for https://www.kaggle.com/code/tomooinubushi/1st-place-solution-training-and-inference-code

    def remove_start_and_end_time(
        self, df, start_margin=2 * 60 * 1000, end_margin=2 * 60 * 1000
    ):
        """Trim each log to the span of its "valid" events plus margins.

        Rows whose ``up_event`` is ``"Unidentified"`` are dropped first. For
        every ``id_encoded`` group, an event is valid when it is not
        ``Nonproduction`` AND its ``up_event`` is neither ``Shift`` nor
        ``CapsLock``. Only rows with ``down_time`` in
        ``(first_valid - start_margin, last_valid + end_margin]`` are kept,
        and ``event_id`` is renumbered from 0 within each group. Groups with
        no valid event are dropped entirely.

        Parameters
        ----------
        df : DataFrame with ``id_encoded``, ``activity``, ``up_event`` and
            ``down_time`` columns.
        start_margin, end_margin : margins added around the valid span, in
            the same unit as ``down_time``.

        Returns
        -------
        A new DataFrame with a fresh 0..n-1 index.
        """
        df = df[df["up_event"] != "Unidentified"].reset_index(drop=True)
        result_df = []
        grouped_df = df.groupby("id_encoded")

        for _, log in tqdm(grouped_df):
            # All three conditions must hold. Joining them with `|` (as this
            # method did before) is always True, because an up_event cannot be
            # both "Shift" and "CapsLock", so nothing was ever excluded.
            is_valid = (
                (log["activity"] != "Nonproduction")
                & (log["up_event"] != "Shift")
                & (log["up_event"] != "CapsLock")
            )
            valid_events = log.loc[is_valid, "down_time"].values
            if len(valid_events) == 0:
                continue
            log = log[
                (log["down_time"] > valid_events.min() - start_margin)
                & (log["down_time"] <= valid_events.max() + end_margin)
            ].copy()
            log["event_id"] = range(len(log))
            result_df.append(log)

        if not result_df:
            # pd.concat raises on an empty list; return an empty frame that
            # still carries the expected columns instead.
            return df.iloc[:0].assign(event_id=np.array([], dtype="int64"))

        result = pd.concat(result_df, ignore_index=True)

        return result

    def remove_rest_time(
        self, df, time_margin=1 * 60 * 1000, action_margin=5 * 60 * 1000
    ):
        """Rebuild ``down_time``/``up_time`` with long gaps capped.

        Walking the rows in their current order, the gap between consecutive
        ``down_time`` values of the same ``id_encoded`` is clipped to
        ``[0, time_margin]`` and each event's duration
        (``up_time - down_time``) is clipped to ``[0, action_margin]``. The
        first event of every id is placed at time 0. A new id is detected
        only by comparison with the previous row, so rows of one id are
        expected to be contiguous.

        ``df`` is modified in place (both time columns are overwritten) and
        returned.
        """
        down_times, up_times = [], []
        prev_idx = -1
        result_df = df[["id_encoded", "down_time", "up_time"]].values
        for row in tqdm(result_df):
            idx, down_time, up_time = int(row[0]), int(row[1]), int(row[2])
            if prev_idx != idx:
                # First event of a new id: restart the corrected clock.
                prev_down_time = down_time
                prev_corrected_down_time = 0
            gap_down_time = np.clip(down_time - prev_down_time, 0, time_margin)
            action_time = np.clip(up_time - down_time, 0, action_margin)

            new_down_time = prev_corrected_down_time + gap_down_time
            new_up_time = new_down_time + action_time
            down_times.append(new_down_time)
            up_times.append(new_up_time)
            prev_idx, prev_corrected_down_time, prev_down_time = (
                idx,
                new_down_time,
                down_time,
            )
        df["down_time"], df["up_time"] = down_times, up_times
        return df

In [3]:
preprocessor = Preprocess()
# Test logs: in the submission environment this is the only file available.
df = pd.read_csv("data/test_logs.csv")
# Train logs: loaded as well because TF-IDF has to be fit on train and only
# applied (transform) to test.
train_df = pd.read_csv("data/train_logs.csv")





# Same three steps for both frames: encode ids, trim the start/end margins,
# then cap long gaps between events.
df = preprocessor.label_encoding(df)
df = preprocessor.remove_start_and_end_time(df)
df = preprocessor.remove_rest_time(df)

train_df = preprocessor.label_encoding(train_df)
train_df = preprocessor.remove_start_and_end_time(train_df)
train_df = preprocessor.remove_rest_time(train_df)




100%|██████████| 3/3 [00:00<00:00, 1486.64it/s]
100%|██████████| 6/6 [00:00<?, ?it/s]
100%|██████████| 2471/2471 [00:03<00:00, 768.87it/s]
100%|██████████| 8399747/8399747 [01:17<00:00, 108505.36it/s]


-----------
### Event and Unicode Cleaning from `Preprocessing.ipynb`

In [37]:
def label_encoding(df, col="id"):
    """Attach ``<col>_encoded``, an integer label encoding of ``col``.

    The column is added to ``df`` itself, which is also returned.
    """
    encoded_name = col + "_encoded"
    encoder = LabelEncoder()
    encoder.fit(df[col])
    codes = encoder.transform(df[col])
    df[encoded_name] = codes
    return df


# Remove the time before the author has started writing and the time after they have finished (resting)
# reference: remove_margin for https://www.kaggle.com/code/tomooinubushi/1st-place-solution-training-and-inference-code

def remove_procrastination_time(df, start_margin=2*60*1000, end_margin=2*60*1000):
    """Trim every log to its span of real writing activity plus margins.

    Rows with an ``Unidentified`` up_event are discarded. Within each
    ``id_encoded`` group, events that are ``Nonproduction``, ``Shift`` or
    ``CapsLock`` do not count as writing; the rows kept are those whose
    ``down_time`` lies in ``(first_writing - start_margin,
    last_writing + end_margin]``. ``event_id`` is renumbered from 0 per
    group, and groups without any writing event are dropped.
    """
    identified = df[df['up_event'] != 'Unidentified'].reset_index(drop=True)
    trimmed_logs = []

    for _, session in tqdm(identified.groupby('id_encoded')):
        # De Morgan form of "not Nonproduction and not Shift and not CapsLock".
        ignorable = (
            (session['activity'] == 'Nonproduction')
            | (session['up_event'] == 'Shift')
            | (session['up_event'] == 'CapsLock')
        )
        writing_times = session.loc[~ignorable, 'down_time'].values
        if len(writing_times) == 0:
            continue

        lower_bound = writing_times.min() - start_margin
        upper_bound = writing_times.max() + end_margin
        inside = (session['down_time'] > lower_bound) & (session['down_time'] <= upper_bound)

        kept = session[inside].copy()
        kept['event_id'] = range(len(kept))
        trimmed_logs.append(kept)

    return pd.concat(trimmed_logs, ignore_index=True)


In [38]:
class CleanPreprocessor:
    """Event and Unicode cleaning for keystroke logs."""

    def cleaning(self,df):
        """Return a cleaned copy of the log.

        Steps, in order:
        1. label-encode ``id`` (this adds ``id_encoded`` to the frame passed in);
        2. trim the margins before/after the writing activity;
        3. drop ``Nonproduction`` rows;
        4. repair mis-encoded text in ``down_event``, ``up_event`` and
           ``text_change`` with ``ftfy.fix_text`` (missing values are left
           untouched);
        5. drop rows whose ``down_event`` is a mouse click;
        6. renumber ``event_id`` from 1 within each ``id``.
        """
        df = label_encoding(df)
        df = remove_procrastination_time(df)
        df = df[df['activity'] != 'Nonproduction' ].reset_index(drop=True)
        cols = ['down_event', 'up_event', 'text_change']

        df.loc[:, cols] = df.loc[:, cols].apply(
            lambda s: s.astype('string').map(lambda x: ftfy.fix_text(x) if x is not pd.NA else x)
        )

        drop_events = ['LeftClick','RightClick']
        # Materialise the filtered rows as a new frame before adding a column:
        # assigning onto the raw boolean-mask slice is chained assignment
        # (SettingWithCopyWarning, hidden by the global warnings filter).
        df = df[~df['down_event'].isin(drop_events)].reset_index(drop=True)
        df['event_id'] = df.groupby('id').cumcount() + 1 # reset event_id
        return df
    

# Run the cleaning pipeline on both frames; each call returns a new frame
# that replaces the previous df / train_df.
cleaner = CleanPreprocessor()
df = cleaner.cleaning(df)
train_df = cleaner.cleaning(train_df)

        


100%|██████████| 3/3 [00:00<00:00, 999.99it/s]
100%|██████████| 2471/2471 [00:03<00:00, 708.01it/s]


KeyboardInterrupt: 

In [39]:
# Shortcut: the Unicode cleaning above is slow, so load an already-cleaned
# copy of the training logs from disk instead of re-running it.
# TODO: DELETE THIS WHEN SUBMITTING
train_df = pd.read_csv('data/train_logs_clean.csv')

-------
### Text Essay Rebuilding
Work is taken from `text_process.py`

In [40]:
class TextProcessor:
    """String-editing and splitting helpers used when rebuilding essays."""

    # Token name -> punctuation character it stands for.
    PUNCTUATION_MAP = {
        "SPACE": " ",
        "COMMA": ",",
        "DOUBLE_QUOTE": '"',
        "PERIOD": ".",
        "PARENTHESES_OPEN": "(",
        "PARENTHESES_CLOSE": ")",
        "SQUARE_BRACKET_OPEN": "[",
        "SQUARE_BRACKET_CLOSE": "]",
        "CURLY_BRACKET_OPEN": "{",
        "CURLY_BRACKET_CLOSE": "}",
        "EXCLAMATION_MARK": "!",
        "QUESTION_MARK": "?",
    }

    def insert_text(self, text, s, pos):
        """Return ``text`` with ``s`` inserted at index ``pos``."""
        head, tail = text[:pos], text[pos:]
        return head + s + tail

    def remove_text(self, text, s, pos):
        """Return ``text`` without the ``len(s)`` characters starting at ``pos``."""
        end = pos + len(s)
        return text[:pos] + text[end:]

    def replace_text(self, text, s1, s2, pos):
        """Return ``text`` with ``len(s1)`` characters at ``pos`` replaced by ``s2``."""
        end = pos + len(s1)
        return text[:pos] + s2 + text[end:]

    def move_text(self, text, s, pos1, pos2):
        """Cut ``len(s)`` characters at ``pos1``, then insert ``s`` at ``pos2``."""
        without_piece = self.remove_text(text, s, pos1)
        return self.insert_text(without_piece, s, pos2)

    def _lowercase_chunks(self, s, separators):
        # Lower-case ``s``, turn every separator into "@" and return the
        # "@"-delimited pieces that are not blank.
        lowered = s.lower()
        for mark in separators:
            lowered = lowered.replace(mark, "@")
        return [piece for piece in lowered.split("@") if piece.strip()]

    def split_to_word(self, s):
        """Split lower-cased ``s`` into words at every PUNCTUATION_MAP character."""
        return self._lowercase_chunks(s, list(self.PUNCTUATION_MAP.values()))

    def split_to_sentence(self, s):
        """Split lower-cased ``s`` into sentences at ``.``, ``!`` and ``?``."""
        return self._lowercase_chunks(s, (".", "!", "?"))

    def split_to_paragraph(self, s):
        """Split lower-cased ``s`` into paragraphs.

        The delimiter is an ``n``, optional whitespace, then one or more
        ``n`` characters.
        """
        pieces = re.split(r'n\s*n+', s.lower())
        return [piece for piece in pieces if piece.strip()]

    def change_punctuation(self, text):
        """Spell out punctuation as lower-cased token names.

        Each character found in PUNCTUATION_MAP is replaced by its
        lower-cased key surrounded by spaces; whitespace runs are then
        collapsed to a single space and the result is stripped.
        """
        names = {symbol: label.lower()
                 for label, symbol in self.PUNCTUATION_MAP.items()}
        spelled = "".join(
            " " + names[ch] + " " if ch in names else ch
            for ch in text
        )
        return re.sub(r"\s+", " ", spelled).strip()

class EssayConstructor:
    """Rebuilds each essay's text by replaying its keystroke log."""

    def __init__(self):
        self.text_processor = TextProcessor()

    def recon_writing(self, df):
        """Replay the log and return the final text and simple counts per essay.

        Rows are processed in their current order and a new essay starts
        whenever ``id`` differs from the previous row, so rows of one essay
        are expected to be contiguous.

        Parameters
        ----------
        df : DataFrame with ``id``, ``activity``, ``text_change`` and
            ``cursor_position`` columns.

        Returns
        -------
        (texts, lengths, sentence_counts, paragraph_counts) : four parallel
        lists with one entry per essay, in order of appearance. All four are
        empty when ``df`` has no rows.

        Raises
        ------
        ValueError
            If a ``Replace`` event's text does not split into exactly two
            parts on ``" => "``.
        """
        res_all = []
        len_texts = []
        sentence_counts = []
        paragraph_counts = []

        def close_essay(text):
            # The statistics depend only on the final text, so they are
            # computed once per essay rather than after every event.
            res_all.append(text)
            len_texts.append(len(text))
            sentence_counts.append(
                len(self.text_processor.split_to_sentence(text)))
            paragraph_counts.append(
                len(self.text_processor.split_to_paragraph(text)))

        res = ""
        prev_idx = None  # None = no essay started yet

        temp_df = df[['id', 'activity', 'text_change',
                      'cursor_position']].values

        for row in tqdm(temp_df):
            idx = str(row[0])
            activity, text_change = str(row[1]), str(row[2])
            cursor_position = int(row[3])

            # new idx: store the finished essay and start from an empty text
            if idx != prev_idx:
                if prev_idx is not None:
                    close_essay(res)
                res = ""
                prev_idx = idx

            if activity == "Nonproduction":
                continue

            # "@" is reserved as a split separator by TextProcessor, and the
            # newline character is replaced by "n"
            text_change = text_change.replace("@", "/").replace("\n", "n")

            if activity in ("Input", "Paste"):
                # cursor_position is the position after the insertion
                res = self.text_processor.insert_text(
                    res, text_change, cursor_position - len(text_change)
                )

            elif activity == "Remove/Cut":
                res = self.text_processor.remove_text(
                    res, text_change, cursor_position
                )

            elif activity == "Replace":
                before, after = text_change.split(" => ")
                res = self.text_processor.replace_text(
                    res, before, after, cursor_position - len(after)
                )

            elif "Move" in activity:
                pos = [int(s) for s in re.findall(r"\d+", activity)]
                # pos 0 start pos1 end pos2 start pos3 end
                res = self.text_processor.move_text(
                    res, text_change, pos[0], pos[2]
                )

        # append last essay data (nothing to append for an empty log)
        if prev_idx is not None:
            close_essay(res)

        return res_all, len_texts, sentence_counts, paragraph_counts


In [41]:
# Rebuild the essays for the frame held in `df` and collect the results in one
# table (one row per id). The ids come from df["id"].unique(), which lines up
# with the reconstructed lists as long as each id's rows are contiguous.
essay_constructor = EssayConstructor()
reconstructed_texts, len_texts, sentence_counts, paragraph_counts = essay_constructor.recon_writing(
    df)
idx = df["id"].unique()
result_df = pd.DataFrame({"id": idx, "text": reconstructed_texts, "len_text": len_texts,
                         "sentence_count": sentence_counts, "paragraph_count": paragraph_counts})

extracted_text = result_df

# Same reconstruction for the training logs.
reconstructed_texts, len_texts, sentence_counts, paragraph_counts = essay_constructor.recon_writing(
    train_df)
idx = train_df["id"].unique()
extracted_text_train = pd.DataFrame({"id": idx, "text": reconstructed_texts, "len_text": len_texts,
                         "sentence_count": sentence_counts, "paragraph_count": paragraph_counts})


100%|██████████| 6/6 [00:00<?, ?it/s]
100%|██████████| 7702047/7702047 [00:40<00:00, 187927.81it/s]


-------------------
## Feature Engineering
### Behaviour Feature

This part is taken from `feature_extraction.ipynb`

## 2. Feature Extraction Functions

Extract different behavioural features from keystroke logs.
We want to capture:

- pauses (when people are thinking)
- bursts (when they're typing continuously)
- editing behaviour (how much they revise)
- cursor movement (planning vs going back to edit)

### 2.1 Base Features


In [42]:
def extract_features(df):
    """Aggregate per-id base statistics and a set of derived ratios.

    Expects the columns ``id``, ``event_id``, ``up_time``, ``word_count``,
    ``action_time``, ``activity`` and ``cursor_position``. Returns one row
    per id; missing aggregates (e.g. the std of a single event) become 0.
    """

    def occurrences(label):
        # Aggregator counting the activities equal to ``label``.
        return lambda acts: (acts == label).sum()

    per_id = df.groupby("id").agg(
        # Event volume and basic timing stats
        events_count=('event_id', 'count'),
        total_time=('up_time', 'max'),
        total_chars=('word_count', 'max'),  # note: taken from word_count
        mean_action_time=('action_time', 'mean'),
        std_action_time=('action_time', 'std'),
        max_action_time=('action_time', 'max'),
        min_action_time=('action_time', 'min'),

        # How often each kind of activity occurs
        backspace_count=('activity', occurrences("Remove/Cut")),
        paste_count=('activity', occurrences("Paste")),
        input_count=('activity', occurrences("Input")),
        move_count=('activity', lambda acts: acts.str.contains("Move", na=False).sum()),
        replace_count=('activity', occurrences("Replace")),
        nonproduction_count=('activity', occurrences("Nonproduction")),

        # Cursor position stats
        cursor_pos_mean=('cursor_position', 'mean'),
        cursor_pos_std=('cursor_position', 'std'),
        cursor_pos_max=('cursor_position', 'max'),

        # Word count stats
        word_count_mean=('word_count', 'mean'),
        word_count_std=('word_count', 'std'),
        word_count_diff=('word_count', lambda wc: wc.max() - wc.min()),
    )
    features = per_id.reset_index().fillna(0)

    # Shared denominators; the small epsilon / +1 terms avoid division by zero.
    minutes = features['total_time'] / 60000 + 1e-6
    events_plus_one = features['events_count'] + 1
    edits = features['backspace_count'] + features['replace_count']

    features['chars_per_min'] = features['total_chars'] / minutes
    features['events_per_min'] = features['events_count'] / minutes
    features['backspace_ratio'] = features['backspace_count'] / (features['input_count'] + 1)
    features['paste_ratio'] = features['paste_count'] / events_plus_one
    features['replace_ratio'] = features['replace_count'] / events_plus_one
    features['nonproduction_ratio'] = features['nonproduction_count'] / events_plus_one
    features['revision_ratio'] = edits / (features['total_chars'] + 1)

    return features

### 2.2 Pause Features


In [43]:
def pause_features(df):
    """Per-id statistics of the inter-keystroke interval (IKI).

    The IKI is the difference between consecutive ``down_time`` values of
    the same id after sorting by ``id`` and ``down_time``. Returns a tuple
    of eight Series indexed by id: counts of IKIs above 2s, 5s and 10s,
    then mean, median, std, max and min.
    """
    ordered = df.sort_values(["id", "down_time"]).copy()
    ordered["iki"] = ordered.groupby("id")["down_time"].diff()
    iki_by_id = ordered.groupby("id")["iki"]

    def count_longer_than(threshold_ms, label):
        # Number of intervals strictly above the threshold, per id.
        return iki_by_id.apply(lambda gaps: (gaps > threshold_ms).sum()).rename(label)

    return (
        count_longer_than(2000, "pause_2s_count"),
        count_longer_than(5000, "pause_5s_count"),
        count_longer_than(10000, "pause_10s_count"),
        iki_by_id.mean().rename("mean_pause"),
        iki_by_id.median().rename("median_pause"),
        iki_by_id.std().rename("std_pause"),
        iki_by_id.max().rename("max_pause"),
        iki_by_id.min().rename("min_pause"),
    )


def burst_features(df):
    """Per-id statistics of burst lengths.

    A new burst starts at every event whose gap to the previous
    ``down_time`` of the same id exceeds 2000. Returns three Series indexed
    by id: mean, max and std of the number of events per burst.
    """
    ordered = df.sort_values(["id", "down_time"]).copy()
    ordered["iki"] = ordered.groupby("id")["down_time"].diff()

    # 1 marks the first event after a long pause; the running total per id
    # therefore labels the burst each event belongs to.
    ordered["burst"] = ordered["iki"].gt(2000).astype(int)
    ordered["burst_id"] = ordered.groupby("id")["burst"].cumsum()

    events_per_burst = ordered.groupby(["id", "burst_id"]).size()
    per_id = events_per_burst.groupby(level="id")

    return (
        per_id.mean().rename("avg_burst"),
        per_id.max().rename("max_burst"),
        per_id.std().rename("std_burst"),
    )


def p_burst_features(df):
    """Average word-count range per P-burst, for each id.

    P-bursts are delimited by gaps of more than 2000 between consecutive
    ``down_time`` values of the same id. For every burst the spread
    ``max(word_count) - min(word_count)`` is taken, and the mean spread per
    id is returned as a Series named ``avg_words_per_p_burst``.
    """
    ordered = df.sort_values(["id", "down_time"]).copy()
    ordered["iki"] = ordered.groupby("id")["down_time"].diff()

    # Flag the first event after a long pause, then number the bursts per id.
    ordered["p_burst"] = ordered["iki"].gt(2000).astype(int)
    ordered["p_burst_id"] = ordered.groupby("id")["p_burst"].cumsum()

    words_by_burst = ordered.groupby(["id", "p_burst_id"])["word_count"]
    spread_per_burst = words_by_burst.apply(lambda wc: wc.max() - wc.min())

    mean_spread = spread_per_burst.groupby(level="id").mean()
    return mean_spread.rename("avg_words_per_p_burst")

### 2.3 Activity Sequence & Text Change Features


In [44]:
def _longest_run(activities, label):
    """Return the length of the longest run of consecutive ``label`` entries."""
    best = current = 0
    for act in activities:
        current = current + 1 if act == label else 0
        if current > best:
            best = current
    return best


def activity_sequence_features(df):
    """Get features from activity patterns and transitions.

    For every id (in order of first appearance) the events are sorted by
    ``down_time`` and the following are computed: counts of four specific
    activity transitions, the longest ``Input`` and ``Remove/Cut`` streaks,
    the number of distinct activities, and how often the activity changes
    between consecutive events (absolute and relative to the event count).

    The frame is split with a single ``groupby`` instead of re-filtering the
    whole frame once per id. Rows whose id is missing are not part of any
    group.

    Returns a DataFrame with one row per id.
    """
    features = []

    for id_val, id_df in df.groupby('id', sort=False):
        activities = id_df.sort_values('down_time')['activity'].values

        # One pass over consecutive pairs: count each transition and the
        # number of times the activity changes.
        transitions = {}
        activity_switches = 0
        for prev_act, next_act in zip(activities[:-1], activities[1:]):
            pair = (prev_act, next_act)
            transitions[pair] = transitions.get(pair, 0) + 1
            if prev_act != next_act:
                activity_switches += 1

        n_events = len(activities)

        features.append({
            'id': id_val,
            'input_to_remove_trans': transitions.get(('Input', 'Remove/Cut'), 0),
            'remove_to_input_trans': transitions.get(('Remove/Cut', 'Input'), 0),
            'input_to_input_trans': transitions.get(('Input', 'Input'), 0),
            'paste_to_input_trans': transitions.get(('Paste', 'Input'), 0),
            'max_input_streak': _longest_run(activities, 'Input'),
            'max_remove_streak': _longest_run(activities, 'Remove/Cut'),
            'unique_activities': len(set(activities)),
            'activity_switches': activity_switches,
            'activity_switch_rate': activity_switches / n_events if n_events > 0 else 0
        })

    return pd.DataFrame(features)


def text_change_features(df):
    """Per-id features describing how the word count evolves.

    After sorting by ``id`` and ``down_time``, the event-to-event difference
    of ``word_count`` (0 for the first event of each id) is summarised:
    totals, means, extremes, spread and counts of positive and negative
    changes, plus three derived ratios.
    """
    ordered = df.sort_values(['id', 'down_time']).copy()

    # Signed change in word count caused by each event.
    ordered['wc_delta'] = ordered.groupby('id')['word_count'].diff().fillna(0)

    def gains(delta):
        return delta[delta > 0]

    def losses(delta):
        return delta[delta < 0]

    features = ordered.groupby('id').agg(
        total_text_produced=('wc_delta', lambda d: gains(d).sum()),
        total_text_removed=('wc_delta', lambda d: abs(losses(d).sum())),
        text_production_rate=('wc_delta', lambda d: gains(d).mean()),
        text_removal_rate=('wc_delta', lambda d: losses(d).mean()),
        max_text_addition=('wc_delta', 'max'),
        max_text_removal=('wc_delta', 'min'),
        text_volatility=('wc_delta', 'std'),
        positive_text_changes=('wc_delta', lambda d: (d > 0).sum()),
        negative_text_changes=('wc_delta', lambda d: (d < 0).sum()),
    ).reset_index()

    produced = features['total_text_produced']
    removed = features['total_text_removed']

    # Derived ratios (+1 in the denominators avoids division by zero)
    features['text_removal_ratio'] = removed / (produced + 1)
    features['net_text_production'] = produced - removed
    features['text_efficiency'] = produced / (features['positive_text_changes'] + 1)

    return features

### 2.4 Temporal & Velocity Features


In [45]:
def time_based_features(df):
    """Features based on when things happen (early, middle, late).

    Each event gets the percentile rank of its ``down_time`` within its id.
    Events with rank <= 0.33 form the early phase, ranks in (0.33, 0.67]
    the middle phase and ranks > 0.67 the late phase. Per id the function
    reports the number of events per phase and the share of selected
    activities inside each phase (with +1 in every denominator).

    The frame is split with a single ``groupby`` instead of re-filtering the
    whole frame once per id. Rows whose id is missing are not part of any
    group.

    Returns a DataFrame with one row per id, ordered by id.
    """
    df = df.sort_values(['id', 'down_time']).copy()

    # Position of every event within its own session, as a fraction in (0, 1]
    df['time_percentile'] = df.groupby('id')['down_time'].rank(pct=True)

    def share(phase, activity):
        # Fraction of the phase's events with the given activity (+1 smoothing).
        return (phase['activity'] == activity).sum() / (len(phase) + 1)

    features = []
    for id_val, id_df in df.groupby('id', sort=False):
        percentile = id_df['time_percentile']

        # Split into early, middle, and late phases
        early_phase = id_df[percentile <= 0.33]
        middle_phase = id_df[(percentile > 0.33) & (percentile <= 0.67)]
        late_phase = id_df[percentile > 0.67]

        features.append({
            'id': id_val,
            'early_events': len(early_phase),
            'middle_events': len(middle_phase),
            'late_events': len(late_phase),
            'early_input_ratio': share(early_phase, 'Input'),
            'middle_input_ratio': share(middle_phase, 'Input'),
            'late_input_ratio': share(late_phase, 'Input'),
            'early_remove_ratio': share(early_phase, 'Remove/Cut'),
            'late_remove_ratio': share(late_phase, 'Remove/Cut'),
            'middle_paste_ratio': share(middle_phase, 'Paste'),
            'late_phase_activity': len(late_phase) / (len(id_df) + 1),
        })

    return pd.DataFrame(features)


def keystroke_velocity_features(df):
    """Typing-speed features computed on ``Input`` events.

    The inter-keystroke interval is the gap to the previous event of the
    same id (whatever its activity); only the intervals attached to
    ``Input`` events are then summarised. Returns an empty DataFrame when
    the log contains no ``Input`` event at all.
    """
    ordered = df.sort_values(['id', 'down_time']).copy()
    ordered['iki'] = ordered.groupby('id')['down_time'].diff()

    # Keep only the real typing events
    typed = ordered[ordered['activity'] == 'Input'].copy()
    if len(typed) == 0:
        return pd.DataFrame()

    features = typed.groupby('id').agg(
        input_iki_mean=('iki', 'mean'),
        input_iki_std=('iki', 'std'),
        input_iki_median=('iki', 'median'),
        input_iki_min=('iki', 'min'),
        input_iki_max=('iki', 'max'),
        fast_keystrokes=('iki', lambda gaps: (gaps < 100).sum()),
        moderate_keystrokes=('iki', lambda gaps: ((gaps >= 100) & (gaps <= 1000)).sum()),
        slow_keystrokes=('iki', lambda gaps: (gaps > 1000).sum()),
    ).reset_index()

    # Shared denominator: all classified keystrokes, +1 to avoid dividing by 0
    classified_plus_one = (
        features['fast_keystrokes']
        + features['moderate_keystrokes']
        + features['slow_keystrokes']
        + 1
    )

    features['keystroke_consistency'] = features['input_iki_std'] / (features['input_iki_mean'] + 1)
    features['fast_keystroke_ratio'] = features['fast_keystrokes'] / classified_plus_one
    features['typing_rhythm_score'] = features['moderate_keystrokes'] / classified_plus_one

    return features


def word_count_velocity_features(df):
    """Features about how fast the word count changes.

    For each id the velocity between consecutive events (sorted by
    ``down_time``) is ``diff(word_count) / (diff(down_time) + 1)``; the +1
    avoids division by zero for simultaneous events. Mean, max, min, std
    and the share of positive velocities are reported. An id with a single
    event gets 0 for every feature.

    The frame is split with a single ``groupby`` instead of re-filtering the
    whole frame once per id. Rows whose id is missing are not part of any
    group.

    Returns a DataFrame with one row per id, ordered by id.
    """
    df = df.sort_values(['id', 'down_time']).copy()

    features = []
    for id_val, id_df in df.groupby('id', sort=False):
        word_counts = id_df['word_count'].values
        time_stamps = id_df['down_time'].values

        if len(word_counts) <= 1:
            # No consecutive pair to measure a velocity on
            features.append({
                'id': id_val,
                'avg_word_velocity': 0,
                'max_word_velocity': 0,
                'min_word_velocity': 0,
                'std_word_velocity': 0,
                'positive_velocity_ratio': 0
            })
            continue

        # How fast are words being added
        word_velocity = np.diff(word_counts) / (np.diff(time_stamps) + 1)

        features.append({
            'id': id_val,
            'avg_word_velocity': np.mean(word_velocity),
            'max_word_velocity': np.max(word_velocity),
            'min_word_velocity': np.min(word_velocity),
            'std_word_velocity': np.std(word_velocity),
            'positive_velocity_ratio': (word_velocity > 0).sum() / len(word_velocity)
        })

    return pd.DataFrame(features)


def activity_timing_features(df):
    """How much time is spent on each type of activity.

    For each id the ``action_time`` values are summed per activity
    (``Input``, ``Remove/Cut``, ``Paste``, ``Nonproduction``) and expressed
    as ratios of the id's total ``action_time`` (+1 in the denominator to
    avoid division by zero).

    The frame is split with a single ``groupby`` instead of re-filtering the
    whole frame once per id. Rows whose id is missing are not part of any
    group.

    Returns a DataFrame with one row per id, ordered by id.
    """
    df = df.sort_values(['id', 'down_time']).copy()

    features = []
    for id_val, id_df in df.groupby('id', sort=False):
        activity = id_df['activity']
        action_time = id_df['action_time']

        # Add up time for each activity
        input_time = action_time[activity == 'Input'].sum()
        remove_time = action_time[activity == 'Remove/Cut'].sum()
        paste_time = action_time[activity == 'Paste'].sum()
        nonprod_time = action_time[activity == 'Nonproduction'].sum()

        total_time = action_time.sum()

        features.append({
            'id': id_val,
            'input_time_total': input_time,
            'remove_time_total': remove_time,
            'paste_time_total': paste_time,
            'nonprod_time_total': nonprod_time,
            'input_time_ratio': input_time / (total_time + 1),
            'remove_time_ratio': remove_time / (total_time + 1),
            'productive_time_ratio': (input_time + paste_time) / (total_time + 1),
        })

    return pd.DataFrame(features)

### 2.5 Revision & Cursor Movement Features


In [46]:
def revision_pattern_features(df):
    """Features about revision behaviour and editing patterns.

    Per id (events sorted by ``down_time``):

    - every ``Remove/Cut`` or ``Replace`` event with a positive
      ``word_count`` is bucketed by ``cursor_position / (word_count + 1)``
      into start (< 0.33), middle (< 0.67) or end;
    - ``review_cycles`` counts edits (``Remove/Cut``/``Replace``) that
      follow at least one ``Input`` since the previous counted edit;
    - ``backward_movements`` counts consecutive events where the cursor
      position decreases.

    The frame is split with a single ``groupby`` instead of re-filtering the
    whole frame once per id. Rows whose id is missing are not part of any
    group.

    Returns a DataFrame with one row per id, ordered by id.
    """
    df = df.sort_values(['id', 'down_time']).copy()
    edit_activities = ('Remove/Cut', 'Replace')

    features = []
    for id_val, id_df in df.groupby('id', sort=False):
        cursor_positions = id_df['cursor_position'].values
        activities = id_df['activity'].values
        word_counts = id_df['word_count'].values

        # Count edits at start, middle, and end
        revisions_start = 0
        revisions_middle = 0
        revisions_end = 0

        for pos, act, wc in zip(cursor_positions, activities, word_counts):
            if act in edit_activities and wc > 0:
                # NOTE(review): this divides a cursor position by a word
                # count; confirm that mixing the two units is intended.
                relative_pos = pos / (wc + 1)
                if relative_pos < 0.33:
                    revisions_start += 1
                elif relative_pos < 0.67:
                    revisions_middle += 1
                else:
                    revisions_end += 1

        # Look for write-then-edit cycles
        review_cycles = 0
        in_writing = False
        for act in activities:
            if act == 'Input':
                in_writing = True
            elif act in edit_activities and in_writing:
                review_cycles += 1
                in_writing = False

        # How often they go backwards to edit
        backward_movements = sum(
            1
            for previous, current in zip(cursor_positions[:-1], cursor_positions[1:])
            if current < previous
        )

        total_revisions = revisions_start + revisions_middle + revisions_end

        features.append({
            'id': id_val,
            'revisions_at_start': revisions_start,
            'revisions_at_middle': revisions_middle,
            'revisions_at_end': revisions_end,
            'total_revisions': total_revisions,
            'review_cycles': review_cycles,
            'backward_movements': backward_movements,
            'early_revision_ratio': revisions_start / (total_revisions + 1),
            'end_revision_ratio': revisions_end / (total_revisions + 1),
            'revision_density': total_revisions / (len(id_df) + 1),
        })

    return pd.DataFrame(features)


def cursor_movement_features(df):
    """Per-id features describing how far and where the cursor moves.

    A cursor jump is the absolute change of ``cursor_position`` between
    consecutive events of the same id (sorted by ``down_time``). Jump
    statistics are combined with the share of events where the cursor
    position equals ``word_count`` and the share where it is 0.
    """
    ordered = df.sort_values(['id', 'down_time']).copy()
    ordered['cursor_jump'] = ordered.groupby('id')['cursor_position'].diff().abs()

    features = ordered.groupby('id').agg(
        avg_cursor_jump=('cursor_jump', 'mean'),
        max_cursor_jump=('cursor_jump', 'max'),
        total_cursor_movement=('cursor_jump', 'sum'),
        small_cursor_jumps=('cursor_jump', lambda jumps: (jumps <= 5).sum()),
        medium_cursor_jumps=('cursor_jump', lambda jumps: ((jumps > 5) & (jumps <= 50)).sum()),
        large_cursor_jumps=('cursor_jump', lambda jumps: (jumps > 50).sum()),
        cursor_jump_std=('cursor_jump', 'std'),
    ).reset_index()

    # Share of events with the cursor at the "end" / at position 0. The mean
    # of a boolean flag per id equals (number of True) / (number of events).
    at_end_flag = ordered['cursor_position'] == ordered['word_count']
    at_start_flag = ordered['cursor_position'] == 0
    cursor_at_end = at_end_flag.groupby(ordered['id']).mean().rename('cursor_at_end_ratio')
    cursor_at_start = at_start_flag.groupby(ordered['id']).mean().rename('cursor_at_start_ratio')

    for ratio in (cursor_at_end, cursor_at_start):
        features = features.merge(ratio, on='id', how='left')

    # Forward-writing tendency is reported as the same value as the
    # cursor-at-end ratio.
    features['forward_writing_tendency'] = features['cursor_at_end_ratio']
    features['navigation_complexity'] = features['large_cursor_jumps'] / (features['total_cursor_movement'] + 1)

    return features

### 2.6 Rolling Window & Distribution Features


In [47]:
def rolling_features(df, window=10):
    """Look at trends over time using a sliding window.

    For each id (events sorted by ``down_time``) a rolling window of
    ``window`` events (``min_periods=1``) is applied to ``action_time`` and
    ``word_count``. Reported are the mean of the rolling mean and rolling
    std of ``action_time``, the last-minus-first rolling mean of both
    columns (the "trends") and the mean step of the rolling ``action_time``
    mean (the "acceleration").

    Ids with fewer than ``window`` events fall back to the plain mean/std of
    ``action_time`` and 0 for the trend and acceleration features.

    The frame is split with a single ``groupby`` instead of re-filtering the
    whole frame once per id. Rows whose id is missing are not part of any
    group.

    Returns a DataFrame with one row per id, ordered by id.
    """
    df = df.sort_values(['id', 'down_time']).copy()

    features = []
    for id_val, id_df in df.groupby('id', sort=False):
        if len(id_df) < window:
            # Too short for a full window: plain statistics, no trend
            features.append({
                'id': id_val,
                'action_time_rolling_mean': id_df['action_time'].mean(),
                'action_time_rolling_std': id_df['action_time'].std(),
                'word_count_rolling_trend': 0,
                'action_time_trend': 0,
                'action_time_acceleration': 0
            })
            continue

        # Calculate moving averages (each rolling series is computed once)
        action_rolling = id_df['action_time'].rolling(window=window, min_periods=1)
        action_mean = action_rolling.mean()
        word_mean = id_df['word_count'].rolling(window=window, min_periods=1).mean()

        features.append({
            'id': id_val,
            'action_time_rolling_mean': action_mean.mean(),
            'action_time_rolling_std': action_rolling.std().mean(),
            # Are things speeding up or slowing down: last minus first value
            'word_count_rolling_trend': word_mean.iloc[-1] - word_mean.iloc[0],
            'action_time_trend': action_mean.iloc[-1] - action_mean.iloc[0],
            'action_time_acceleration': action_mean.diff().mean()
        })

    return pd.DataFrame(features)


def action_time_distribution_features(df):
    """Describe the shape of each essay's ``action_time`` distribution.

    Returns one row per ``id`` with the 25th and 75th percentiles, the
    inter-quartile range, the skewness and the kurtosis of ``action_time``.
    """
    def lower_quartile(values):
        return values.quantile(0.25)

    def upper_quartile(values):
        return values.quantile(0.75)

    def interquartile_range(values):
        return values.quantile(0.75) - values.quantile(0.25)

    def skewness(values):
        return values.skew()

    def kurtosis(values):
        return values.kurtosis()

    # (output column name, aggregator) pairs; the name comes from the tuple,
    # not from the function.
    summaries = [
        ('action_time_q25', lower_quartile),
        ('action_time_q75', upper_quartile),
        ('action_time_iqr', interquartile_range),
        ('action_time_skew', skewness),
        ('action_time_kurtosis', kurtosis),
    ]

    per_id = df.groupby('id')['action_time'].agg(summaries)
    return per_id.reset_index()

### 2.7 Advanced Event Timing Features


## 3. Main Feature Builder


In [48]:
def build_all_features(df):
    """
    Run every feature extractor on the log data and join the results on ``id``.

    Parameters:
    -----------
    df : DataFrame
        Input log data with columns: id, event_id, down_time, up_time,
        action_time, activity, cursor_position, word_count

    Returns:
    --------
    DataFrame with one row per id from the base features; every other
    feature group is left-joined onto it, and NaN / +-inf values are
    replaced by 0.
    """
    def single(frame):
        # Wrap a lone frame so every stage yields an iterable of frames.
        return [frame]

    def non_empty(frame):
        # The keystroke extractor may return an empty frame; skip the join then.
        return [] if frame.empty else [frame]

    print("Building all features...")

    print("  - Base features")
    features = extract_features(df)

    # Each stage is (progress line, thunk producing the frames to join).
    # Thunks keep the extractor calls lazy, so each runs right after its
    # progress line is printed.
    stages = [
        ("  - Pause features", lambda: pause_features(df)),
        ("  - Burst features", lambda: burst_features(df)),
        ("  - P-burst features", lambda: single(p_burst_features(df))),
        ("  - Activity sequence features", lambda: single(activity_sequence_features(df))),
        ("  - Text change features", lambda: single(text_change_features(df))),
        ("  - Time-based features", lambda: single(time_based_features(df))),
        ("  - Keystroke velocity features", lambda: non_empty(keystroke_velocity_features(df))),
        ("  - Revision pattern features", lambda: single(revision_pattern_features(df))),
        ("  - Cursor movement features", lambda: single(cursor_movement_features(df))),
        ("  - Rolling window features", lambda: single(rolling_features(df, window=10))),
        ("  - Action time distribution features", lambda: single(action_time_distribution_features(df))),
        ("  - Word count velocity features", lambda: single(word_count_velocity_features(df))),
        ("  - Activity timing features", lambda: single(activity_timing_features(df))),
    ]

    for banner, produce in stages:
        print(banner)
        for frame in produce():
            features = features.merge(frame, on="id", how="left")

    # Missing values first, then infinities, both mapped to 0.
    features = features.fillna(0).replace([np.inf, -np.inf], 0)

    print(f"\nTotal features extracted: {features.shape[1] - 1}")  # -1 for id column
    print(f"Total samples: {features.shape[0]}")

    return features

## 4. Load Data & Extract Features


In [49]:
# Reuse the in-memory log frame `df` under the name `logs`.
# Nothing is read from disk here and no copy is made: `logs` and `df`
# refer to the same DataFrame.


logs = df
print(f"Loaded {len(logs)} rows")
print(f"Unique IDs: {logs['id'].nunique()}")
print(f"\nColumns: {list(logs.columns)}")
logs.head()

Loaded 6 rows
Unique IDs: 3

Columns: ['id', 'event_id', 'down_time', 'up_time', 'action_time', 'activity', 'down_event', 'up_event', 'text_change', 'cursor_position', 'word_count', 'id_encoded']


Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,id_encoded
0,0000aaaa,1,0,85,85,Input,Space,Space,,0,0,0
1,0000aaaa,2,60000,60087,87,Input,Space,Space,,1,0,0
2,2222bbbb,1,0,67,67,Input,q,q,q,0,1,1
3,2222bbbb,2,0,46,46,Input,q,q,q,1,1,1
4,4444cccc,1,0,94,94,Input,Space,Space,,0,0,2


In [50]:
# Extract all behavioural features (one row per essay id) from the logs.
# NOTE: this variable uses the spelling "Behavioral"; a later cell re-exports
# it as `Behavioural_features`.
Behavioral_features_temp = build_all_features(logs)

Building all features...
  - Base features
  - Pause features
  - Burst features
  - P-burst features
  - Activity sequence features
  - Text change features
  - Time-based features
  - Keystroke velocity features
  - Revision pattern features
  - Cursor movement features
  - Rolling window features
  - Action time distribution features
  - Word count velocity features
  - Activity timing features

Total features extracted: 122
Total samples: 3


## 5. Inspect Results


In [51]:
# Show the feature-matrix shape, the full list of column names
# (including the `id` column), and the first few rows.
print(f"Feature matrix shape: {Behavioral_features_temp.shape}")
print(f"\nFeature names ({len(Behavioral_features_temp.columns)} total):")
print(list(Behavioral_features_temp.columns))
Behavioral_features_temp.head()

Feature matrix shape: (3, 123)

Feature names (123 total):
['id', 'events_count', 'total_time', 'total_chars', 'mean_action_time', 'std_action_time', 'max_action_time', 'min_action_time', 'backspace_count', 'paste_count', 'input_count', 'move_count', 'replace_count', 'nonproduction_count', 'cursor_pos_mean', 'cursor_pos_std', 'cursor_pos_max', 'word_count_mean', 'word_count_std', 'word_count_diff', 'chars_per_min', 'events_per_min', 'backspace_ratio', 'paste_ratio', 'replace_ratio', 'nonproduction_ratio', 'revision_ratio', 'pause_2s_count', 'pause_5s_count', 'pause_10s_count', 'mean_pause', 'median_pause', 'std_pause', 'max_pause', 'min_pause', 'avg_burst', 'max_burst', 'std_burst', 'avg_words_per_p_burst', 'input_to_remove_trans', 'remove_to_input_trans', 'input_to_input_trans', 'paste_to_input_trans', 'max_input_streak', 'max_remove_streak', 'unique_activities', 'activity_switches', 'activity_switch_rate', 'total_text_produced', 'total_text_removed', 'text_production_rate', 'text_rem

Unnamed: 0,id,events_count,total_time,total_chars,mean_action_time,std_action_time,max_action_time,min_action_time,backspace_count,paste_count,...,min_word_velocity,std_word_velocity,positive_velocity_ratio,input_time_total,remove_time_total,paste_time_total,nonprod_time_total,input_time_ratio,remove_time_ratio,productive_time_ratio
0,0000aaaa,2,60087,0,86.0,1.414214,87,85,0,0,...,0.0,0.0,0.0,172,0,0,0,0.99422,0.0,0.99422
1,2222bbbb,2,67,1,56.5,14.849242,67,46,0,0,...,0.0,0.0,0.0,113,0,0,0,0.991228,0.0,0.991228
2,4444cccc,2,94,1,75.0,26.870058,94,56,0,0,...,1.0,0.0,1.0,150,0,0,0,0.993377,0.0,0.993377


In [52]:
# Sanity-check the feature matrix.
# NOTE: despite the "per column" labels, both numbers printed below are
# grand totals over the whole frame (`.sum().sum()`), not per-column counts.
print("Missing values per column:")
print(Behavioral_features_temp.isnull().sum().sum())
print("\nInfinite values per column:")
# Only numeric columns are checked for infinities (np.isinf cannot handle the string `id` column).
print(np.isinf(Behavioral_features_temp.select_dtypes(include=[np.number])).sum().sum())
print("\nBasic statistics:")
Behavioral_features_temp.describe().T

Missing values per column:
0

Infinite values per column:
0

Basic statistics:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
events_count,3.0,2.000000,0.000000,2.000000,2.000000,2.000000,2.000000,2.000000
total_time,3.0,20082.666667,34644.771558,67.000000,80.500000,94.000000,30090.500000,60087.000000
total_chars,3.0,0.666667,0.577350,0.000000,0.500000,1.000000,1.000000,1.000000
mean_action_time,3.0,72.500000,14.908052,56.500000,65.750000,75.000000,80.500000,86.000000
std_action_time,3.0,14.377838,12.734468,1.414214,8.131728,14.849242,20.859650,26.870058
...,...,...,...,...,...,...,...,...
paste_time_total,3.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
nonprod_time_total,3.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
input_time_ratio,3.0,0.992942,0.001543,0.991228,0.992303,0.993377,0.993799,0.994220
remove_time_ratio,3.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


## 6. Save Features


In [53]:
# Publish the feature frame under the name that merge_preprocessed_data reads.
# This only binds a second name to the same DataFrame; nothing is copied or
# written to disk in this cell.
Behavioural_features = Behavioral_features_temp


## Summary

This notebook extracts **comprehensive behavioural features** from keystroke logging data. The features capture:

### Feature Categories (120+ features total):

1. **Base Features**: Event counts, total time, typing speed, activity ratios
2. **Pause Features**: Gaps between keystrokes at different thresholds (2s, 5s, 10s)
3. **Burst Features**: When they're typing continuously and how fluently
4. **Activity Sequence**: How activities transition from one to another, streaks, variety
5. **Text Change**: How fast they produce/remove text, editing efficiency
6. **Temporal Patterns**: What they do in early/middle/late stages
7. **Keystroke Velocity**: Typing speed variations, rhythm, consistency
8. **Word Count Velocity**: How the word count changes over time
9. **Activity Timing**: How much time on each type of activity
10. **Revision Patterns**: Where they edit, review cycles, going backwards
11. **Cursor Movement**: How they navigate around, jump distances
12. **Rolling Window**: Trends and changes in typing behaviour
13. **Distribution Features**: Statistical properties (skew, kurtosis, IQR)

### Output:

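- `Behavioural_features` (in-memory DataFrame) - One row per essay ID with all behavioural features. This integrated notebook keeps the features in memory and does not write `data/train_behaviour_features.csv`.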

### Next Steps:

- Combine with text features from `FeatureExtraction_Essay.ipynb`
- Merge with TF-IDF/SVD features from `tfidf/tfidf.ipynb`
- Build predictive models using these features


------------

### Essay Text Features

In [54]:
# reuse code from text_process
import sys
sys.path.append('..')
import numpy as np

def Text_Feature_Extraction(extracted_text, processor=None):
    """Compute word- and sentence-length statistics for every essay.

    Parameters
    ----------
    extracted_text : DataFrame
        Must contain a ``text`` column; all other columns are carried over
        unchanged. The index may be arbitrary (it is preserved).
    processor : object, optional
        Tokenizer exposing ``split_to_word(text)`` and
        ``split_to_sentence(text)``. Defaults to a new ``TextProcessor``.

    Returns
    -------
    DataFrame
        ``extracted_text`` without ``text`` plus the float columns
        ``word_count``, ``word_length_mean``, ``word_length_std``,
        ``sent_length_mean`` and ``sent_length_std``. These columns exist
        even when the input has no rows.

    Notes
    -----
    The standard deviations are sample standard deviations
    (``pd.Series.std``), so they are NaN for a text with exactly one word or
    one sentence, and 0 for a text with none.
    """
    if processor is None:
        processor = TextProcessor()

    def length_stats(lengths):
        # Mean and sample std of a list of lengths; (0.0, 0.0) when empty.
        if not lengths:
            return 0.0, 0.0
        return sum(lengths) / len(lengths), pd.Series(lengths).std()

    features = extracted_text.drop('text', axis=1)

    stats = {
        'word_count': [],
        'word_length_mean': [],
        'word_length_std': [],
        'sent_length_mean': [],
        'sent_length_std': [],
    }

    # Iterate over the values, not over range(len) labels: the frame's index
    # is not guaranteed to be 0..n-1 (e.g. after filtering rows).
    for text in extracted_text['text']:
        words = processor.split_to_word(text)
        sentences = processor.split_to_sentence(text)
        word_lengths = [len(w) for w in words]
        # Sentence length is measured in words.
        sent_lengths = [len(processor.split_to_word(s)) for s in sentences]

        word_mean, word_std = length_stats(word_lengths)
        sent_mean, sent_std = length_stats(sent_lengths)

        stats['word_count'].append(len(word_lengths))
        stats['word_length_mean'].append(word_mean)
        stats['word_length_std'].append(word_std)
        stats['sent_length_mean'].append(sent_mean)
        stats['sent_length_std'].append(sent_std)

    # Assign whole columns at once (aligned on the original index) instead of
    # writing cell by cell; float64 matches what row-wise .loc assignment gave.
    for name, values in stats.items():
        features[name] = pd.Series(values, index=features.index, dtype='float64')

    return features

# Text statistics for both essay sets. `extracted_text` is the set used as the
# test texts by the TF-IDF cells below; `extracted_text_train` is the training set.
Text_Essay_Features = Text_Feature_Extraction(extracted_text)
Train_Text_Essay_Features = Text_Feature_Extraction(extracted_text_train)

In [55]:
Text_Essay_Features

Unnamed: 0,id,len_text,sentence_count,paragraph_count,word_count,word_length_mean,word_length_std,sent_length_mean,sent_length_std
0,0000aaaa,2,0,0,0.0,0.0,0.0,0.0,0.0
1,2222bbbb,2,1,1,1.0,2.0,,1.0,
2,4444cccc,2,1,1,1.0,1.0,,1.0,


In [56]:
Train_Text_Essay_Features

Unnamed: 0,id,len_text,sentence_count,paragraph_count,word_count,word_length_mean,word_length_std,sent_length_mean,sent_length_std
0,001519c8,1528,14,3,256.0,4.835938,2.472682,18.285714,6.497675
1,0022f953,1675,16,1,330.0,3.900000,2.140718,20.625000,13.608208
2,0042269b,2587,19,6,408.0,5.254902,2.743538,21.473684,5.263801
3,0059420b,1154,13,1,208.0,4.485577,2.633816,16.000000,6.493587
4,0075873a,1425,16,5,255.0,4.450980,2.437516,15.937500,8.667708
...,...,...,...,...,...,...,...,...,...
2466,ffb8c745,1634,13,1,276.0,4.717391,2.606256,21.230769,6.622611
2467,ffbef7e5,2335,30,6,444.0,4.155405,2.163872,14.800000,7.526505
2468,ffccd6fd,2761,4,3,203.0,4.497537,2.464349,50.750000,10.719919
2469,ffec5b38,2552,27,1,417.0,4.997602,2.915887,15.444444,5.631869


---------
### TF-IDF Feature Extraction

### Work taken from `texts_tfidf.ipynb`

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle
from sklearn.decomposition import TruncatedSVD



# Keep only the id and raw essay text for the training and test sets.
texts_train = extracted_text_train[['id', 'text']]
texts_test = extracted_text[['id', 'text']]


# Refactored version: train and test are handled completely separately.


# Fit the vectorizer on the training texts only.
# Character n-grams of length 1 to 5, vocabulary capped at 30000 terms.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 5),
    max_features=30000,
    dtype=np.float32,
)
X_train_tfidf = vectorizer.fit_transform(texts_train['text'])

# Use at most 64 SVD components, but never fewer than 1, and stay below the
# number of TF-IDF features whenever there is more than one feature.
n_features = X_train_tfidf.shape[1]
svdsize = min(64, max(1, n_features - 1))

# NOTE: `vectorizer` and `svd` are rebound by the operation TF-IDF cell further
# down, so the test texts must be transformed before that cell runs.
svd = TruncatedSVD(
    n_components=svdsize,
    random_state=42,
    n_iter=7
)
X_train_svd = svd.fit_transform(X_train_tfidf)


# train_svd_df = pd.DataFrame(
#     X_train_svd,
#     columns=[f'{i:02d}' for i in range(svdsize)]
# )
# train_svd_df.insert(0, 'id', texts_train['id'].values)
# train_svd_df.to_csv("/data/train_tfidf_svd.csv", index=False)


In [58]:
# Project the test texts with the vectorizer and SVD fitted on the training
# texts (transform only; nothing is refitted here).
X_test_tfidf = vectorizer.transform(texts_test['text'])
X_test_svd = svd.transform(X_test_tfidf)


# Name the components '00', '01', ... and put the essay id in the first column.
svdsize = X_test_svd.shape[1]
test_svd_df = pd.DataFrame(
    X_test_svd,
    columns=[f'{i:02d}' for i in range(svdsize)]
)
test_svd_df.insert(0, 'id', texts_test['id'].values)
# Keep a dedicated name for the text features; `test_svd_df` is rebound by the
# operation TF-IDF test cell later on.
TFIDF_Features = test_svd_df

### Work taken from `operation_tfidf.ipynb`

In [59]:
# === TRAIN ===
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


# Only the essay id and the activity label are needed to build the
# per-essay operation strings.
activity_df = train_df[['id', 'activity']]

print(train_df.head(3))


def rebuild_text(grp):
    """Return the first characters of all ``grp['activity']`` labels, joined in order."""
    initials = (label[0] for label in grp['activity'])
    return "".join(initials)

# Collapse every essay's activity log into a single string of activity
# initials (one character per logged event).
operations = (
    activity_df.groupby('id')
               .apply(rebuild_text)
               .reset_index(name='operation')
)

print(operations.head(3))

# 3) TF-IDF (fit on the training set)
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 5),
    max_features=30000,
    dtype=np.float32,
)
X_tfidf = vectorizer.fit_transform(operations['operation'])


# 4) SVD (fit on the training set)
n_features = X_tfidf.shape[1]
# Clamp to at least one component, exactly as the essay-text TF-IDF cell does;
# without the clamp a one-term vocabulary would request zero components.
svdsize = min(64, max(1, n_features - 1))

svd = TruncatedSVD(
    n_components=svdsize,
    random_state=42,
    n_iter=7
)
X_svd = svd.fit_transform(X_tfidf)

# Name the components '00', '01', ... and put the essay id in the first column.
svd_df = pd.DataFrame(
    X_svd,
    columns=[f'{i:02d}' for i in range(svdsize)]
)
svd_df.insert(0, 'id', operations['id'].values)

print(svd_df.head())
train_svd_df_operation = svd_df


   Unnamed: 0        id  event_id  down_time  up_time  action_time activity  \
0           0  001519c8         1      60147    60238           91    Input   
1           1  001519c8         2      60657    60784          127    Input   
2           2  001519c8         3      60757    60861          104    Input   

  down_event up_event text_change  cursor_position  word_count  id_encoded  
0          q        q           q                1           1           0  
1          q        q           q                2           1           0  
2          q        q           q                3           1           0  
         id                                          operation
0  001519c8  IIIIIIIIIIIIIIIRIIIIIIIIIIIIIIRIIIIIIIIIIIIIII...
1  0022f953  IIIIIIIIIIIIIIIIIIIRRRRRRRRRRRIIIIIIIIIIIIIIRR...
2  0042269b  IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...
         id        00        01        02        03        04        05  \
0  001519c8  0.997578  0.063804 -0.024434 -0.005

In [60]:
# === TEST ===
import pandas as pd
import numpy as np
import pickle



# Activity labels of the test logs (`df`), keyed by essay id.
test_df_activity = df[['id', 'activity']]



# Same per-essay string of activity initials as was built for the training logs.
test_operations = (
    test_df_activity.groupby('id')
           .apply(rebuild_text)
           .reset_index(name='operation')
)

# 4) only transform, do not fit on tests
# `vectorizer` and `svd` here are the ones fitted on the training operation
# strings in the TRAIN cell above (that cell rebinds both names).
X_test_tfidf = vectorizer.transform(test_operations['operation'])
X_test_svd   = svd.transform(X_test_tfidf)

# Name the components '00', '01', ... and put the essay id in the first column.
svdsize = X_test_svd.shape[1]
test_svd_df = pd.DataFrame(
    X_test_svd,
    columns=[f'{i:02d}' for i in range(svdsize)]
)
test_svd_df.insert(0, 'id', test_operations['id'].values)
TFIDF_Features_Operations = test_svd_df

## Concat Data and send to model

In [61]:

def merge_preprocessed_data(data_dir='data'):
    """Inner-join the four in-memory feature tables on ``id``.

    Reads the notebook globals ``Behavioural_features``,
    ``Text_Essay_Features``, ``TFIDF_Features`` and
    ``TFIDF_Features_Operations``. The TF-IDF columns are prefixed with
    ``tfidf_text_`` / ``tfidf_operation_`` so the two sets of component
    names do not collide. ``data_dir`` is accepted but not used.
    """
    behaviour = Behavioural_features
    essay_text = Text_Essay_Features
    tfidf_text = TFIDF_Features
    tfidf_operation = TFIDF_Features_Operations

    def with_prefix(frame, prefix):
        # Prefix every column except the join key.
        return frame.rename(
            columns=lambda name: name if name == 'id' else f'{prefix}{name}'
        )

    combined = behaviour.merge(essay_text, on='id', how='inner')

    prefixed_tables = (
        with_prefix(tfidf_text, 'tfidf_text_'),
        with_prefix(tfidf_operation, 'tfidf_operation_'),
    )
    for table in prefixed_tables:
        combined = combined.merge(table, on='id', how='inner')

    return combined


# Build the merged feature table that the submission cell reads as `merged_df`.
# NOTE(review): presumably this guard is always true when the code runs as a
# notebook cell; the "data" argument is not used by merge_preprocessed_data.
if __name__ == '__main__':
    merged_df = merge_preprocessed_data("data")
    

## Test Submission only

In [62]:
# === TEST ONLY: load model(s) -> predict on test -> write submission ===
import os
import joblib
import numpy as np
import pandas as pd

# ---------------- paths ----------------
sub_path  = 'submission.csv'

# ---------------- load test ----------------
# The merged feature table is the model input; `id` is kept aside for the
# submission file and removed from the feature matrix.
df_test = merged_df
test_ids = df_test["id"].values
X_test = df_test.drop(columns=["id"])


model_file_candidate = "lgbm_.pkl"


model_obj = None
if os.path.exists(model_file_candidate):
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files from a trusted source.
    model_obj = joblib.load(model_file_candidate)
    print(f"Loaded model from: {model_file_candidate}")
if model_obj is None:
    # model_file_candidate is a single path string: concatenate it directly
    # (str.join on a string would insert ", " between its characters).
    raise FileNotFoundError("No saved model found. Expected one of: " + model_file_candidate)

def predict_with_model_obj(model_obj, X):
    """Predict with a single model or average the predictions of several.

    ``model_obj`` is either one object with a ``predict`` method or a
    list/tuple of such objects (e.g. one per fold); for a list/tuple the
    element-wise mean of the individual predictions is returned.
    """
    # Single model: delegate directly.
    if not isinstance(model_obj, (list, tuple)):
        return model_obj.predict(X)

    # Several fold models: average their predictions.
    fold_predictions = [member.predict(X) for member in model_obj]
    return np.mean(fold_predictions, axis=0)

# Predict with the loaded model (a single model, or the mean over a list/tuple of models).
test_preds = predict_with_model_obj(model_obj,X_test)

# ---------------- write submission ----------------
# One row per test essay: its id and the predicted score.
submission = pd.DataFrame({"id": test_ids, "score": test_preds})
submission.to_csv(sub_path, index=False)
print(f"Submission saved to: {sub_path}")

Loaded model from: lgbm_.pkl
Submission saved to: submission.csv
