# Integrated Notebook for Task WritingProcess
## Data Preprocessing

---------
### Idle Removing and Time Regularization from `preprocess.py`

In [None]:
# Switch between the Kaggle input mounts (True) and a local ``data/`` folder (False).
KAGGLE_ENVIRONMENT = False


# Dataset and output locations for the selected environment.
if KAGGLE_ENVIRONMENT:
    TEST_DATA_PATH = "/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv"
    TRAIN_DATA_PATH = "/kaggle/input/trainlogs-unicodefixed/train_logs_raw_unicode_fixed.csv"
    TRAIN_DATA_PREPROCESSED_PATH = "/kaggle/input/train-preprocessed/train_preprocessed.csv"
    # NOTE(review): named "score" but points at train_logs.csv — confirm this
    # should not be the scores file instead.
    TRAIN_SCORE_PATH = "/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv"
    SUBMISSION_PATH  = '/kaggle/working/submission.csv'
else:
    TEST_DATA_PATH = "data/test_logs.csv"
    TRAIN_DATA_PATH = "data/train_logs_raw_unicode_fixed.csv"
    TRAIN_DATA_PREPROCESSED_PATH = "data/train_preprocessed.csv"
    # NOTE(review): same as above — this is the logs file, not a scores file.
    TRAIN_SCORE_PATH = "data/train_logs.csv"
    SUBMISSION_PATH  = 'data/submission.csv'

`ftfy` takes a very long time to run, so it is temporarily disabled on the training set; a pre-processed (Unicode-fixed) dataset is uploaded to replace it.

In [1]:

# Install ftfy and its wcwidth dependency from the wheels shipped as a Kaggle
# dataset; ``--no-index`` keeps pip from contacting a package index.
if KAGGLE_ENVIRONMENT:
    !pip install /kaggle/input/ftfypkg/ftfy_pkg/wcwidth-0.2.14-py2.py3-none-any.whl --no-index --find-links /kaggle/input/ftfypkg/ftfy_pkg

    !pip install /kaggle/input/ftfypkg/ftfy_pkg/ftfy-6.3.1-py3-none-any.whl --no-index --find-links /kaggle/input/ftfypkg/ftfy_pkg



Looking in links: /kaggle/input/ftfypkg/ftfy_pkg
Processing /kaggle/input/ftfypkg/ftfy_pkg/wcwidth-0.2.14-py2.py3-none-any.whl
Installing collected packages: wcwidth
  Attempting uninstall: wcwidth
    Found existing installation: wcwidth 0.2.13
    Uninstalling wcwidth-0.2.13:
      Successfully uninstalled wcwidth-0.2.13
Successfully installed wcwidth-0.2.14
Looking in links: /kaggle/input/ftfypkg/ftfy_pkg
Processing /kaggle/input/ftfypkg/ftfy_pkg/ftfy-6.3.1-py3-none-any.whl
Installing collected packages: ftfy
Successfully installed ftfy-6.3.1


In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import ftfy
import warnings
from pathlib import Path
import re

warnings.filterwarnings('ignore')

In [3]:
class Preprocess:
    """Cleaning steps for raw keystroke logs.

    Provides id encoding, trimming of idle time before/after the writing
    session, and compression of long pauses between events.
    """

    def label_encoding(self, df, col="id"):
        """Add a ``<col>_encoded`` integer column derived from ``df[col]``.

        The input frame is modified in place and also returned.
        """
        df[col + "_encoded"] = LabelEncoder().fit_transform(df[col])
        return df

    # Remove the time before the author has started writing and after they
    # have finished.
    # reference: remove_margin for https://www.kaggle.com/code/tomooinubushi/1st-place-solution-training-and-inference-code

    def remove_start_and_end_time(
        self, df, start_margin=2 * 60 * 1000, end_margin=2 * 60 * 1000
    ):
        """Trim each log to the window around its real writing events.

        Rows whose ``up_event`` is ``"Unidentified"`` are dropped first. For
        every ``id_encoded`` group, the "valid" events are those that are
        neither ``Nonproduction`` activity nor ``Shift``/``CapsLock`` key
        presses. Only rows with ``down_time`` in
        ``(first_valid - start_margin, last_valid + end_margin]`` are kept,
        and ``event_id`` is renumbered from 0 within the group. Groups without
        any valid event are dropped entirely.

        Parameters
        ----------
        df : DataFrame with ``up_event``, ``activity``, ``down_time`` and
            ``id_encoded`` columns.
        start_margin, end_margin : margins added around the valid window, in
            the same unit as ``down_time``.

        Returns
        -------
        A new DataFrame with the kept rows of all groups concatenated; empty
        (with an ``event_id`` column) when no group has a valid event.
        """
        df = df[df["up_event"] != "Unidentified"].reset_index(drop=True)
        trimmed_logs = []

        for _, log in tqdm(df.groupby("id_encoded")):
            # The three exclusions must ALL hold. Joining them with `|` (as
            # before) is always true because an event cannot be both Shift
            # and CapsLock, which disabled the filter completely.
            is_writing = (
                (log["activity"] != "Nonproduction")
                & (log["up_event"] != "Shift")
                & (log["up_event"] != "CapsLock")
            )
            valid_times = log.loc[is_writing, "down_time"].values
            if len(valid_times) == 0:
                continue
            in_window = (log["down_time"] > valid_times.min() - start_margin) & (
                log["down_time"] <= valid_times.max() + end_margin
            )
            log = log[in_window].copy()
            log["event_id"] = range(len(log))
            trimmed_logs.append(log)

        if not trimmed_logs:
            # pd.concat raises on an empty list; hand back an empty frame
            # with the expected columns instead.
            empty = df.iloc[0:0].copy()
            empty["event_id"] = pd.Series(dtype="int64")
            return empty

        return pd.concat(trimmed_logs, ignore_index=True)

    def remove_rest_time(
        self, df, time_margin=1 * 60 * 1000, action_margin=5 * 60 * 1000
    ):
        """Rebuild ``down_time``/``up_time`` with long gaps clipped.

        Rows are walked in their existing order. Within one ``id_encoded``
        run, the gap to the previous key-down is clipped to
        ``[0, time_margin]`` and accumulated into a new timeline starting at
        0; the key-hold duration is clipped to ``[0, action_margin]``. The
        ``down_time`` and ``up_time`` columns of ``df`` are overwritten in
        place and ``df`` is returned.

        NOTE(review): this presumably expects the rows of each id to be
        contiguous and in chronological order — confirm against the caller.
        """
        down_times, up_times = [], []
        prev_idx = -1
        rows = df[["id_encoded", "down_time", "up_time"]].values
        for row in tqdm(rows):
            idx, down_time, up_time = int(row[0]), int(row[1]), int(row[2])
            if prev_idx != idx:
                # A new writer starts: restart the corrected clock at zero.
                prev_down_time = down_time
                prev_corrected_down_time = 0
            gap_down_time = np.clip(down_time - prev_down_time, 0, time_margin)
            action_time = np.clip(up_time - down_time, 0, action_margin)

            new_down_time = prev_corrected_down_time + gap_down_time
            new_up_time = new_down_time + action_time
            down_times.append(new_down_time)
            up_times.append(new_up_time)
            prev_idx, prev_corrected_down_time, prev_down_time = (
                idx,
                new_down_time,
                down_time,
            )
        df["down_time"], df["up_time"] = down_times, up_times
        return df

In [None]:
# Load the raw logs and run the idle-removal / time-regularisation steps on
# both the test and the training logs.
preprocessor = Preprocess()
# ------------------ Config dataset (In submission we only have test file) ----------------------------
df = pd.read_csv(TEST_DATA_PATH)
# ------------------ Config dataset (TFIDF has to be fit on train and transform on test) ----------------------------
train_df = pd.read_csv(TRAIN_DATA_PATH)
# The training logs above are the Unicode-fixed copy that replaces the
# original dataset, for fast processing and scoring.
train_score_df = pd.read_csv(TRAIN_SCORE_PATH)





# Test logs: encode ids, trim leading/trailing idle time, compress pauses.
df = preprocessor.label_encoding(df)
df = preprocessor.remove_start_and_end_time(df)
df = preprocessor.remove_rest_time(df)

# Training logs: identical pipeline.
train_df = preprocessor.label_encoding(train_df)
train_df = preprocessor.remove_start_and_end_time(train_df)
train_df = preprocessor.remove_rest_time(train_df)




-----------
### Event,Unicode Cleaning from `Preprocessing.ipynb`

In [None]:
def label_encoding(df, col="id"):
    """Attach an integer-encoded copy of ``df[col]`` as ``<col>_encoded``.

    The frame is modified in place and returned for chaining.
    """
    encoded_name = col + "_encoded"
    df[encoded_name] = LabelEncoder().fit_transform(df[col])
    return df


# Remove the time before the author has started writing and the time spent resting.
# reference: remove_margin for https://www.kaggle.com/code/tomooinubushi/1st-place-solution-training-and-inference-code

def remove_procrastination_time(df, start_margin=2*60*1000, end_margin=2*60*1000):
    """Keep, per writer, only the events around the actual writing window.

    ``Unidentified`` key-ups are discarded up front. For each ``id_encoded``
    group the window is bounded by the first and last event that is not
    ``Nonproduction``, ``Shift`` or ``CapsLock``, widened by the two margins
    (lower bound exclusive, upper bound inclusive). ``event_id`` is renumbered
    from 0 inside every kept group; groups with no such event are dropped.
    """
    df = df[df['up_event'] != 'Unidentified'].reset_index(drop=True)
    kept_logs = []

    for _, log in tqdm(df.groupby('id_encoded')):
        productive = (
            (log['activity'] != 'Nonproduction')
            & (log['up_event'] != 'Shift')
            & (log['up_event'] != 'CapsLock')
        )
        writing_times = log[productive]['down_time'].values
        if writing_times.size == 0:
            continue

        lower_bound = writing_times.min() - start_margin
        upper_bound = writing_times.max() + end_margin
        window = log[(log['down_time'] > lower_bound)
                     & (log['down_time'] <= upper_bound)].copy()
        window['event_id'] = range(len(window))
        kept_logs.append(window)

    return pd.concat(kept_logs, ignore_index=True)


In [None]:
class CleanPreprocessor:
    """Event-level cleaning: drop non-writing events and repair broken Unicode."""

    def cleaning(self,df,skipUnicodeFixing=False):
        """Return a cleaned copy of the keystroke log.

        Steps: encode ids, trim idle time, drop ``Nonproduction`` rows,
        optionally run ``ftfy.fix_text`` over the event/text columns, drop
        mouse clicks, and renumber ``event_id`` from 1 within each ``id``.
        """
        df = remove_procrastination_time(label_encoding(df))
        df = df[df['activity'] != 'Nonproduction' ].reset_index(drop=True)

        if not skipUnicodeFixing:
            cols = ['down_event', 'up_event', 'text_change']

            def fix_value(value):
                # Missing entries stay missing; everything else is repaired.
                return ftfy.fix_text(value) if value is not pd.NA else value

            def fix_column(series):
                return series.astype('string').map(fix_value)

            df.loc[:, cols] = df.loc[:, cols].apply(fix_column)

        df = df[~df['down_event'].isin(['LeftClick','RightClick'])]
        df['event_id'] = df.groupby('id').cumcount() + 1 # reset event_id
        return df.reset_index(drop=True)
    

# Clean both logs; the training set skips the ftfy pass (see the note near the
# top of the notebook about the pre-fixed training file).
cleaner = CleanPreprocessor()
df = cleaner.cleaning(df)
train_df = cleaner.cleaning(train_df,skipUnicodeFixing=True)

        


-------
### Text Essay Rebuilding
Work is taken from `text_process.py`

In [None]:
class TextProcessor:
    """String helpers used to replay edits and to split reconstructed essays."""

    # Symbolic name -> literal character for every punctuation mark handled.
    PUNCTUATION_MAP = {
        "SPACE": " ",
        "COMMA": ",",
        "DOUBLE_QUOTE": '"',
        "PERIOD": ".",
        "PARENTHESES_OPEN": "(",
        "PARENTHESES_CLOSE": ")",
        "SQUARE_BRACKET_OPEN": "[",
        "SQUARE_BRACKET_CLOSE": "]",
        "CURLY_BRACKET_OPEN": "{",
        "CURLY_BRACKET_CLOSE": "}",
        "EXCLAMATION_MARK": "!",
        "QUESTION_MARK": "?",
    }

    def insert_text(self, text, s, pos):
        """Return ``text`` with ``s`` inserted at index ``pos``."""
        head, tail = text[:pos], text[pos:]
        return head + s + tail

    def remove_text(self, text, s, pos):
        """Return ``text`` with ``len(s)`` characters removed starting at ``pos``."""
        end = pos + len(s)
        return text[:pos] + text[end:]

    def replace_text(self, text, s1, s2, pos):
        """Return ``text`` with the ``len(s1)`` characters at ``pos`` replaced by ``s2``."""
        end = pos + len(s1)
        return text[:pos] + s2 + text[end:]

    def move_text(self, text, s, pos1, pos2):
        """Cut ``len(s)`` characters at ``pos1`` and insert ``s`` at ``pos2``."""
        without_s = self.remove_text(text, s, pos1)
        return self.insert_text(without_s, s, pos2)

    def split_to_word(self, s):
        """Lower-case ``s`` and split it on every character of ``PUNCTUATION_MAP``."""
        lowered = s.lower()
        for mark in self.PUNCTUATION_MAP.values():
            lowered = lowered.replace(mark, "@")
        # Keep non-empty words
        return [piece for piece in re.split("@", lowered) if piece.strip()]

    def split_to_sentence(self, s):
        """Lower-case ``s`` and split it on ``.``, ``!`` and ``?``."""
        lowered = s.lower()
        for mark in (".", "!", "?"):
            lowered = lowered.replace(mark, "@")
        # Keep non-empty sentences
        return [piece for piece in re.split("@", lowered) if piece.strip()]

    def split_to_paragraph(self, s):
        """Lower-case ``s`` and split it wherever ``n``, optional whitespace, ``n`` occurs."""
        chunks = re.split(r'n\s*n+', s.lower())
        # Keep non-empty paragraphs
        return [chunk for chunk in chunks if chunk.strip()]

    def change_punctuation(self, text):
        """Spell out punctuation as lower-case names separated by single spaces."""
        names = {symbol: name.lower()
                 for name, symbol in self.PUNCTUATION_MAP.items()}
        expanded = "".join(
            ' ' + names[ch] + ' ' if ch in names else ch for ch in text
        )
        return re.sub(r"\s+", " ", expanded).strip()

class EssayConstructor:
    """Rebuild essay text by replaying the logged edit events in order."""

    def __init__(self):
        self.text_processor = TextProcessor()

    def recon_writing(self, df):
        """Replay every event in ``df`` and return the final state per essay.

        Rows are processed in their existing order; a change of ``id`` starts
        a new essay, so the rows of one essay are expected to be contiguous.

        Parameters
        ----------
        df : DataFrame with ``id``, ``activity``, ``text_change`` and
            ``cursor_position`` columns.

        Returns
        -------
        Four parallel lists, one entry per essay in order of appearance:
        reconstructed text, its length, its sentence count and its paragraph
        count. All four are empty when ``df`` has no rows.
        """
        res_all, len_texts = [], []
        sentence_counts, paragraph_counts = [], []

        rows = df[['id', 'activity', 'text_change', 'cursor_position']].values
        if len(rows) == 0:
            # Previously an empty frame hit an unbound `len_text` below.
            return res_all, len_texts, sentence_counts, paragraph_counts

        processor = self.text_processor
        res, len_text, sentence_count, paragraph_count = "", 0, 0, 0
        prev_idx = None

        for row in tqdm(rows):
            idx = str(row[0])
            activity, text_change = str(row[1]), str(row[2])
            cursor_position = int(row[3])

            if idx != prev_idx:
                if prev_idx is not None:
                    # Store the essay that has just ended.
                    res_all.append(res)
                    len_texts.append(len_text)
                    sentence_counts.append(sentence_count)
                    paragraph_counts.append(paragraph_count)

                res, len_text, sentence_count, paragraph_count = "", 0, 0, 0
                prev_idx = idx

            if activity == "Nonproduction":
                continue

            # "@" is the separator used by TextProcessor's splitters, and a
            # newline is represented by the single letter "n".
            text_change = text_change.replace("@", "/").replace("\n", "n")

            if activity in ("Input", "Paste"):
                # cursor_position is the position AFTER the insertion.
                res = processor.insert_text(
                    res, text_change, cursor_position - len(text_change)
                )
            elif activity == "Remove/Cut":
                res = processor.remove_text(res, text_change, cursor_position)
            elif activity == "Replace":
                before, after = text_change.split(" => ")
                res = processor.replace_text(
                    res, before, after, cursor_position - len(after)
                )
            elif "Move" in activity:
                # The four numbers in the activity string are:
                # source start, source end, target start, target end.
                pos = [int(s) for s in re.findall(r"\d+", activity)]
                res = processor.move_text(res, text_change, pos[0], pos[2])

            len_text = len(res)
            sentence_count = len(processor.split_to_sentence(res))
            paragraph_count = len(processor.split_to_paragraph(res))

        # Store the last essay.
        res_all.append(res)
        len_texts.append(len_text)
        sentence_counts.append(sentence_count)
        paragraph_counts.append(paragraph_count)

        return res_all, len_texts, sentence_counts, paragraph_counts


In [None]:
# Rebuild the essays for the test logs and collect them in one frame per id.
# NOTE(review): pairing the results with df["id"].unique() assumes each id's
# rows are contiguous, so both follow the same order of appearance — confirm.
essay_constructor = EssayConstructor()
reconstructed_texts, len_texts, sentence_counts, paragraph_counts = essay_constructor.recon_writing(
    df)
idx = df["id"].unique()
result_df = pd.DataFrame({"id": idx, "text": reconstructed_texts, "len_text": len_texts,
                         "sentence_count": sentence_counts, "paragraph_count": paragraph_counts})

extracted_text = result_df

# Same reconstruction for the training logs.
reconstructed_texts, len_texts, sentence_counts, paragraph_counts = essay_constructor.recon_writing(
    train_df)
idx = train_df["id"].unique()
extracted_text_train = pd.DataFrame({"id": idx, "text": reconstructed_texts, "len_text": len_texts,
                         "sentence_count": sentence_counts, "paragraph_count": paragraph_counts})


-------------------
## Feature Engineering
### Behaviour Feature

This part is taken from `feature_extraction.ipynb`

## 2. Feature Extraction Functions

Extract different behavioural features from keystroke logs.
We want to capture:

- pauses (when people are thinking)
- bursts (when they're typing continuously)
- editing behaviour (how much they revise)
- cursor movement (planning vs going back to edit)

### 2.1 Base Features


In [None]:
# Display the cleaned training logs for a quick visual check.
train_df

In [None]:
def extract_features(df):
    """Aggregate the per-event log into one row of base statistics per ``id``.

    Produces event counts, action-time statistics, per-activity counts,
    cursor-position and word-count statistics, then derives rate and ratio
    columns from them. Missing aggregates are replaced with 0 before the
    ratios are computed.
    """
    aggregations = {
        # Volume and timing
        'events_count': ('event_id', 'count'),
        'total_time': ('up_time', 'max'),
        'total_chars': ('word_count', 'max'),
        'mean_action_time': ('action_time', 'mean'),
        'std_action_time': ('action_time', 'std'),
        'max_action_time': ('action_time', 'max'),
        'min_action_time': ('action_time', 'min'),
        # How often each activity type occurs
        'backspace_count': ('activity', lambda acts: (acts == "Remove/Cut").sum()),
        'paste_count': ('activity', lambda acts: (acts == "Paste").sum()),
        'input_count': ('activity', lambda acts: (acts == "Input").sum()),
        'move_count': ('activity', lambda acts: acts.str.contains("Move", na=False).sum()),
        'replace_count': ('activity', lambda acts: (acts == "Replace").sum()),
        'nonproduction_count': ('activity', lambda acts: (acts == "Nonproduction").sum()),
        # Cursor position
        'cursor_pos_mean': ('cursor_position', 'mean'),
        'cursor_pos_std': ('cursor_position', 'std'),
        'cursor_pos_max': ('cursor_position', 'max'),
        # Word count
        'word_count_mean': ('word_count', 'mean'),
        'word_count_std': ('word_count', 'std'),
        'word_count_diff': ('word_count', lambda wc: wc.max() - wc.min()),
    }
    features = df.groupby("id").agg(**aggregations).reset_index().fillna(0)

    # Session length in minutes; the epsilon avoids division by zero.
    minutes = features['total_time'] / 60000 + 1e-6
    events_plus_one = features['events_count'] + 1
    revisions = features['backspace_count'] + features['replace_count']

    features['chars_per_min'] = features['total_chars'] / minutes
    features['events_per_min'] = features['events_count'] / minutes
    features['backspace_ratio'] = features['backspace_count'] / (features['input_count'] + 1)
    features['paste_ratio'] = features['paste_count'] / events_plus_one
    features['replace_ratio'] = features['replace_count'] / events_plus_one
    features['nonproduction_ratio'] = features['nonproduction_count'] / events_plus_one
    features['revision_ratio'] = revisions / (features['total_chars'] + 1)

    return features

### 2.2 Pause Features


In [None]:
def pause_features(df):
    """Summarise inter-keystroke intervals (``down_time`` gaps) per ``id``.

    Returns a tuple of eight Series indexed by ``id``: the number of gaps
    above 2000, 5000 and 10000, followed by the mean, median, standard
    deviation, maximum and minimum gap.
    """
    ordered = df.sort_values(["id", "down_time"]).copy()
    ordered["iki"] = ordered.groupby("id")["down_time"].diff()
    gaps = ordered.groupby("id")["iki"]

    def count_longer_than(limit, label):
        # Number of gaps strictly above ``limit`` for every id.
        return gaps.apply(lambda g: (g > limit).sum()).rename(label)

    return (
        count_longer_than(2000, "pause_2s_count"),
        count_longer_than(5000, "pause_5s_count"),
        count_longer_than(10000, "pause_10s_count"),
        gaps.mean().rename("mean_pause"),
        gaps.median().rename("median_pause"),
        gaps.std().rename("std_pause"),
        gaps.max().rename("max_pause"),
        gaps.min().rename("min_pause"),
    )


def burst_features(df):
    """Measure runs of events that are not separated by a gap above 2000.

    A gap larger than 2000 between consecutive ``down_time`` values starts a
    new burst. Returns three Series indexed by ``id``: mean, maximum and
    standard deviation of the number of events per burst.
    """
    ordered = df.sort_values(["id", "down_time"]).copy()
    gap = ordered.groupby("id")["down_time"].diff()
    ordered["iki"] = gap
    ordered["burst"] = (gap > 2000).astype(int)
    # The running sum of "new burst" flags numbers the bursts within an id.
    ordered["burst_id"] = ordered.groupby("id")["burst"].cumsum()

    sizes_by_id = ordered.groupby(["id", "burst_id"]).size().groupby("id")

    return (
        sizes_by_id.mean().rename("avg_burst"),
        sizes_by_id.max().rename("max_burst"),
        sizes_by_id.std().rename("std_burst"),
    )


def p_burst_features(df):
    """Average ``word_count`` range per P-burst for every ``id``.

    A P-burst is a run of events not interrupted by a ``down_time`` gap above
    2000. For each burst the spread ``max - min`` of ``word_count`` is taken;
    the returned Series holds the mean spread per ``id``.
    """
    ordered = df.sort_values(["id", "down_time"]).copy()
    ordered["iki"] = ordered.groupby("id")["down_time"].diff()

    # A gap above 2000 opens a new burst; cumulative sum numbers the bursts.
    ordered["p_burst"] = (ordered["iki"] > 2000).astype(int)
    ordered["p_burst_id"] = ordered.groupby("id")["p_burst"].cumsum()

    words_in_burst = ordered.groupby(["id", "p_burst_id"])["word_count"]
    word_span = words_in_burst.max() - words_in_burst.min()

    return word_span.groupby("id").mean().rename("avg_words_per_p_burst")

### 2.3 Activity Sequence & Text Change Features


In [None]:
def activity_sequence_features(df):
    """Describe the order in which activities follow each other, per ``id``.

    For every id (in order of first appearance) the events are sorted by
    ``down_time`` and the following are computed: counts of selected
    activity-to-activity transitions, the longest consecutive ``Input`` and
    ``Remove/Cut`` streaks, the number of distinct activities, and how often
    the activity changes between consecutive events (absolute and relative to
    the number of events).

    The log is split with a single ``groupby`` pass instead of filtering the
    whole frame once per id.

    Returns
    -------
    DataFrame with one row per id.
    """
    records = []

    for id_val, id_df in df.groupby('id', sort=False):
        activities = id_df.sort_values('down_time')['activity'].values
        # Consecutive (current, next) activity pairs.
        pairs = list(zip(activities[:-1], activities[1:]))

        # Longest uninterrupted streaks of typing and of deleting.
        max_input_streak = max_remove_streak = 0
        input_streak = remove_streak = 0
        for act in activities:
            if act == 'Input':
                input_streak += 1
                remove_streak = 0
                max_input_streak = max(max_input_streak, input_streak)
            elif act == 'Remove/Cut':
                remove_streak += 1
                input_streak = 0
                max_remove_streak = max(max_remove_streak, remove_streak)
            else:
                input_streak = remove_streak = 0

        activity_switches = sum(1 for current, following in pairs if current != following)
        n_events = len(activities)

        records.append({
            'id': id_val,
            'input_to_remove_trans': pairs.count(('Input', 'Remove/Cut')),
            'remove_to_input_trans': pairs.count(('Remove/Cut', 'Input')),
            'input_to_input_trans': pairs.count(('Input', 'Input')),
            'paste_to_input_trans': pairs.count(('Paste', 'Input')),
            'max_input_streak': max_input_streak,
            'max_remove_streak': max_remove_streak,
            'unique_activities': len(set(activities)),
            'activity_switches': activity_switches,
            'activity_switch_rate': activity_switches / n_events if n_events > 0 else 0
        })

    return pd.DataFrame(records)


def text_change_features(df):
    """Summarise event-to-event changes of ``word_count`` per ``id``.

    The ``text_change`` column of the working copy is overwritten with the
    difference between consecutive ``word_count`` values (0 for the first
    event of each id). Totals, means, extremes, spread and counts of positive
    and negative changes are aggregated, followed by three derived ratios.
    """
    ordered = df.sort_values(['id', 'down_time']).copy()

    # Positive values mean growth, negative values mean shrinkage.
    ordered['text_change'] = ordered.groupby('id')['word_count'].diff().fillna(0)

    aggregations = {
        'total_text_produced': ('text_change', lambda delta: delta[delta > 0].sum()),
        'total_text_removed': ('text_change', lambda delta: abs(delta[delta < 0].sum())),
        'text_production_rate': ('text_change', lambda delta: delta[delta > 0].mean()),
        'text_removal_rate': ('text_change', lambda delta: delta[delta < 0].mean()),
        'max_text_addition': ('text_change', 'max'),
        'max_text_removal': ('text_change', 'min'),
        'text_volatility': ('text_change', 'std'),
        'positive_text_changes': ('text_change', lambda delta: (delta > 0).sum()),
        'negative_text_changes': ('text_change', lambda delta: (delta < 0).sum()),
    }
    features = ordered.groupby('id').agg(**aggregations).reset_index()

    produced = features['total_text_produced']
    removed = features['total_text_removed']
    features['text_removal_ratio'] = removed / (produced + 1)
    features['net_text_production'] = produced - removed
    features['text_efficiency'] = produced / (features['positive_text_changes'] + 1)

    return features

### 2.4 Temporal & Velocity Features


In [None]:
def time_based_features(df):
    """Compare activity in the early, middle and late third of each session.

    Events are ranked by ``down_time`` within their id (percentile rank).
    Ranks up to 0.33 form the early phase, ranks in (0.33, 0.67] the middle
    phase and the rest the late phase. Event counts and selected activity
    shares are reported per phase; every share uses ``count + 1`` as its
    denominator.

    The log is split with a single ``groupby`` pass instead of filtering the
    whole frame once per id.

    Returns
    -------
    DataFrame with one row per id.
    """
    df = df.sort_values(['id', 'down_time']).copy()
    df['time_percentile'] = df.groupby('id')['down_time'].rank(pct=True)

    def share(phase, activity):
        # Fraction of events in ``phase`` with the given activity (smoothed).
        return (phase['activity'] == activity).sum() / (len(phase) + 1)

    records = []
    for id_val, id_df in df.groupby('id', sort=False):
        percentile = id_df['time_percentile']
        early_phase = id_df[percentile <= 0.33]
        middle_phase = id_df[(percentile > 0.33) & (percentile <= 0.67)]
        late_phase = id_df[percentile > 0.67]

        records.append({
            'id': id_val,
            'early_events': len(early_phase),
            'middle_events': len(middle_phase),
            'late_events': len(late_phase),
            'early_input_ratio': share(early_phase, 'Input'),
            'middle_input_ratio': share(middle_phase, 'Input'),
            'late_input_ratio': share(late_phase, 'Input'),
            'early_remove_ratio': share(early_phase, 'Remove/Cut'),
            'late_remove_ratio': share(late_phase, 'Remove/Cut'),
            'middle_paste_ratio': share(middle_phase, 'Paste'),
            'late_phase_activity': len(late_phase) / (len(id_df) + 1),
        })

    return pd.DataFrame(records)


def keystroke_velocity_features(df):
    """Typing-speed statistics computed on ``Input`` events only.

    The inter-keystroke interval is the ``down_time`` gap to the previous
    event of the same id (of any activity). Intervals below 100 count as
    fast, 100 to 1000 as moderate and above 1000 as slow. Returns an empty
    DataFrame when there is no ``Input`` event at all.
    """
    ordered = df.sort_values(['id', 'down_time']).copy()
    ordered['iki'] = ordered.groupby('id')['down_time'].diff()

    typing = ordered[ordered['activity'] == 'Input'].copy()
    if typing.empty:
        return pd.DataFrame()

    aggregations = {
        'input_iki_mean': ('iki', 'mean'),
        'input_iki_std': ('iki', 'std'),
        'input_iki_median': ('iki', 'median'),
        'input_iki_min': ('iki', 'min'),
        'input_iki_max': ('iki', 'max'),
        'fast_keystrokes': ('iki', lambda gap: (gap < 100).sum()),
        'moderate_keystrokes': ('iki', lambda gap: ((gap >= 100) & (gap <= 1000)).sum()),
        'slow_keystrokes': ('iki', lambda gap: (gap > 1000).sum()),
    }
    features = typing.groupby('id').agg(**aggregations).reset_index()

    # Smoothed total of classified keystrokes, shared by both ratios below.
    classified = (features['fast_keystrokes'] + features['moderate_keystrokes']
                  + features['slow_keystrokes'] + 1)
    features['keystroke_consistency'] = features['input_iki_std'] / (features['input_iki_mean'] + 1)
    features['fast_keystroke_ratio'] = features['fast_keystrokes'] / classified
    features['typing_rhythm_score'] = features['moderate_keystrokes'] / classified

    return features


def word_count_velocity_features(df):
    """Rate of change of ``word_count`` between consecutive events, per ``id``.

    The velocity of a step is the ``word_count`` difference divided by the
    ``down_time`` difference plus one. Mean, maximum, minimum, standard
    deviation and the share of positive velocities are reported. Ids with a
    single event get zeros.

    The log is split with a single ``groupby`` pass instead of filtering the
    whole frame once per id.

    Returns
    -------
    DataFrame with one row per id.
    """
    df = df.sort_values(['id', 'down_time']).copy()

    records = []
    for id_val, id_df in df.groupby('id', sort=False):
        word_counts = id_df['word_count'].values
        time_stamps = id_df['down_time'].values

        if len(word_counts) > 1:
            # +1 keeps the division defined for events sharing a timestamp.
            word_velocity = np.diff(word_counts) / (np.diff(time_stamps) + 1)
            records.append({
                'id': id_val,
                'avg_word_velocity': np.mean(word_velocity),
                'max_word_velocity': np.max(word_velocity),
                'min_word_velocity': np.min(word_velocity),
                'std_word_velocity': np.std(word_velocity),
                'positive_velocity_ratio': (word_velocity > 0).sum() / len(word_velocity)
            })
        else:
            records.append({
                'id': id_val,
                'avg_word_velocity': 0,
                'max_word_velocity': 0,
                'min_word_velocity': 0,
                'std_word_velocity': 0,
                'positive_velocity_ratio': 0
            })

    return pd.DataFrame(records)


def activity_timing_features(df):
    """Total ``action_time`` spent on each activity type, per ``id``.

    Sums ``action_time`` for ``Input``, ``Remove/Cut``, ``Paste`` and
    ``Nonproduction`` events and relates them to the total ``action_time`` of
    the id (denominator ``total + 1``).

    The log is split with a single ``groupby`` pass instead of filtering the
    whole frame once per id.

    Returns
    -------
    DataFrame with one row per id.
    """
    df = df.sort_values(['id', 'down_time']).copy()

    records = []
    for id_val, id_df in df.groupby('id', sort=False):
        activity = id_df['activity']
        action_time = id_df['action_time']

        input_time = action_time[activity == 'Input'].sum()
        remove_time = action_time[activity == 'Remove/Cut'].sum()
        paste_time = action_time[activity == 'Paste'].sum()
        nonprod_time = action_time[activity == 'Nonproduction'].sum()
        total_time = action_time.sum()

        records.append({
            'id': id_val,
            'input_time_total': input_time,
            'remove_time_total': remove_time,
            'paste_time_total': paste_time,
            'nonprod_time_total': nonprod_time,
            'input_time_ratio': input_time / (total_time + 1),
            'remove_time_ratio': remove_time / (total_time + 1),
            'productive_time_ratio': (input_time + paste_time) / (total_time + 1),
        })

    return pd.DataFrame(records)

### 2.5 Revision & Cursor Movement Features


In [None]:
def revision_pattern_features(df):
    """Describe where and how often each writer revises, per ``id``.

    A revision is a ``Remove/Cut`` or ``Replace`` event with a positive
    ``word_count``. Its relative position ``cursor_position / (word_count +
    1)`` puts it in the start (< 0.33), middle (< 0.67) or end bucket. Also
    counted: write-then-edit cycles (a revision directly following a stretch
    that contained ``Input``) and backward cursor movements between
    consecutive events.

    The log is split with a single ``groupby`` pass instead of filtering the
    whole frame once per id.

    Returns
    -------
    DataFrame with one row per id.
    """
    df = df.sort_values(['id', 'down_time']).copy()
    revision_activities = ('Remove/Cut', 'Replace')

    records = []
    for id_val, id_df in df.groupby('id', sort=False):
        cursor_positions = id_df['cursor_position'].values
        activities = id_df['activity'].values
        word_counts = id_df['word_count'].values

        # Bucket every revision by where in the text it happened.
        revisions_start = revisions_middle = revisions_end = 0
        for pos, act, wc in zip(cursor_positions, activities, word_counts):
            if act in revision_activities and wc > 0:
                relative_pos = pos / (wc + 1)
                if relative_pos < 0.33:
                    revisions_start += 1
                elif relative_pos < 0.67:
                    revisions_middle += 1
                else:
                    revisions_end += 1

        # A cycle is counted once per stretch of writing that gets revised.
        review_cycles = 0
        in_writing = False
        for act in activities:
            if act == 'Input':
                in_writing = True
            elif act in revision_activities and in_writing:
                review_cycles += 1
                in_writing = False

        backward_movements = sum(
            1
            for current, following in zip(cursor_positions[:-1], cursor_positions[1:])
            if following < current
        )

        total_revisions = revisions_start + revisions_middle + revisions_end

        records.append({
            'id': id_val,
            'revisions_at_start': revisions_start,
            'revisions_at_middle': revisions_middle,
            'revisions_at_end': revisions_end,
            'total_revisions': total_revisions,
            'review_cycles': review_cycles,
            'backward_movements': backward_movements,
            'early_revision_ratio': revisions_start / (total_revisions + 1),
            'end_revision_ratio': revisions_end / (total_revisions + 1),
            'revision_density': total_revisions / (len(id_df) + 1),
        })

    return pd.DataFrame(records)


def cursor_movement_features(df):
    """Statistics on how far the cursor jumps between consecutive events.

    A jump is the absolute ``cursor_position`` difference to the previous
    event of the same id. Jumps are summarised and bucketed (<= 5 small,
    <= 50 medium, otherwise large). Two position shares are added: how often
    ``cursor_position`` equals ``word_count`` and how often it equals 0.

    The two shares are computed as vectorised boolean means rather than
    row-wise ``groupby.apply`` lambdas; the values are identical.

    Returns
    -------
    DataFrame with one row per id.
    """
    df = df.sort_values(['id', 'down_time']).copy()
    df['cursor_jump'] = df.groupby('id')['cursor_position'].diff().abs()

    features = df.groupby('id').agg(
        avg_cursor_jump=('cursor_jump', 'mean'),
        max_cursor_jump=('cursor_jump', 'max'),
        total_cursor_movement=('cursor_jump', 'sum'),
        small_cursor_jumps=('cursor_jump', lambda x: (x <= 5).sum()),
        medium_cursor_jumps=('cursor_jump', lambda x: ((x > 5) & (x <= 50)).sum()),
        large_cursor_jumps=('cursor_jump', lambda x: (x > 50).sum()),
        cursor_jump_std=('cursor_jump', 'std'),
    ).reset_index()

    # The mean of a boolean column is the share of True values per id.
    cursor_at_end = (
        (df['cursor_position'] == df['word_count'])
        .groupby(df['id'])
        .mean()
        .rename('cursor_at_end_ratio')
    )
    cursor_at_start = (
        (df['cursor_position'] == 0)
        .groupby(df['id'])
        .mean()
        .rename('cursor_at_start_ratio')
    )

    features = features.merge(cursor_at_end, on='id', how='left')
    features = features.merge(cursor_at_start, on='id', how='left')

    features['forward_writing_tendency'] = features['cursor_at_end_ratio']
    features['navigation_complexity'] = features['large_cursor_jumps'] / (features['total_cursor_movement'] + 1)

    return features

### 2.6 Rolling Window & Distribution Features


In [None]:
def rolling_features(df, window=10):
    """Sliding-window trends of ``action_time`` and ``word_count`` per ``id``.

    For ids with at least ``window`` events, rolling statistics (with
    ``min_periods=1``) are computed over the time-ordered events: the mean of
    the rolling mean and rolling std of ``action_time``, the difference
    between the last and first rolling mean for both columns, and the mean
    step of the rolling ``action_time`` mean. Ids with fewer events fall back
    to the plain mean/std of ``action_time`` and zero trends.

    The log is split with a single ``groupby`` pass instead of filtering the
    whole frame once per id, and each rolling series is computed once.

    Parameters
    ----------
    df : DataFrame with ``id``, ``down_time``, ``action_time`` and
        ``word_count`` columns.
    window : size of the rolling window, in events.

    Returns
    -------
    DataFrame with one row per id.
    """
    df = df.sort_values(['id', 'down_time']).copy()

    records = []
    for id_val, id_df in df.groupby('id', sort=False):
        action_times = id_df['action_time']

        if len(id_df) < window:
            # Too short for a meaningful window: report plain statistics.
            records.append({
                'id': id_val,
                'action_time_rolling_mean': action_times.mean(),
                'action_time_rolling_std': action_times.std(),
                'word_count_rolling_trend': 0,
                'action_time_trend': 0,
                'action_time_acceleration': 0
            })
            continue

        action_rolling = action_times.rolling(window=window, min_periods=1)
        action_mean = action_rolling.mean()
        word_mean = id_df['word_count'].rolling(window=window, min_periods=1).mean()

        records.append({
            'id': id_val,
            'action_time_rolling_mean': action_mean.mean(),
            'action_time_rolling_std': action_rolling.std().mean(),
            'word_count_rolling_trend': word_mean.iloc[-1] - word_mean.iloc[0],
            'action_time_trend': action_mean.iloc[-1] - action_mean.iloc[0],
            'action_time_acceleration': action_mean.diff().mean()
        })

    return pd.DataFrame(records)


def action_time_distribution_features(df):
    """Per-id quartiles, IQR, skewness and kurtosis of ``action_time``.

    Returns a frame with one row per ``id`` and the columns
    ``action_time_q25``, ``action_time_q75``, ``action_time_iqr``,
    ``action_time_skew`` and ``action_time_kurtosis``.
    """
    def _lower_quartile(values):
        return values.quantile(0.25)

    def _upper_quartile(values):
        return values.quantile(0.75)

    def _interquartile_range(values):
        return values.quantile(0.75) - values.quantile(0.25)

    def _skewness(values):
        return values.skew()

    def _kurtosis(values):
        return values.kurtosis()

    per_id = df.groupby('id')['action_time']
    summary = per_id.agg(
        action_time_q25=_lower_quartile,
        action_time_q75=_upper_quartile,
        action_time_iqr=_interquartile_range,
        action_time_skew=_skewness,
        action_time_kurtosis=_kurtosis,
    )
    return summary.reset_index()

### 2.7 Advanced Event Timing Features


## 3. Main Feature Builder


In [None]:
def build_all_features(df):
    """
    Assemble the complete behavioural feature table for a set of logs.

    Every feature family is computed from the same ``df`` and left-joined
    onto the base feature table on ``id``. Afterwards NaN and +/-inf entries
    are replaced with 0. Progress is reported with ``print``.

    Parameters:
    -----------
    df : DataFrame
        Input log data with columns: id, event_id, down_time, up_time,
        action_time, activity, cursor_position, word_count

    Returns:
    --------
    DataFrame with all extracted features
    """
    def _attach(table, extra):
        # Left join: every id already in ``table`` is kept.
        return table.merge(extra, on="id", how="left")

    print("Building all features...")

    print("  - Base features")
    features = extract_features(df)

    # Pause and burst extractors each return several frames.
    print("  - Pause features")
    for frame in pause_features(df):
        features = _attach(features, frame)

    print("  - Burst features")
    for frame in burst_features(df):
        features = _attach(features, frame)

    print("  - P-burst features")
    features = _attach(features, p_burst_features(df))

    print("  - Activity sequence features")
    features = _attach(features, activity_sequence_features(df))

    print("  - Text change features")
    features = _attach(features, text_change_features(df))

    print("  - Time-based features")
    features = _attach(features, time_based_features(df))

    # The velocity extractor may return an empty frame; skip the join then.
    print("  - Keystroke velocity features")
    velocity = keystroke_velocity_features(df)
    if not velocity.empty:
        features = _attach(features, velocity)

    print("  - Revision pattern features")
    features = _attach(features, revision_pattern_features(df))

    print("  - Cursor movement features")
    features = _attach(features, cursor_movement_features(df))

    print("  - Rolling window features")
    features = _attach(features, rolling_features(df, window=10))

    print("  - Action time distribution features")
    features = _attach(features, action_time_distribution_features(df))

    print("  - Word count velocity features")
    features = _attach(features, word_count_velocity_features(df))

    print("  - Activity timing features")
    features = _attach(features, activity_timing_features(df))

    # Missing values first, then infinities, both mapped to 0.
    features = features.fillna(0).replace([np.inf, -np.inf], 0)

    n_rows, n_cols = features.shape
    print(f"\nTotal features extracted: {n_cols - 1}")  # id is not a feature
    print(f"Total samples: {n_rows}")

    return features

## 4. Load Data & Extract Features


In [None]:
# Alias the log DataFrame `df` built by an earlier cell; nothing is read from disk here.
# NOTE(review): the same `df` is labelled as TEST data in the operation TF-IDF
# cell further down — confirm which split `df` holds at this point.


logs = df
print(f"Loaded {len(logs)} rows")
print(f"Unique IDs: {logs['id'].nunique()}")
print(f"\nColumns: {list(logs.columns)}")
# Last expression of the cell: shows the first rows of the logs.
logs.head()

In [None]:
# Extract all behavioural features: build_all_features merges every feature
# family onto the base table on `id` and zero-fills NaN/inf values.
Behavioral_features_temp = build_all_features(logs)

## 5. Inspect Results


In [None]:
# Report the shape and column names of the feature table, then display its
# first few rows (the column list includes `id`, so the count is features + 1).
print(f"Feature matrix shape: {Behavioral_features_temp.shape}")
print(f"\nFeature names ({len(Behavioral_features_temp.columns)} total):")
print(list(Behavioral_features_temp.columns))
Behavioral_features_temp.head()

In [None]:
# Sanity-check the feature table. Both counts are grand totals over the whole
# table (the double .sum() collapses the per-column counts), so the labels say
# "Total" rather than "per column". Both should be 0 after build_all_features.
print("Total missing values:")
print(Behavioral_features_temp.isnull().sum().sum())
print("\nTotal infinite values:")
# np.isinf only accepts numeric data, hence the dtype filter.
print(np.isinf(Behavioral_features_temp.select_dtypes(include=[np.number])).sum().sum())
print("\nBasic statistics:")
Behavioral_features_temp.describe().T

## 6. Save Features


In [None]:
# Publish the feature table under the name read by merge_preprocessed_data().
# Note the spelling change (Behavioral -> Behavioural). This is a plain alias:
# no copy is made and nothing is written to disk in this cell.
Behavioural_features = Behavioral_features_temp


## Summary

This notebook extracts **comprehensive behavioural features** from keystroke logging data. The features capture:

### Feature Categories (150+ features total):

1. **Base Features**: Event counts, total time, typing speed, activity ratios
2. **Pause Features**: Gaps between keystrokes at different thresholds (2s, 5s, 10s)
3. **Burst Features**: When they're typing continuously and how fluently
4. **Activity Sequence**: How activities transition from one to another, streaks, variety
5. **Text Change**: How fast they produce/remove text, editing efficiency
6. **Temporal Patterns**: What they do in early/middle/late stages
7. **Keystroke Velocity**: Typing speed variations, rhythm, consistency
8. **Word Count Velocity**: How the word count changes over time
9. **Activity Timing**: How much time on each type of activity
10. **Revision Patterns**: Where they edit, review cycles, going backwards
11. **Cursor Movement**: How they navigate around, jump distances
12. **Rolling Window**: Trends and changes in typing behaviour
13. **Distribution Features**: Statistical properties (skew, kurtosis, IQR)

### Output:

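- `Behavioural_features` (in-memory DataFrame) - One row per essay ID with all behavioural features. In this integrated notebook the table is only assigned to a variable in section 6; no `data/train_behaviour_features.csv` file is written there.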

### Next Steps:

- Combine with text features from `FeatureExtraction_Essay.ipynb`
- Merge with TF-IDF/SVD features from `tfidf/tfidf.ipynb`
- Build predictive models using these features


------------

### Essay Text Feature

In [None]:
# reuse code from text_process
import sys
sys.path.append('..')

import numpy as np

def Text_Feature_Extraction(extracted_text):
    """Compute word- and sentence-length statistics for every essay text.

    Parameters
    ----------
    extracted_text : DataFrame
        Must contain a ``text`` column. All other columns are carried over
        unchanged. The index may be arbitrary; rows are processed by
        position, not by label.

    Returns
    -------
    DataFrame
        ``extracted_text`` without ``text``, plus the float columns
        ``word_count``, ``word_length_mean``, ``word_length_std``,
        ``sent_length_mean`` and ``sent_length_std``. Mean and std are 0 when
        a text has no words / sentences. The std is the pandas sample
        standard deviation, so it is NaN for a single word / sentence.
    """
    stat_columns = [
        'word_count',
        'word_length_mean',
        'word_length_std',
        'sent_length_mean',
        'sent_length_std',
    ]

    def _mean_and_std(lengths):
        # Empty input is reported as (0, 0) instead of dividing by zero.
        if not lengths:
            return 0, 0
        return sum(lengths) / len(lengths), pd.Series(lengths).std()

    features = extracted_text.drop('text', axis=1)
    processor = TextProcessor()

    rows = []
    # Iterate over the values themselves. Indexing with range positions
    # (texts[i] / features.loc[i, ...]) is label-based and either fails or
    # appends rows when the index is not 0..n-1.
    for text in extracted_text['text']:
        words = processor.split_to_word(text)
        sentences = processor.split_to_sentence(text)
        word_lengths = [len(w) for w in words]
        sent_lengths = [len(processor.split_to_word(s)) for s in sentences]

        word_mean, word_std = _mean_and_std(word_lengths)
        sent_mean, sent_std = _mean_and_std(sent_lengths)
        rows.append((len(word_lengths), word_mean, word_std, sent_mean, sent_std))

    stats = pd.DataFrame(rows, columns=stat_columns, dtype=float)
    for column in stat_columns:
        # Positional assignment; overwrites a same-named input column.
        features[column] = stats[column].to_numpy()
    return features

# Length statistics for both text tables. `Text_Essay_Features` is the one
# consumed by merge_preprocessed_data(); the TF-IDF cell below treats
# `extracted_text` as the test split and `extracted_text_train` as the train split.
Text_Essay_Features = Text_Feature_Extraction(extracted_text)
Train_Text_Essay_Features = Text_Feature_Extraction(extracted_text_train)

In [None]:
# Display the text features computed from `extracted_text`.
Text_Essay_Features

In [None]:
# Display the text features computed from `extracted_text_train`.
Train_Text_Essay_Features

---------
### TF-IDF Feature Extraction

### Work taken from `texts_tfidf.ipynb`

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle
from sklearn.decomposition import TruncatedSVD



# Keep only the id and the essay text for each split.
texts_train = extracted_text_train[['id', 'text']]
texts_test = extracted_text[['id', 'text']]


# refactor version, completely separate train and test


# only fit on train
# Character n-grams of length 1-5, capped at 30000 vocabulary entries.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 5),
    max_features=30000,
    dtype=np.float32,
)
X_train_tfidf = vectorizer.fit_transform(texts_train['text'])

# At most 64 SVD components, strictly fewer than the number of TF-IDF
# features, and never fewer than 1.
n_features = X_train_tfidf.shape[1]
svdsize = min(64, max(1, n_features - 1))

svd = TruncatedSVD(
    n_components=svdsize,
    random_state=42,
    n_iter=7
)
X_train_svd = svd.fit_transform(X_train_tfidf)

# NOTE: `vectorizer`, `svd` and `svdsize` are rebound by the operation TF-IDF
# cell further down, so the test transform for essay text has to run before it.

# Disabled export of the train-side SVD features (kept for reference).
# train_svd_df = pd.DataFrame(
#     X_train_svd,
#     columns=[f'{i:02d}' for i in range(svdsize)]
# )
# train_svd_df.insert(0, 'id', texts_train['id'].values)
# train_svd_df.to_csv("/data/train_tfidf_svd.csv", index=False)


In [None]:
# Transform only: the vectorizer and SVD were fitted on the train texts above.
X_test_tfidf = vectorizer.transform(texts_test['text'])
X_test_svd = svd.transform(X_test_tfidf)


# One zero-padded column name per SVD component ('00', '01', ...), id first.
svdsize = X_test_svd.shape[1]
test_svd_df = pd.DataFrame(
    X_test_svd,
    columns=[f'{i:02d}' for i in range(svdsize)]
)
test_svd_df.insert(0, 'id', texts_test['id'].values)
# Alias read by merge_preprocessed_data(); `test_svd_df` itself is rebound later.
TFIDF_Features = test_svd_df

### Work taken from `operation_tfidf.ipynb`

In [None]:
# === TRAIN ===
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


# Only the id and the activity label of each logged event are needed here.
activity_df = train_df[['id', 'activity']]

# Quick look at the raw train logs.
print(train_df.head(3))


def rebuild_text(grp):
    """Concatenate the first character of every entry in ``grp['activity']``.

    The result is one string per group, in the order the entries appear.
    """
    return "".join(op[0] for op in grp['activity'])

# One "operation string" per id: the first letters of its activities, in order.
operations = (
    activity_df.groupby('id')
               .apply(rebuild_text)
               .reset_index(name='operation')
)

print(operations.head(3))

# 3) TF-IDF fit on train
# NOTE: this rebinds `vectorizer` (and `svd` below), replacing the objects
# fitted on the essay texts in the previous section.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 5),
    max_features=30000,
    dtype=np.float32,
)
X_tfidf = vectorizer.fit_transform(operations['operation'])


# 4) SVD fit on train
# Same sizing rule as the essay-text SVD: at most 64 components, fewer than
# the number of TF-IDF features, and never fewer than 1 (a single-feature
# vocabulary would otherwise request 0 components).
n_features = X_tfidf.shape[1]
svdsize = min(64, max(1, n_features - 1))

svd = TruncatedSVD(
    n_components=svdsize,
    random_state=42,
    n_iter=7
)
X_svd = svd.fit_transform(X_tfidf)

# One zero-padded column name per SVD component ('00', '01', ...), id first.
svd_df = pd.DataFrame(
    X_svd,
    columns=[f'{i:02d}' for i in range(svdsize)]
)
svd_df.insert(0, 'id', operations['id'].values)

print(svd_df.head())
train_svd_df_operation = svd_df


In [None]:
# rebuild raw operation texts


In [None]:
# === TEST ===
import pandas as pd
import numpy as np
import pickle



# Only the id and the activity label of each logged event are needed here.
test_df_activity = df[['id', 'activity']]



# One "operation string" per id, built the same way as for the train split.
test_operations = (
    test_df_activity.groupby('id')
           .apply(rebuild_text)
           .reset_index(name='operation')
)

# 4) only transform, do not fit on tests
# `vectorizer` and `svd` are the objects fitted on the train operation strings.
X_test_tfidf = vectorizer.transform(test_operations['operation'])
X_test_svd   = svd.transform(X_test_tfidf)

# One zero-padded column name per SVD component ('00', '01', ...), id first.
svdsize = X_test_svd.shape[1]
test_svd_df = pd.DataFrame(
    X_test_svd,
    columns=[f'{i:02d}' for i in range(svdsize)]
)
test_svd_df.insert(0, 'id', test_operations['id'].values)
# Alias read by merge_preprocessed_data().
TFIDF_Features_Operations = test_svd_df

## Concat Data and send to model

In [None]:
def merge_preprocessed_data():
    """Inner-join all prepared feature tables on ``id``.

    Reads the module-level frames ``Behavioural_features``,
    ``Text_Essay_Features``, ``TFIDF_Features`` and
    ``TFIDF_Features_Operations``. The SVD component columns of the two
    TF-IDF tables are prefixed with ``tfidf_text_`` and ``tfidf_operation_``
    respectively so that their identical component names do not collide.
    Because every join is an inner join, only ids present in all four tables
    survive.
    """
    def _prefixed(frame, prefix):
        # Rename every column except the join key.
        renames = {
            name: f'{prefix}{name}'
            for name in frame.columns
            if name != 'id'
        }
        return frame.rename(columns=renames)

    text_components = _prefixed(TFIDF_Features, 'tfidf_text_')
    operation_components = _prefixed(TFIDF_Features_Operations, 'tfidf_operation_')

    merged = Behavioural_features.merge(Text_Essay_Features, on='id', how='inner')
    for extra in (text_components, operation_components):
        merged = merged.merge(extra, on='id', how='inner')

    return merged




# Final feature matrix: behavioural + essay-text + both TF-IDF/SVD tables, joined on id.
merged_df = merge_preprocessed_data()
    

In [None]:
# Display the merged feature matrix.
merged_df

## Extra Feature Extraction
Work from `model_enhanced.py` and `linking_38features.py`.

In [None]:
from typing import Dict, List, Tuple, Optional
from sklearn.linear_model import (
    Ridge,
    ElasticNet,
    BayesianRidge,
    HuberRegressor,
    PoissonRegressor,
    PassiveAggressiveRegressor,
)
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from scipy.optimize import minimize
from tqdm import tqdm
import time
import numpy as np
import pandas as pd
import joblib
import pickle
import catboost as cb
import xgboost as xgb
import lightgbm as lgb
import os
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Ask OpenMP / MKL / OpenBLAS to use a single thread each.
# NOTE(review): numpy and scikit-learn are already imported above; such
# variables are typically read when the native libraries load, so setting
# them here may have no effect — confirm.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"


class EnhancedEnsembleModel:
    """Blended regression ensemble for score prediction.

    :meth:`fit` runs four stages:

    1. Out-of-fold class probabilities from MultinomialNB and MLPClassifier
       (trained on squared PCA components) are appended as meta-features.
    2. LightGBM, XGBoost and CatBoost regressors are trained per fold and
       stacked with a Ridge model.
    3. Seven linear regressors are trained on min-max scaled features and
       blended with optimised non-negative weights that sum to 1.
    4. The stacked and the linear predictions are blended the same way.

    Only the fold-wise tree models, the Ridge stacker and the split
    configuration are written by :meth:`save` and restored by :meth:`load`;
    the classifiers, the scaler, the linear models and the blending weights
    are not persisted.
    """

    def __init__(
        self,
        n_splits: int = 5,
        random_state: int = 42,
    ):
        """Set up the CV splitter, empty model stores and hyper-parameters.

        Args:
            n_splits: Number of cross-validation folds.
            random_state: Seed shared by the splitters and all seeded models.
        """
        self.n_splits = n_splits
        self.random_state = random_state
        self.kf = KFold(n_splits=n_splits, shuffle=True,
                        random_state=random_state)

        # One fitted model per fold for each tree learner.
        self.lgb_models: List[lgb.LGBMRegressor] = []
        self.xgb_models: List[xgb.XGBRegressor] = []
        self.catboost_models: List[cb.CatBoostRegressor] = []
        self.stacking_model: Optional[Ridge] = None

        # Advanced components (integrated by default)
        self.linear_models = self._init_linear_models()
        self.linear_scaler = MinMaxScaler()
        self.linear_optimal_weights = None

        # Classifier meta-feature mappings. Some scores share a class
        # (0.5 and 1.0 -> 0, 5.5 and 6.0 -> 1); class_to_score maps every
        # class back to a single representative score.
        self.score_to_class = {
            4.0: 9, 3.5: 8, 4.5: 7, 3.0: 6, 2.5: 5,
            5.0: 4, 5.5: 1, 2.0: 3, 1.5: 2, 6.0: 1,
            1.0: 0, 0.5: 0,
        }
        self.class_to_score = {
            0: 1.0, 1: 6.0, 2: 1.5, 3: 2.0, 4: 5.0,
            5: 2.5, 6: 3.0, 7: 4.5, 8: 3.5, 9: 4.0,
        }
        self.n_classes = 10
        self.final_weights = None

        self.lgb_params = {
            "objective": "regression_l1",
            "metric": "rmse",
            "n_estimators": 12000,
            "verbosity": -1,
            "random_state": random_state,
            "reg_alpha": 0.007678095440286993,
            "reg_lambda": 0.34230534302168353,
            "colsample_bytree": 0.627061253588415,
            "subsample": 0.854942238828458,
            "learning_rate": 0.038697981947473245,
            "num_leaves": 22,
            "max_depth": 37,
            "min_child_samples": 18,
            "n_jobs": 4,
        }

        self.xgb_params = {
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "n_estimators": 5000,
            "random_state": random_state,
            "learning_rate": 0.03,
            "max_depth": 8,
            "min_child_weight": 3,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "reg_alpha": 0.1,
            "reg_lambda": 1.0,
            "n_jobs": 1,
            "tree_method": "hist",
        }

        self.catboost_params = {
            "iterations": 5000,
            "learning_rate": 0.03,
            "depth": 8,
            "l2_leaf_reg": 3,
            "random_state": random_state,
            "verbose": False,
            "thread_count": 4,
            "loss_function": "RMSE",
        }

    def _init_linear_models(self):
        """Return the ``(name, unfitted estimator)`` templates of the linear ensemble."""
        return [
            ("LinearSVR", LinearSVR(
                C=0.9, loss="squared_epsilon_insensitive", max_iter=2000)),
            ("ElasticNet", ElasticNet(alpha=0.001, l1_ratio=0.5,
             random_state=self.random_state, selection="cyclic")),
            ("Ridge", Ridge(alpha=10)),
            ("PassiveAggressive", PassiveAggressiveRegressor(
                C=0.001, loss="squared_epsilon_insensitive")),
            ("Huber", HuberRegressor(epsilon=1.25, alpha=20)),
            ("Poisson", PoissonRegressor(alpha=0.01)),
            ("BayesianRidge", BayesianRidge()),
        ]

    def _generate_classifier_meta_features(self, X, y, X_test=None, pca_components=100):
        """Build NB and MLP class-probability meta-features.

        Scores in ``y`` are mapped to class ids through ``score_to_class``.

        Returns:
            Dict with ``nb_oof`` / ``mlp_oof`` (out-of-fold probabilities for
            ``X``) and ``nb_test`` / ``mlp_test`` (fold-averaged probabilities
            for ``X_test``, or ``None`` when no test data is given).
        """
        y_class = pd.Series(y).map(self.score_to_class).values

        print("Training MultinomialNB...")
        nb_oof, nb_test = self._train_classifier_meta(
            X, y_class, X_test, pca_components, "nb"
        )

        print("Training MLPClassifier...")
        mlp_oof, mlp_test = self._train_classifier_meta(
            X, y_class, X_test, pca_components, "mlp"
        )

        return {
            "nb_oof": nb_oof, "nb_test": nb_test,
            "mlp_oof": mlp_oof, "mlp_test": mlp_test,
        }

    def _train_classifier_meta(self, X, y_class, X_test, pca_components, model_type):
        """Cross-validate one classifier on squared PCA components.

        Args:
            X: Training features (DataFrame or array). NaNs in DataFrames
                are filled with 0.
            y_class: Integer class id per training row.
            X_test: Optional test features. When given, the PCA is fitted on
                train and test rows together.
            pca_components: Requested number of PCA components. An integer
                request is clamped to what the data can support.
            model_type: ``"nb"`` for MultinomialNB, anything else for
                MLPClassifier.

        Returns:
            ``(oof_prob, test_prob)`` with shape ``(n_rows, n_classes)``;
            ``test_prob`` is ``None`` without test data.
        """
        def _filled(data):
            return data.fillna(0) if isinstance(data, pd.DataFrame) else data

        if X_test is not None:
            fit_data = pd.concat([X, X_test]) if isinstance(
                X, pd.DataFrame) else np.vstack([X, X_test])
        else:
            fit_data = X
        fit_data = _filled(fit_data)

        # PCA cannot extract more components than min(n_samples, n_features).
        n_components = pca_components
        if isinstance(pca_components, (int, np.integer)):
            n_components = min(pca_components, *np.shape(fit_data))
        pca = PCA(n_components=n_components, random_state=self.random_state)

        # Squaring makes every component non-negative.
        # NOTE(review): presumably needed because MultinomialNB rejects
        # negative inputs — confirm.
        if X_test is not None:
            pca.fit(fit_data)
            X_pca = pca.transform(_filled(X)) ** 2
            X_test_pca = pca.transform(_filled(X_test)) ** 2
        else:
            X_pca = pca.fit_transform(fit_data) ** 2
            X_test_pca = None

        # Cross-validation
        oof_prob = np.zeros((len(X), self.n_classes))
        test_prob = np.zeros((len(X_test), self.n_classes)
                             ) if X_test is not None else None

        skf = StratifiedKFold(n_splits=self.n_splits,
                              shuffle=True, random_state=self.random_state)

        for train_idx, val_idx in skf.split(X_pca, y_class):
            if model_type == "nb":
                model = MultinomialNB(alpha=1.0)
            else:
                model = MLPClassifier(
                    random_state=self.random_state, max_iter=300)

            model.fit(X_pca[train_idx], y_class[train_idx])

            # predict_proba only has columns for the classes seen in this
            # fold; place them by class id so a missing class stays at 0
            # instead of breaking the assignment.
            fold_classes = np.asarray(model.classes_).astype(int)
            oof_prob[np.ix_(val_idx, fold_classes)] = model.predict_proba(
                X_pca[val_idx])

            if test_prob is not None:
                test_prob[:, fold_classes] += model.predict_proba(
                    X_test_pca) / self.n_splits

        return oof_prob, test_prob

    def _compute_weighted_score(self, probabilities):
        """Return the expected score per row: sum of probability * class score."""
        class_scores = np.array(
            [self.class_to_score[i] for i in range(self.n_classes)], dtype=float)
        return np.asarray(probabilities)[:, :self.n_classes] @ class_scores

    def _train_linear_ensemble(self, X, y, X_test=None):
        """Cross-validate the linear models and blend them.

        Features are min-max scaled (the scaler is fitted on train and test
        rows together when ``X_test`` is given), every model's predictions
        are clipped to [0, 6], and blending weights are optimised on the
        out-of-fold predictions.

        Returns:
            ``(ensemble_oof, ensemble_test)``; the second item is ``None``
            without test data.
        """
        # Local import: `clone` is not part of the notebook-level imports.
        from sklearn.base import clone

        # Scale features
        if X_test is not None:
            combined = pd.concat([X, X_test])
            self.linear_scaler.fit(combined)
            X_scaled = pd.DataFrame(
                self.linear_scaler.transform(X), columns=X.columns)
            X_test_scaled = pd.DataFrame(
                self.linear_scaler.transform(X_test), columns=X.columns)
        else:
            X_scaled = pd.DataFrame(
                self.linear_scaler.fit_transform(X), columns=X.columns)
            X_test_scaled = None

        # Train each model
        oof_predictions = np.zeros((len(X), len(self.linear_models)))
        test_predictions = np.zeros(
            (len(X_test), len(self.linear_models))) if X_test is not None else None

        skf = StratifiedKFold(n_splits=self.n_splits,
                              shuffle=True, random_state=self.random_state)

        for i, (name, model_template) in enumerate(tqdm(self.linear_models, desc="Training linear models")):
            oof_preds = np.zeros(len(X))
            test_preds = np.zeros(len(X_test)) if X_test is not None else None

            # Stratify on the score rendered as a string label.
            for train_idx, val_idx in skf.split(X_scaled, y.astype(str)):
                model = clone(model_template)

                X_train = X_scaled.iloc[train_idx].fillna(0)
                X_val = X_scaled.iloc[val_idx].fillna(0)
                y_train = y[train_idx]

                model.fit(X_train, y_train)
                oof_preds[val_idx] = model.predict(X_val)

                if test_preds is not None:
                    test_preds += model.predict(X_test_scaled.fillna(0)
                                                ) / self.n_splits

            oof_preds = np.clip(oof_preds, 0, 6)
            oof_predictions[:, i] = oof_preds

            if test_preds is not None:
                test_preds = np.clip(test_preds, 0, 6)
                test_predictions[:, i] = test_preds

            rmse = np.sqrt(mean_squared_error(y, oof_preds))
            print(f"{name:20s} CV RMSE: {rmse:.5f}")

        self.linear_optimal_weights = self._optimize_weights(
            oof_predictions, y)

        ensemble_oof = np.dot(oof_predictions, self.linear_optimal_weights)
        ensemble_rmse = np.sqrt(mean_squared_error(y, ensemble_oof))
        print(f"Linear Ensemble CV RMSE: {ensemble_rmse:.5f}")

        if test_predictions is not None:
            ensemble_test = np.dot(
                test_predictions, self.linear_optimal_weights)
            return ensemble_oof, ensemble_test

        return ensemble_oof, None

    def _optimize_weights(self, predictions, y_true):
        """Find blending weights in [0, 1] that sum to 1 and minimise RMSE.

        Args:
            predictions: Array of shape ``(n_rows, n_models)``.
            y_true: Target values.

        Returns:
            Weight vector of length ``n_models`` (SLSQP solution, started
            from equal weights).
        """
        def weighted_rmse(weights, preds, y):
            ensemble_pred = np.dot(preds, weights)
            return np.sqrt(mean_squared_error(y, ensemble_pred))

        n_models = predictions.shape[1]
        initial_weights = np.ones(n_models) / n_models
        constraints = {"type": "eq", "fun": lambda w: np.sum(w) - 1}
        bounds = [(0, 1)] * n_models

        opt_result = minimize(
            weighted_rmse,
            initial_weights,
            args=(predictions, y_true),
            method="SLSQP",
            bounds=bounds,
            constraints=constraints,
        )

        return opt_result.x

    def train_single_model(
        self,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_val: np.ndarray,
        y_val: np.ndarray,
        model_type: str,
    ) -> Tuple:
        """Fit one tree model with early stopping on the validation fold.

        Args:
            X_train, y_train: Training fold.
            X_val, y_val: Validation fold, used for early stopping (200
                rounds) and for the returned score.
            model_type: ``"lgb"``, ``"xgb"`` or ``"catboost"``.

        Returns:
            ``(model, val_preds, val_rmse)``.

        Raises:
            ValueError: If ``model_type`` is not one of the supported names.
        """
        if model_type not in ("lgb", "xgb", "catboost"):
            raise ValueError(
                f"Unknown model_type {model_type!r}; "
                "expected 'lgb', 'xgb' or 'catboost'")

        X_val_np = X_val if isinstance(X_val, np.ndarray) else X_val.values

        if model_type == "lgb":
            model = lgb.LGBMRegressor(**self.lgb_params)
            X_train_np = X_train if isinstance(
                X_train, np.ndarray) else X_train.values
            model.fit(
                X_train_np,
                y_train,
                eval_set=[(X_val_np, y_val)],
                callbacks=[lgb.early_stopping(200, verbose=False)],
            )

        elif model_type == "xgb":
            model = xgb.XGBRegressor(
                **self.xgb_params, early_stopping_rounds=200)
            model.fit(X_train, y_train, eval_set=[
                      (X_val, y_val)], verbose=False)

        else:  # "catboost"
            model = cb.CatBoostRegressor(**self.catboost_params)
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_val, y_val)],
                early_stopping_rounds=200,
                verbose=False,
            )

        val_preds = model.predict(X_val_np)
        val_score = np.sqrt(mean_squared_error(y_val, val_preds))

        return model, val_preds, val_score

    def fit(self, X: pd.DataFrame, y: np.ndarray, X_test: pd.DataFrame = None) -> Tuple[Dict[str, float], np.ndarray, np.ndarray]:
        """Train the full pipeline and optionally predict a test set.

        Args:
            X: Training features.
            y: Training scores. They must be keys of ``score_to_class``.
            X_test: Optional test features with the same columns as ``X``.

        Returns:
            ``(scores, final_oof, final_test)``: a dict of RMSE values, the
            blended predictions for ``X`` and the blended predictions for
            ``X_test`` (``None`` without test data).
        """
        # Positional indexing (y[train_idx]) is used throughout, so drop any
        # pandas index the caller may have passed along.
        y = np.asarray(y)

        print("=" * 70)
        print("ADVANCED ENHANCED ENSEMBLE MODEL")
        print("=" * 70)

        # Step 1: Generate classifier meta-features
        print("\n" + "=" * 70)
        print("STEP 1: Generating Classifier Meta-Features (NB + MLP)")
        print("=" * 70)

        meta_features = self._generate_classifier_meta_features(X, y, X_test)

        # Add meta-features to original features
        X_enhanced = X.copy() if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        X_test_enhanced = X_test.copy() if X_test is not None and isinstance(
            X_test, pd.DataFrame) else (pd.DataFrame(X_test) if X_test is not None else None)

        # Array inputs get integer column labels; mixing those with the
        # string-named meta-feature columns is rejected by scikit-learn.
        X_enhanced.columns = X_enhanced.columns.astype(str)
        if X_test_enhanced is not None:
            X_test_enhanced.columns = X_test_enhanced.columns.astype(str)

        def _append_meta(frame, probabilities, prefix):
            # One column per class probability plus the expected score.
            for i in range(probabilities.shape[1]):
                frame[f"{prefix}_prob_{i}"] = probabilities[:, i]
            frame[f"{prefix}_weighted"] = self._compute_weighted_score(
                probabilities)

        for prefix in ("nb", "mlp"):
            _append_meta(X_enhanced, meta_features[f"{prefix}_oof"], prefix)
            if X_test_enhanced is not None:
                _append_meta(
                    X_test_enhanced, meta_features[f"{prefix}_test"], prefix)

        print(f"\nEnhanced feature shape: {X_enhanced.shape}")

        # Step 2: Train base ensemble (LGB + XGB + CatBoost) on enhanced features
        print("\n" + "=" * 70)
        print("STEP 2: Training Base Ensemble (LGB + XGB + CatBoost)")
        print("=" * 70)

        scores, ensemble_oof = self._train_base_ensemble(X_enhanced, y)
        ensemble_test = self._predict_base_ensemble(
            X_test_enhanced) if X_test_enhanced is not None else None

        # Step 3: Train linear model ensemble
        print("\n" + "=" * 70)
        print("STEP 3: Training Linear Model Ensemble")
        print("=" * 70)

        linear_oof, linear_test = self._train_linear_ensemble(
            X_enhanced, y, X_test_enhanced)

        # Step 4: Optimize final ensemble weights
        print("\n" + "=" * 70)
        print("STEP 4: Optimizing Final Ensemble Weights")
        print("=" * 70)

        combined_preds = np.column_stack([ensemble_oof, linear_oof])
        self.final_weights = self._optimize_weights(combined_preds, y)

        final_oof = ensemble_oof * \
            self.final_weights[0] + linear_oof * self.final_weights[1]
        final_rmse = np.sqrt(mean_squared_error(y, final_oof))

        print(
            f"\nFinal weights: Base Ensemble={self.final_weights[0]:.4f}, Linear={self.final_weights[1]:.4f}")
        print(f"Final CV RMSE: {final_rmse:.5f}")
        print("=" * 70)

        # Compute final test predictions
        final_test = None
        if ensemble_test is not None and linear_test is not None:
            final_test = ensemble_test * \
                self.final_weights[0] + linear_test * self.final_weights[1]

        scores["advanced_ensemble_oof"] = final_rmse

        return scores, final_oof, final_test

    def _train_base_ensemble(self, X: pd.DataFrame, y: np.ndarray) -> Tuple[Dict[str, float], np.ndarray]:
        """Cross-validate the three tree learners and fit the Ridge stacker.

        Returns:
            ``(scores, ensemble_preds)``: a dict of out-of-fold RMSE values
            and the stacker's predictions for the training rows. The stacker
            is scored on the same out-of-fold predictions it was fitted on,
            so ``ensemble_oof`` is an in-sample figure for the Ridge step.
        """
        X_np = X.values if isinstance(X, pd.DataFrame) else X

        # Start from a clean slate so refitting does not pile new fold models
        # on top of the ones from a previous fit.
        self.lgb_models, self.xgb_models, self.catboost_models = [], [], []

        # (model_type, log label, store for the fitted fold models)
        learners = (
            ("lgb", "  LightGBM -", self.lgb_models),
            ("xgb", "  XGBoost   -", self.xgb_models),
            ("catboost", "  CatBoost  -", self.catboost_models),
        )
        oof = {model_type: np.zeros(len(X)) for model_type, _, _ in learners}

        for fold, (train_idx, val_idx) in enumerate(self.kf.split(X_np, y)):
            print(f"\nFold {fold + 1}/{self.n_splits}")

            X_train, X_val = X_np[train_idx], X_np[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            for model_type, label, store in learners:
                start_time = time.time()
                model, val_preds, val_score = self.train_single_model(
                    X_train, y_train, X_val, y_val, model_type
                )
                store.append(model)
                oof[model_type][val_idx] = val_preds

                print(
                    f"{label} Val RMSE: {val_score:.5f} ({time.time() - start_time:.1f}s)"
                )

        lgb_oof_score = np.sqrt(mean_squared_error(y, oof["lgb"]))
        xgb_oof_score = np.sqrt(mean_squared_error(y, oof["xgb"]))
        catboost_oof_score = np.sqrt(mean_squared_error(y, oof["catboost"]))

        print("\n" + "=" * 50)
        print("Out-of-Fold Results:")
        print(f"  LightGBM:  {lgb_oof_score:.5f}")
        print(f"  XGBoost:   {xgb_oof_score:.5f}")
        print(f"  CatBoost:  {catboost_oof_score:.5f}")

        stacking_features = np.column_stack(
            [oof["lgb"], oof["xgb"], oof["catboost"]])
        self.stacking_model = Ridge(alpha=1.0, random_state=self.random_state)
        self.stacking_model.fit(stacking_features, y)

        ensemble_preds = self.stacking_model.predict(stacking_features)
        ensemble_score = np.sqrt(mean_squared_error(y, ensemble_preds))

        print(f"  Stacked:   {ensemble_score:.5f}")
        print(
            f"\nStacking weights: LGB={self.stacking_model.coef_[0]:.3f}, "
            f"XGB={self.stacking_model.coef_[1]:.3f}, "
            f"CB={self.stacking_model.coef_[2]:.3f}"
        )

        return {
            "lgb_oof": lgb_oof_score,
            "xgb_oof": xgb_oof_score,
            "catboost_oof": catboost_oof_score,
            "ensemble_oof": ensemble_score,
        }, ensemble_preds

    def _predict_base_ensemble(self, X: pd.DataFrame) -> np.ndarray:
        """Average the fold models of each tree learner and apply the stacker.

        Each learner's prediction is the mean over the models actually
        stored, so the result stays correct when the number of stored models
        differs from ``n_splits`` (for example after :meth:`load_lgbm_only`).
        A learner without stored models contributes zeros.
        """
        X_np = X.values if isinstance(X, pd.DataFrame) else X

        def _fold_mean(models):
            if not models:
                return np.zeros(len(X))
            return np.mean([model.predict(X_np) for model in models], axis=0)

        stacking_features = np.column_stack([
            _fold_mean(self.lgb_models),
            _fold_mean(self.xgb_models),
            _fold_mean(self.catboost_models),
        ])
        return self.stacking_model.predict(stacking_features)

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """
        Predict with the stacked tree ensemble only

        The classifier meta-features and the linear ensemble are not
        re-created here, so the blending weights learned in :meth:`fit` are
        not applied. Because :meth:`fit` trains the tree models on the
        enhanced matrix, ``X`` must already contain the meta-feature columns
        (``nb_prob_*``, ``nb_weighted``, ``mlp_prob_*``, ``mlp_weighted``).

        Args:
            X: Input features

        Returns:
            Stacked base-ensemble predictions
        """
        features = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        return self._predict_base_ensemble(features)

    def save(self, path: str, use_pickle: bool = True):
        """Write the tree models, the stacker and the split config to ``path``.

        Args:
            path: Target directory, created if missing.
            use_pickle: Serialise with ``pickle`` when true, otherwise with
                ``joblib``. File names are the same in both cases.
        """
        os.makedirs(path, exist_ok=True)

        payloads = {
            "lgb_models.pkl": self.lgb_models,
            "xgb_models.pkl": self.xgb_models,
            "catboost_models.pkl": self.catboost_models,
            "stacking_model.pkl": self.stacking_model,
            "config.pkl": {
                "n_splits": self.n_splits,
                "random_state": self.random_state,
            },
        }

        for filename, payload in payloads.items():
            target = os.path.join(path, filename)
            if use_pickle:
                with open(target, "wb") as f:
                    pickle.dump(payload, f)
            else:
                joblib.dump(payload, target)

        print(
            f"Model saved to {path} using {'pickle' if use_pickle else 'joblib'}")

    def load(self, path: str, use_pickle: bool = True):
        """Restore what :meth:`save` wrote and rebuild the CV splitter.

        Args:
            path: Directory written by :meth:`save`.
            use_pickle: Must match the value used when saving.
        """
        def _read(filename):
            # SECURITY: unpickling executes arbitrary code; only load
            # artefacts from a trusted source.
            source = os.path.join(path, filename)
            if use_pickle:
                with open(source, "rb") as f:
                    return pickle.load(f)
            return joblib.load(source)

        self.lgb_models = _read("lgb_models.pkl")
        self.xgb_models = _read("xgb_models.pkl")
        self.catboost_models = _read("catboost_models.pkl")
        self.stacking_model = _read("stacking_model.pkl")
        config = _read("config.pkl")

        self.n_splits = config["n_splits"]
        self.random_state = config["random_state"]
        # Keep the splitter in step with the restored configuration.
        self.kf = KFold(n_splits=self.n_splits, shuffle=True,
                        random_state=self.random_state)

        print(
            f"Model loaded from {path} using {'pickle' if use_pickle else 'joblib'}")

    def load_lgbm_only(self, filepath: str):
        """Load only the pickled list of LightGBM fold models from ``filepath``."""
        # SECURITY: unpickling executes arbitrary code; only load trusted files.
        with open(filepath, "rb") as f:
            self.lgb_models = pickle.load(f)

        print(f"LightGBM models loaded from {filepath}")


def train_enhanced_ensemble(
    data_path: str = "../data/",
    save_path: str = "../data/",
    n_splits: int = 5,
    random_state: int = 42,
) -> pd.DataFrame:
    """Train the enhanced ensemble on the preprocessed CSVs and write a submission.

    Reads ``train_preprocessed.csv`` and ``test_preprocessed.csv`` from
    ``data_path``, fits an ``EnhancedEnsembleModel``, pickles the fitted model
    under ``save_path`` and writes ``submission_enhanced_ensemble.csv`` there.

    Args:
        data_path: Directory holding the preprocessed CSVs. Falls back to
            ``"data/"`` when the given path does not exist.
        save_path: Directory receiving the saved model and the submission.
        n_splits: Forwarded to ``EnhancedEnsembleModel``.
        random_state: Forwarded to ``EnhancedEnsembleModel``.

    Returns:
        The submission frame with columns ``id`` and ``score``.
    """
    if not os.path.exists(data_path):
        data_path = "data/"

    # os.path.join yields the same paths as before for directories that end
    # in a separator, and still works when the caller omits it.
    train_file = os.path.join(data_path, "train_preprocessed.csv")
    test_file = os.path.join(data_path, "test_preprocessed.csv")

    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train = df_train.drop("id", axis=1)
    test_ids = df_test["id"]
    df_test = df_test.drop("id", axis=1)

    # The target is taken positionally: the last column of the training CSV.
    X = df_train.iloc[:, :-1]
    y = df_train.iloc[:, -1].values
    X_test = df_test

    print(f"Training data shape: {X.shape}")
    print(f"Test data shape: {X_test.shape}")

    model = EnhancedEnsembleModel(
        n_splits=n_splits,
        random_state=random_state,
    )

    # Only the test predictions are needed here.
    _, _, test_preds = model.fit(X, y, X_test)

    model_save_path = os.path.join(save_path, "model_enhanced_ensemble")
    model.save(model_save_path, use_pickle=True)

    submission = pd.DataFrame({"id": test_ids, "score": test_preds})
    submission_path = os.path.join(save_path, "submission_enhanced_ensemble.csv")
    submission.to_csv(submission_path, index=False)

    print(f"\nSubmission saved to {submission_path}")
    print("Submission preview:")
    print(submission.head())
    print("\nScore statistics:")
    print(submission["score"].describe())

    return submission


In [None]:
# Features from the 38th notebook
import pandas as pd
import numpy as np
import polars as pl
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.manifold import TSNE
from scipy.stats import skew, kurtosis
from tqdm import tqdm
import warnings
import pickle
import os

# Suppress every warning category from here on (process-wide side effect).
warnings.filterwarnings("ignore")


class CharacterNGramFeatureExtractor:
    """Character n-gram count features for essays.

    Wraps a ``CountVectorizer`` configured with the given ``ngram_range``,
    ``analyzer`` and ``max_features``.
    """

    def __init__(self, ngram_range=(1, 4), analyzer="char_wb", max_features=None):
        self.ngram_range = ngram_range
        self.analyzer = analyzer
        self.max_features = max_features
        self.vectorizer = CountVectorizer(
            ngram_range=ngram_range, analyzer=analyzer, max_features=max_features
        )

    def fit_transform(self, train_essays, test_essays):
        """Fit the vocabulary on train+test essays and vectorise both sets.

        Args:
            train_essays: Frame with ``id`` and ``essay`` columns.
            test_essays: Frame with ``id`` and ``essay`` columns.

        Returns:
            ``(train_features, test_features)``. Each frame has one integer
            column ``char_ngram_<i>`` per vocabulary entry followed by an
            ``id`` column, with rows in the same order as the input frame.
        """
        # The vocabulary is learnt from both sets so train and test share
        # identical columns.
        combined = pd.concat([train_essays["essay"], test_essays["essay"]])
        self.vectorizer.fit(combined)

        return self._count_frame(train_essays), self._count_frame(test_essays)

    def _count_frame(self, essays):
        """Vectorise ``essays["essay"]`` into a count frame with an ``id`` column."""
        counts = self.vectorizer.transform(essays["essay"])
        columns = [f"char_ngram_{i}" for i in range(counts.shape[1])]
        # Build the whole frame at once: inserting columns one by one is slow,
        # fragments the DataFrame, and required int() on 1x1 matrices.
        features = pd.DataFrame(
            counts.toarray().astype(np.int64, copy=False), columns=columns
        )
        features["id"] = essays["id"].values
        return features


class LDATopicFeatureExtractor:
    """Topic-distribution features from three LDA models fitted on the essays.

    Each variant pairs a differently configured ``CountVectorizer`` with its
    own ``LatentDirichletAllocation`` model. Fitted vectorizers and models
    are kept in ``self.vectorizers`` / ``self.models`` keyed by the variant's
    column prefix.
    """

    def __init__(self, n_topics=6, max_iter=10, random_state=42):
        self.n_topics = n_topics
        self.max_iter = max_iter
        self.random_state = random_state
        self.models = {}
        self.vectorizers = {}

    def fit_transform(self, train_essays, test_essays):
        """Fit all LDA variants on train+test and return their topic weights.

        Args:
            train_essays: Frame with ``id`` and ``essay`` columns.
            test_essays: Frame with ``id`` and ``essay`` columns.

        Returns:
            ``(train_topics, test_topics)``: an ``id`` column followed by
            ``n_topics`` columns ``<prefix>_<k>`` for each variant.
        """
        train_topics = pd.DataFrame({"id": train_essays["id"]})
        test_topics = pd.DataFrame({"id": test_essays["id"]})

        # Combine for fitting
        combined = pd.concat([train_essays, test_essays])

        variants = (
            # 1 stopwords
            ("word_topics", CountVectorizer(stop_words="english")),
            # 2 char wb
            ("char_topics", CountVectorizer(analyzer="char_wb")),
            # 3. ngram = (5, 6)
            (
                "char_ngram_topics",
                CountVectorizer(analyzer="char_wb", ngram_range=(5, 6)),
            ),
        )
        for prefix, vectorizer in variants:
            train_topics, test_topics = self._fit_lda_variant(
                combined,
                train_essays,
                test_essays,
                train_topics,
                test_topics,
                prefix,
                vectorizer,
            )

        return train_topics, test_topics

    def _fit_lda_variant(
        self,
        combined_essays,
        train_essays,
        test_essays,
        train_topics,
        test_topics,
        prefix,
        vectorizer,
    ):
        """Fit one vectorizer/LDA pair and append its topic columns.

        The vectorizer is fitted on ``combined_essays["essay"]``; the LDA is
        fitted on the stacked train and test count matrices. ``train_topics``
        and ``test_topics`` are modified in place and also returned.
        """
        # Local import: only scipy.stats is imported at file level.
        from scipy import sparse

        vectorizer.fit(combined_essays["essay"])
        self.vectorizers[prefix] = vectorizer

        # Keep the counts sparse: LDA accepts sparse input, so there is no
        # need to materialise dense (n_essays x vocabulary) tables.
        train_counts = vectorizer.transform(train_essays["essay"])
        test_counts = vectorizer.transform(test_essays["essay"])

        lda = LatentDirichletAllocation(
            n_components=self.n_topics,
            max_iter=self.max_iter,
            random_state=self.random_state,
            verbose=False,
        )
        lda.fit(sparse.vstack([train_counts, test_counts], format="csr"))

        self.models[prefix] = lda

        topic_cols = [f"{prefix}_{i}" for i in range(self.n_topics)]
        train_topics[topic_cols] = lda.transform(train_counts)
        test_topics[topic_cols] = lda.transform(test_counts)

        return train_topics, test_topics


class TSNEFeatureExtractor:
    """t-SNE embedding features computed jointly over train and test rows.

    One ``TSNE`` model is run per perplexity; each contributes
    ``n_components`` columns named ``tsne_p<perplexity>_<i>``.
    """

    def __init__(
        self, perplexities=(20, 50, 80), n_components=2, random_state=42, n_jobs=-1
    ):
        # Copy into a fresh list so instances never share (or mutate) the
        # default or the caller's sequence.
        self.perplexities = list(perplexities)
        self.n_components = n_components
        self.random_state = random_state
        self.n_jobs = n_jobs

    def fit_transform(self, train_features, test_features):
        """Embed train and test rows together and split the result back.

        Args:
            train_features: Feature frame for the training rows.
            test_features: Feature frame for the test rows.

        Returns:
            ``(train_tsne, test_tsne)`` frames holding the embedding columns,
            row-aligned by position with the inputs.
        """
        train_tsne = pd.DataFrame()
        test_tsne = pd.DataFrame()

        # Loop-invariant: the split point and the NaN-free stacked matrix.
        train_size = len(train_features)
        combined = pd.concat([train_features, test_features]).fillna(0)

        print("tsne")
        for perplexity in self.perplexities:
            tsne = TSNE(
                n_components=self.n_components,
                random_state=self.random_state,
                perplexity=perplexity,
                n_jobs=self.n_jobs,
                verbose=False,
            )

            embeddings = tsne.fit_transform(combined)

            for i in range(self.n_components):
                col_name = f"tsne_p{perplexity}_{i}"
                train_tsne[col_name] = embeddings[:train_size, i]
                test_tsne[col_name] = embeddings[train_size:, i]

        return train_tsne, test_tsne


class PolarsFeatureExtractor:
    """Per-``id`` aggregate features computed from an event log with polars.

    The log is expected to provide the columns read below: ``id``,
    ``activity``, ``down_event``, ``up_event``, ``text_change``,
    ``down_time``, ``up_time``, ``action_time``, ``cursor_position`` and
    ``word_count``.
    """

    def __init__(self):
        # Numeric columns summarised by _numerical_stats.
        self.num_cols = [
            "down_time",
            "up_time",
            "action_time",
            "cursor_position",
            "word_count",
        ]
        # Values counted in the "activity" column.
        self.activities = ["Input", "Remove/Cut", "Nonproduction", "Replace", "Paste"]
        # Values counted in both the "down_event" and "up_event" columns.
        self.events = [
            "q",
            "Space",
            "Backspace",
            "Shift",
            "ArrowRight",
            "Leftclick",
            "ArrowLeft",
            ".",
            ",",
            "ArrowDown",
            "ArrowUp",
            "Enter",
            "CapsLock",
            "'",
            "Delete",
            "Unidentified",
        ]
        # Values counted in the "text_change" column.
        self.text_changes = [
            "q",
            " ",
            ".",
            ",",
            "\n",
            "'",
            '"',
            "-",
            "?",
            ";",
            "=",
            "/",
            "\\",
            ":",
            "n",
        ]

    def extract_features(self, df):
        """Build the full per-id feature table.

        Args:
            df: Event log as a pandas or polars DataFrame; a pandas frame is
                converted to polars first.

        Returns:
            A pandas DataFrame with an ``id`` column plus every feature group.
            All groups are left-joined onto the activity-count table, so ids
            missing from a later group get nulls for that group's columns.
        """
        if isinstance(df, pd.DataFrame):
            df = pl.from_pandas(df)

        feats = self._count_by_values(df, "activity", self.activities)
        feats = feats.join(
            self._count_by_values(df, "text_change", self.text_changes),
            on="id",
            how="left",
        )
        feats = feats.join(
            self._count_by_values(df, "down_event", self.events), on="id", how="left"
        )
        feats = feats.join(
            self._count_by_values(df, "up_event", self.events), on="id", how="left"
        )
        feats = feats.join(self._input_word_stats(df), on="id", how="left")
        feats = feats.join(self._numerical_stats(df), on="id", how="left")
        feats = feats.join(self._categorical_stats(df), on="id", how="left")
        feats = feats.join(self._idle_time_features(df), on="id", how="left")
        feats = feats.join(self._p_burst_features(df), on="id", how="left")
        feats = feats.join(self._r_burst_features(df), on="id", how="left")

        return feats.to_pandas()

    def _count_by_values(self, df, colname, values):
        """Count, per id, the rows where ``colname`` equals each of ``values``.

        Output columns are named ``<colname>_<i>_cnt`` where ``i`` is the
        position of the value in ``values`` (not the value itself).
        """
        # Start from the ids in first-seen order, then attach one count per value.
        fts = df.select(pl.col("id").unique(maintain_order=True))
        for i, value in enumerate(values):
            tmp_df = df.group_by("id").agg(
                pl.col(colname).is_in([value]).sum().alias(f"{colname}_{i}_cnt")
            )
            fts = fts.join(tmp_df, on="id", how="left")
        return fts

    def _input_word_stats(self, df):
        """Count and length statistics of the ``q``-runs typed per id.

        Rows whose ``text_change`` contains ``=>`` or equals ``NoChange`` are
        ignored; the remaining ``text_change`` values are concatenated per id
        and every maximal run of ``q`` characters is treated as one word.
        """
        temp = df.filter(
            (~pl.col("text_change").str.contains("=>"))
            & (pl.col("text_change") != "NoChange")
        )
        temp = temp.group_by("id").agg(
            pl.col("text_change").str.concat("").str.extract_all(r"q+")
        )
        # Each lambda receives the list of q-runs for one id; when the list
        # is empty the statistic is computed on the scalar 0 instead.
        temp = temp.with_columns(
            input_word_count=pl.col("text_change").list.len(),
            input_word_length_mean=pl.col("text_change").map_elements(
                lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0),
                return_dtype=pl.Float64,
            ),
            input_word_length_max=pl.col("text_change").map_elements(
                lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0),
                return_dtype=pl.Float64,
            ),
            input_word_length_std=pl.col("text_change").map_elements(
                lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0),
                return_dtype=pl.Float64,
            ),
            input_word_length_median=pl.col("text_change").map_elements(
                lambda x: np.median([len(i) for i in x] if len(x) > 0 else 0),
                return_dtype=pl.Float64,
            ),
            input_word_length_skew=pl.col("text_change").map_elements(
                lambda x: skew([len(i) for i in x] if len(x) > 0 else 0),
                return_dtype=pl.Float64,
            ),
        )
        return temp.drop("text_change")

    def _numerical_stats(self, df):
        """Per-id sum of ``action_time`` plus summary stats of ``self.num_cols``."""
        return df.group_by("id").agg(
            [
                pl.sum("action_time").alias("action_time_sum"),
                *[pl.mean(col).alias(f"{col}_mean") for col in self.num_cols],
                *[pl.std(col).alias(f"{col}_std") for col in self.num_cols],
                *[pl.median(col).alias(f"{col}_median") for col in self.num_cols],
                *[pl.min(col).alias(f"{col}_min") for col in self.num_cols],
                *[pl.max(col).alias(f"{col}_max") for col in self.num_cols],
                # NOTE(review): this is the 0.5 quantile, i.e. it targets the
                # same statistic as the "_median" columns above - confirm
                # whether a different quantile was intended.
                *[
                    pl.quantile(col, 0.5).alias(f"{col}_quantile")
                    for col in self.num_cols
                ],
            ]
        )

    def _categorical_stats(self, df):
        """Per-id number of distinct values in the four categorical columns."""
        return df.group_by("id").agg(
            [
                pl.n_unique("activity").alias("activity_nunique"),
                pl.n_unique("down_event").alias("down_event_nunique"),
                pl.n_unique("up_event").alias("up_event_nunique"),
                pl.n_unique("text_change").alias("text_change_nunique"),
            ]
        )

    def _idle_time_features(self, df):
        """Per-id statistics of the gap between consecutive events.

        The gap is ``abs(down_time - previous up_time) / 1000`` within each id
        (presumably milliseconds converted to seconds - TODO confirm units).
        It is computed on all events, then only ``Input`` and ``Remove/Cut``
        rows are kept for aggregation.
        """
        # Previous event's up_time within the same id; null for the first event.
        temp = df.with_columns(
            pl.col("up_time").shift().over("id").alias("up_time_lagged")
        )
        # The first event of each id has no predecessor, so its gap becomes 0.
        temp = temp.with_columns(
            (pl.col("down_time") - pl.col("up_time_lagged"))
            .abs()
            .truediv(1000)
            .fill_null(0)
            .alias("time_diff")
        )
        temp = temp.filter(pl.col("activity").is_in(["Input", "Remove/Cut"]))

        # NOTE(review): every pause bin uses strict inequalities on both
        # sides, so gaps exactly equal to 0.5, 1, 1.5, 2 or 3 fall into no
        # bin - confirm this is intended.
        return temp.group_by("id").agg(
            [
                pl.max("time_diff").alias("inter_key_largest_latency"),
                pl.median("time_diff").alias("inter_key_median_latency"),
                pl.mean("time_diff").alias("mean_pause_time"),
                pl.std("time_diff").alias("std_pause_time"),
                pl.sum("time_diff").alias("total_pause_time"),
                pl.col("time_diff")
                .filter((pl.col("time_diff") > 0.5) & (pl.col("time_diff") < 1))
                .count()
                .alias("pauses_half_sec"),
                pl.col("time_diff")
                .filter((pl.col("time_diff") > 1) & (pl.col("time_diff") < 1.5))
                .count()
                .alias("pauses_1_sec"),
                pl.col("time_diff")
                .filter((pl.col("time_diff") > 1.5) & (pl.col("time_diff") < 2))
                .count()
                .alias("pauses_1_half_sec"),
                pl.col("time_diff")
                .filter((pl.col("time_diff") > 2) & (pl.col("time_diff") < 3))
                .count()
                .alias("pauses_2_sec"),
                pl.col("time_diff")
                .filter(pl.col("time_diff") > 3)
                .count()
                .alias("pauses_3_sec"),
            ]
        )

    def _p_burst_features(self, df):
        """Per-id statistics of "P-bursts": runs of events separated by short gaps.

        Uses the same gap definition as ``_idle_time_features`` and flags an
        ``Input``/``Remove/Cut`` event when its gap is below 2.
        """
        temp = df.with_columns(
            pl.col("up_time").shift().over("id").alias("up_time_lagged")
        )
        temp = temp.with_columns(
            (pl.col("down_time") - pl.col("up_time_lagged"))
            .abs()
            .truediv(1000)
            .fill_null(0)
            .alias("time_diff")
        )
        temp = temp.filter(pl.col("activity").is_in(["Input", "Remove/Cut"]))
        # From here on "time_diff" is a boolean flag: gap shorter than 2.
        temp = temp.with_columns((pl.col("time_diff") < 2).alias("time_diff"))
        # Windowed by run id of the flag: the expression appears to leave one
        # non-null value (the run's row count) on the last row of each run of
        # True flags and null elsewhere - TODO confirm against the installed
        # polars version.
        # NOTE(review): the window is keyed by rle_id() alone, not by "id",
        # so a run can span the boundary between two ids - confirm intended.
        temp = temp.with_columns(
            pl.when(pl.col("time_diff") & pl.col("time_diff").is_last_distinct())
            .then(pl.count())
            .over(pl.col("time_diff").rle_id())
            .alias("P_bursts")
        )
        # Drops rows with a null in ANY column, not only in "P_bursts".
        temp = temp.drop_nulls()

        return temp.group_by("id").agg(
            [
                pl.mean("P_bursts").alias("P_bursts_mean"),
                pl.std("P_bursts").alias("P_bursts_std"),
                pl.count("P_bursts").alias("P_bursts_count"),
                pl.median("P_bursts").alias("P_bursts_median"),
                pl.max("P_bursts").alias("P_bursts_max"),
                pl.first("P_bursts").alias("P_bursts_first"),
                pl.last("P_bursts").alias("P_bursts_last"),
            ]
        )

    def _r_burst_features(self, df):
        """Per-id statistics of "R-bursts": runs of consecutive ``Remove/Cut`` events.

        Only ``Input`` and ``Remove/Cut`` rows are considered when forming runs.
        """
        temp = df.filter(pl.col("activity").is_in(["Input", "Remove/Cut"]))
        # From here on "activity" is a boolean flag: the event is a Remove/Cut.
        temp = temp.with_columns(
            pl.col("activity").is_in(["Remove/Cut"]).alias("activity")
        )
        # Same run-length construction as in _p_burst_features.
        # NOTE(review): the window is keyed by rle_id() alone, not by "id",
        # so a run can span the boundary between two ids - confirm intended.
        temp = temp.with_columns(
            pl.when(pl.col("activity") & pl.col("activity").is_last_distinct())
            .then(pl.count())
            .over(pl.col("activity").rle_id())
            .alias("R_bursts")
        )
        # Drops rows with a null in ANY column, not only in "R_bursts".
        temp = temp.drop_nulls()

        return temp.group_by("id").agg(
            [
                pl.mean("R_bursts").alias("R_bursts_mean"),
                pl.std("R_bursts").alias("R_bursts_std"),
                pl.median("R_bursts").alias("R_bursts_median"),
                pl.max("R_bursts").alias("R_bursts_max"),
                pl.first("R_bursts").alias("R_bursts_first"),
                pl.last("R_bursts").alias("R_bursts_last"),
            ]
        )


In [None]:
import os
import sys
import pandas as pd
import numpy as np
import warnings
from pathlib import Path

# Suppress every warning category from here on (process-wide side effect).
warnings.filterwarnings("ignore")


def run_integration_pipeline(
    data_path="data/",
    use_existing_features=True,
    use_advanced_ensemble=True,
    use_tabpfn=False,
    n_splits=5,
    random_state=42,
):
    """Build every feature group, train the ensemble and return a submission.

    Inputs are taken from notebook-level names rather than from arguments:

    * ``TRAIN_DATA_PATH`` / ``TEST_DATA_PATH``: raw event logs (CSV).
    * ``TRAIN_DATA_PREPROCESSED_PATH``: preprocessed training features (CSV)
      with ``id`` and ``score`` columns; ``score`` is the target ``y``.
    * ``merged_df``: preprocessed test features with an ``id`` column.
    * ``extracted_text_train`` / ``extracted_text``: reconstructed (masked)
      essays with ``id`` and ``text`` columns.

    Feature groups are appended column-wise in this order: character n-grams
    (top 500 by training variance), LDA topics, polars log aggregates, t-SNE
    embeddings of everything before it. Every group is aligned to the id
    order of the preprocessed feature tables before it is appended.

    Args:
        data_path: Currently unused.
        use_existing_features: Currently unused.
        use_advanced_ensemble: Currently unused.
        use_tabpfn: Currently unused.
        n_splits: Forwarded to ``EnhancedEnsembleModel``.
        random_state: Forwarded to the LDA, t-SNE and ensemble steps.

    Returns:
        DataFrame with columns ``id`` and ``score`` for the test essays.

    Raises:
        KeyError: If an id of the preprocessed tables is missing from the
            essays or from the polars log features.
    """
    train_logs = pd.read_csv(TRAIN_DATA_PATH)
    test_logs = pd.read_csv(TEST_DATA_PATH)
    print("Loaded cleaned csv")

    train_essays = extracted_text_train.rename(columns={"text": "essay"})
    test_essays = extracted_text[["id", "text"]].rename(columns={"text": "essay"})
    print("Loaded reconstructed essays")

    train_basic = pd.read_csv(TRAIN_DATA_PREPROCESSED_PATH)
    test_basic = merged_df
    print("Loaded existing preprocessed features")

    # Extract IDs and scores
    train_ids = train_basic["id"].values
    test_ids = test_basic["id"].values
    y = train_basic["score"].values

    train_basic = train_basic.drop(columns=["id", "score"], errors="ignore")
    test_basic = test_basic.drop(columns=["id"], errors="ignore")
    print("Loaded Basic Features")

    # Essay-derived features are appended by row position below, so the
    # essays must follow the same id order as the preprocessed tables
    # (the polars features were already aligned this way).
    train_essays = _align_rows_by_id(train_essays, train_ids)
    test_essays = _align_rows_by_id(test_essays, test_ids)

    # ngram
    char_ngram_extractor = CharacterNGramFeatureExtractor(
        ngram_range=(1, 4),
        analyzer="char_wb",
        max_features=1200,
    )
    train_char_ngrams, test_char_ngrams = char_ngram_extractor.fit_transform(
        train_essays, test_essays
    )
    train_char_ngrams = train_char_ngrams.drop(columns=["id"])
    test_char_ngrams = test_char_ngrams.drop(columns=["id"])

    # Keep the 500 n-gram columns with the largest variance on the train set.
    top_features = train_char_ngrams.var().nlargest(500).index
    train_basic = _append_columns(train_basic, train_char_ngrams[top_features])
    test_basic = _append_columns(test_basic, test_char_ngrams[top_features])
    print("N grams complete")

    # lda
    lda_extractor = LDATopicFeatureExtractor(
        n_topics=6, max_iter=10, random_state=random_state
    )
    train_lda, test_lda = lda_extractor.fit_transform(train_essays, test_essays)
    train_basic = _append_columns(train_basic, train_lda.drop(columns=["id"]))
    test_basic = _append_columns(test_basic, test_lda.drop(columns=["id"]))

    # polars
    polars_extractor = PolarsFeatureExtractor()
    train_polars = _align_rows_by_id(
        polars_extractor.extract_features(train_logs), train_ids
    ).drop(columns=["id"])
    test_polars = _align_rows_by_id(
        polars_extractor.extract_features(test_logs), test_ids
    ).drop(columns=["id"])

    # Skip polars columns whose names already exist among the features.
    polars_cols = [c for c in train_polars.columns if c not in train_basic.columns]
    train_basic = _append_columns(train_basic, train_polars[polars_cols])
    test_basic = _append_columns(test_basic, test_polars[polars_cols])

    # tsne
    tsne_extractor = TSNEFeatureExtractor(
        perplexities=[20, 50, 80],
        n_components=2,
        random_state=random_state,
        n_jobs=-1,
    )
    train_tsne, test_tsne = tsne_extractor.fit_transform(train_basic, test_basic)
    train_basic = _append_columns(train_basic, train_tsne)
    test_basic = _append_columns(test_basic, test_tsne)
    print("Extra Features completed")

    # training
    ensemble = EnhancedEnsembleModel(n_splits=n_splits, random_state=random_state)

    print("fit on data")
    _, _, test_preds = ensemble.fit(train_basic, y, test_basic)

    submission = pd.DataFrame({"id": test_ids, "score": test_preds})
    print("submission complete")
    return submission


def _align_rows_by_id(frame, ids):
    """Return ``frame`` re-ordered so that row ``k`` carries ``ids[k]`` (keeps ``id``)."""
    return frame.set_index("id").loc[ids].reset_index()


def _append_columns(base, extra):
    """Concatenate two frames side by side by row position, ignoring their indexes."""
    return pd.concat(
        [base.reset_index(drop=True), extra.reset_index(drop=True)], axis=1
    )




# Run the whole pipeline with its default settings; the resulting frame is
# written to SUBMISSION_PATH at the end of the notebook.
submission = run_integration_pipeline()


## Test Submission only

In [None]:
# === TEST ONLY: write submission (the load model(s) -> predict on test path below is commented out; `submission` comes from run_integration_pipeline() above) ===
import os
import joblib
import numpy as np
import pandas as pd
import time

# ---------------- paths ----------------


# ---------------- load test ----------------
# df_test = merged_df
# test_ids = df_test["id"].values
# X_test = df_test.drop(columns=["id"])


# model_file_candidate = "/kaggle/input/lgbm-ensemble/scikitlearn/default/1/lgbm.pkl"


# model_obj = None
# if os.path.exists(model_file_candidate):
#     model_obj = joblib.load(model_file_candidate)
#     print(f"Loaded model from: {model_file_candidate}")
# if model_obj is None:
#     raise FileNotFoundError("No saved model found. Expected one of: " + ", ".join(model_file_candidate))

# def predict_with_model_obj(model_obj, X):
#     # Multiple Fold Model
#     if isinstance(model_obj, (list, tuple)):
#         preds = np.mean([m.predict(X) for m in model_obj], axis=0)
#         return preds
#     # Single Model
#     return model_obj.predict(X)

# test_preds = predict_with_model_obj(model_obj,X_test)

# # ---------------- write submission ----------------
# submission = pd.DataFrame({"id": test_ids, "score": test_preds})
# Persist the frame returned by run_integration_pipeline() and log the local
# wall-clock time of the write.
submission.to_csv(SUBMISSION_PATH, index=False)
print(
    f"Submission saved to: {SUBMISSION_PATH}\n"
    f"Submission Time {time.asctime(time.localtime(time.time()))}"
)