In [22]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import torch.nn as nn
import torch
from torch import optim

import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/linking-writing-processes-to-writing-quality/sample_submission.csv
/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv
/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv
/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv


# ***Competition Overview:***

**Dataset:** The competition dataset comprises approximately 5000 logs of user inputs. These logs are generated while users are composing essays.
**Task:** The goal of this competition is to predict the score that an essay received based on the log of user inputs. The scores are on a scale of 0 to 6, indicating the quality or effectiveness of the essay.
**File and Field Information:**
The competition provides a CSV file called train_logs.csv, which contains the following fields:

* id: This is a unique identifier for each essay.
* event_id: An index indicating the order of events in the log, ordered chronologically.
* down_time: The time when the down event (e.g., keypress or mouse click) occurred, measured in milliseconds.
* up_time: The time when the up event (e.g., key release or mouse release) occurred, measured in milliseconds.
* action_time: The duration of the event, which is the difference between down_time and up_time.
* activity: The category of activity that the event belongs to. It can have values like "Nonproduction," "Input," "Remove/Cut," "Paste," "Replace," or "Move From [x1, y1] To [x2, y2]".
* down_event: The name of the event when the key or mouse is pressed.
* up_event: The name of the event when the key or mouse is released.
* text_change: The text that changed as a result of the event (if any). This field represents the alteration made to the essay text.
* cursor_position: The character index of the text cursor after the event.
* word_count: The word count of the essay after the event.

**Objective:**
Participants in this competition are tasked with using the provided log data to build a predictive model. This model should take the log events as input and predict the essay's score on the 0 to 6 scale.

In [3]:
df = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv')
df_score = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv')

In [4]:
test = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv')

In [23]:
test.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,0000aaaa,1,338433,338518,85,Input,Space,Space,,0,0
1,0000aaaa,2,760073,760160,87,Input,Space,Space,,1,0
2,2222bbbb,1,711956,712023,67,Input,q,q,q,0,1
3,2222bbbb,2,290502,290548,46,Input,q,q,q,1,1
4,4444cccc,1,635547,635641,94,Input,Space,Space,,0,0


In [6]:
df.head(2)

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0


In [7]:
df_score.head(5)

Unnamed: 0,id,score
0,001519c8,3.5
1,0022f953,3.5
2,0042269b,6.0
3,0059420b,2.0
4,0075873a,4.0


In [9]:
df.shape, df.activity.nunique()

((8405898, 11), 50)

In [10]:
df[df['id']=='001519c8']

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1
...,...,...,...,...,...,...,...,...,...,...,...
2552,001519c8,2553,1781786,1781841,55,Remove/Cut,Backspace,Backspace,q,555,255
2553,001519c8,2554,1781917,1781991,74,Remove/Cut,Backspace,Backspace,q,554,255
2554,001519c8,2555,1782062,1782141,79,Remove/Cut,Backspace,Backspace,q,553,255
2555,001519c8,2556,1782922,1782985,63,Input,q,q,q,554,255


#### Activity 
* Nonproduction - The event does not alter the text in any way
* Input - The event adds text to the essay
* Remove/Cut - The event removes text from the essay
* Paste - The event changes the text through a paste input
* Replace - The event replaces a section of text with another string
* Move From [x1, y1] To [x2, y2] - The event moves a section of text spanning character index x1, y1 to a new location x2, y2

In [11]:
df.activity.value_counts()

activity
Input                                     6726796
Remove/Cut                                 970158
Nonproduction                              703851
Replace                                      4448
Paste                                         599
Move From [1306, 1371] To [1061, 1126]          2
Move From [13, 65] To [9, 61]                   1
Move From [274, 314] To [299, 339]              1
Move From [624, 625] To [845, 846]              1
Move From [1861, 2063] To [1766, 1968]          1
Move From [1766, 1968] To [1861, 2063]          1
Move From [2091, 2179] To [252, 340]            1
Move From [923, 1077] To [340, 494]             1
Move From [0, 1] To [590, 591]                  1
Move From [999, 1000] To [1000, 1001]           1
Move From [0, 75] To [1, 76]                    1
Move From [1651, 1769] To [1565, 1683]          1
Move From [61, 136] To [0, 75]                  1
Move From [623, 632] To [624, 633]              1
Move From [75, 134] To [304, 363]        

In [10]:
def move_from(x):
    if 'Move From' in x:
        return 'Move From'
    else:
        return x 

In [11]:
df.activity  = df['activity'].apply(lambda x : move_from(x))

In [12]:
test.activity  = test['activity'].apply(lambda x : move_from(x))

In [34]:
#df.activity.value_counts()
unique_down_events = list(df['down_event'].unique())
print(unique_down_events)
unique_activities = list(df['activity'].unique())
unique_activities.append('Unidentified')
print(unique_activities)



['Leftclick', 'Shift', 'q', 'Space', 'Backspace', '.', ',', 'Enter', 'ArrowLeft', "'", ';', 'ArrowRight', '-', '?', 'Tab', '"', 'ArrowUp', 'ArrowDown', 'Rightclick', '=', 'CapsLock', 'Control', 'c', 'v', '/', 'Delete', ':', 'z', '[', '$', '(', ')', '+', 'Home', 'End', '\\', 'Meta', '*', '&', 'AudioVolumeMute', 'x', '!', 'Insert', 'MediaPlayPause', 'NumLock', '%', 'V', '>', 'Alt', 'AudioVolumeUp', 'ContextMenu', 'AudioVolumeDown', 'a', '<', 'PageDown', ']', 'Middleclick', '@', 'F12', 'j', '\x96', 'Dead', 't', 's', 'n', 'y', '{', 'ScrollLock', '¿', 'Process', '}', 'MediaTrackPrevious', 'MediaTrackNext', 'F3', '^', 'Unidentified', 'Cancel', '2', 'i', 'd', 'r', 'e', '`', '\x9b', 'm', '#', '~', 'PageUp', 'T', 'A', 'b', 'S', 'ModeChange', '_', 'Escape', 'F11', 'Unknownclick', 'AltGraph', 'F10', 'h', 'F15', 'Clear', 'OS', 'F', 'C', 'o', 'Ä±', 'f', 'u', 'w', 'p', 'g', 'M', 'l', '|', 'â\x80\x93', 'I', '0', '1', '5', '\x97', 'Ë\x86', '¡', '\x80', 'Â´', 'Å\x9f', 'F2', 'ä', 'F1', 'Pause', 'F6']
['

# Creating Essay Dataset

In [None]:
class EssayDataset(Dataset):
    def __init__(self, train_df, score_df):
        self.train_df = train_df
        self.score_df = score_df
        self.unique_ids = train_df['id'].unique()
        self.columns_to_return = ['action_time', 'activity', 'down_event', 'word_count']
        self.down_event_to_index = {event: idx for idx, event in enumerate(unique_down_events)}
        self.unidentified_de_index = down_event_to_index.get('Unidentified', len(unique_down_events))
        self.activity_to_index =  {act: idx for idx, act in enumerate(unique_activities)}
        self.unidentified_a_index = activity_to_index.get('Unidentified', len(unique_activities))
        


    def __len__(self):
        return len(self.unique_ids)

    def __getitem__(self, idx):
        essay_id = self.unique_ids[idx]
        essay_data = self.train_df[self.train_df['id'] == essay_id]
        essay_data = essay_data.sort_values(by='event_id')[self.columns_to_return]
        
        # Normalize and replace categories with indices
        essay_data['action_time'] = (essay_data['action_time'] - essay_data['action_time'].min()) / (essay_data['action_time'].max() - essay_data['action_time'].min())
        essay_data['word_count'] = (essay_data['word_count'] - essay_data['word_count'].min()) / (essay_data['word_count'].max() - essay_data['word_count'].min())
        essay_data['down_event'] = essay_data['down_event'].map(self.down_event_to_index).fillna(self.unidentified_de_index).astype(int)
        essay_data['activity'] = essay_data['activity'].map(self.activity_to_index).fillna(self.unidentified_a_index).astype(int)
        
        essay_score = self.score_df[self.score_df['id'] == essay_id]['score'].values[0]
        essay_tensor = torch.Tensor(essay_data.values)
        return essay_tensor, essay_score

In [None]:
train_dataset = EssayDataset(df, df_score)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, drop_last=True)

# Define Input Embeddings


In [None]:
class InputEssayEmbeddings(nn.Module):
    def __init__(self):
        super(SequenceInputFeatures, self).__init__()

        # Embedding for the categorical columns
        self.embed_activity = nn.Embedding(len(unique_activities), 20)
        self.embed_down_event = nn.Embedding(len(unique_down_events), 20)
        
        # activities: 6
        # down_event: 131

    def forward(self, input_tensor):
        # Slice columns from the input tensor
        action_time_col = x[:, :, 0:1]  # [B, num_events, 1]
        activity_col = x[:, :, 1].long()  # [B, num_events]
        down_event_col = x[:, :, 2].long()    # [B, num_events]
        word_count_col = x[:, :, 3:4]  # [B, num_events, 1]

        # Pass through respective embedding layers
        embed_down_event = self.embed_down_event(down_event_col)  # [B, num_events, embed_dim_down_event]
        embed_activity = self.embed_activity(activity_col)        # [B, num_events, embed_dim_activity]
        
        # Concatenate them with the rest of the tensor
        out = torch.cat([action_time_col, embed_activity, embed_down_event, word_count_col], dim=2)
        return out # [B, num_events, 42]
    
class PositionalEncoding(nn.Module):
    def __init__(self, n_embed, seq_len):
        super().__init__()
        # Dict size
        self.emb = nn.Embedding(seq_len, n_embed)


    def forward(self, x, batched=False):
        """
        :param x: If using batching, should be [batch size, seq len, embedding dim]. Otherwise, [seq len, embedding dim]
        :return: a tensor of the same size with positional embeddings added in
        """
        # Second-to-last dimension will always be sequence length
        input_size = x.shape[-2]
        indices_to_embed = torch.tensor(np.asarray(range(0, input_size))).type(torch.LongTensor)
        if batched:
            # Use unsqueeze to form a [1, seq len, embedding dim] tensor -- broadcasting will ensure that this
            # gets added correctly across the batch
            emb_unsq = self.emb(indices_to_embed).unsqueeze(0)
            return x + emb_unsq
        else:
            return x + self.emb(indices_to_embed)

# Define Architecture Sub Components

In [None]:
class FeedFoward(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
        )

    def forward(self, x):
        return self.net(x)
    
class Head(nn.Module):
    def __init__(self, seq_length, n_embed, num_heads, n_internal):
        super().__init__()
        self.K = nn.Linear(n_embed, n_internal)
        self.Q = nn.Linear(n_embed, n_internal)
        self.V = nn.Linear(n_embed, n_internal)
        self.w0 = nn.Linear(n_internal, n_embed // num_heads)
        

    def forward(self, input_vecs):
        keys = self.K(input_vecs) # B, L, d_internal
        d_k = keys.shape[-1]
        queries = self.Q(input_vecs) # B, L, d_internal
        value = self.V(input_vecs) # B, L, d_internal
        
        weights = torch.matmul(queries, keys.transpose(-2, -1)) * d_k**-0.5# L, L
        attention = torch.softmax(weights, dim=-1)

        logit = torch.matmul(attention , value) # B, L, d_internal
        logit = self.w0(logit)
        return logit

class MultiHeadAttention(nn.Module):

    def __init__(self, seq_length, n_embed, num_heads, n_internal):
        super().__init__()
        self.heads = nn.ModuleList([Head(seq_length, n_embed, num_heads, n_internal) for _ in range(num_heads)])
        
        
    def forward(self, input_vecs):
        cls_tokens = []
        for head in self.heads:
            head_out = head(input_vecs)  
            cls_tokens.append(head_out[:, 0])
        cls_tokens_cat = torch.stack(cls_tokens, dim=1)
        return cls_tokens_cat # B, num_heads, n_embed

class MHAConvolution(nn.Module):
    def __init__(self, seq_length, n_embed, num_heads, n_internal, stride=1):
        super().__init__()
        self.stride = stride
        self.window_size = seq_length
        self.cls_token = nn.Parameter(torch.randn(1, 1, n_embed)) 
        self.pos_embedding = PositionalEncoding(n_embed, num_positions=seq_length)
        self.multi_head_attention = MultiHeadAttention( seq_length+1, n_embed, num_heads, n_internal)
        self.ffwd = FeedFoward(n_embed*num_heads)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, input_vecs, batched=False): # B, long_seq_len, n_embed
        
        outputs = []
        for i in range(0, input_vecs.size(1) - self.window_size + 1, self.stride): # TODO: think about ways to downsample other than stride
            # prepend the cls token to the input
            local_window = input_vecs[:, i:i+self.window_size, :]  # [B, seq_length, embed_size]
            B, L, _ = local_window.size()
            cls_tokens_repeated = self.cls_token.repeat(B, 1, 1) # B, 1, n_embed
            cls_tokens_repeated_heads = cls_tokens_repeated.repeat(1, num_heads, 1) # B, num_heads, n_embed
            local_window = self.pos_embedding(local_window, batched=batched)
            local_window_cls = torch.cat([cls_tokens_repeated, local_window], dim=1)
            attention_out = self.multi_head_attention(self.ln1(local_window_cls)) # B, num_heads, n_embed, one cls embedding vector for each head
            attention_out += cls_tokens_repeated_heads # residual for the cls_tokens
            out = attention_out + self.ffwd(self.ln2(attention_out))
            out = out.view(B, -1) # B, n_heads*n_embed
            outputs.append(out) 
        return torch.stack(outputs, dim=1) # B, (long_seq_len-seq_len)/stride, n_heads(n_embed)
    

# Define the Model

In [None]:
class GlobalAveragingTransformer:
    def __init__(self, seq_len, n_embed, n_internal, num_layers, num_heads, n_scores):
        super().__init__()
        self.seq_len = seq_len
        self.mha_conv = MHAConvolution(seq_len, n_embed, num_heads, n_internal, stride=seq_len//2)
        self.classifier = nn.Linear(n_embed*num_heads, n_scores) # consider adding intermediat ffw layers
        self.embedding = InputEssayEmbeddings() 
        self.adaptive_pool = nn.AdaptiveAvgPool1d(1)
        
    def forward(self, input_tensor, batched=False):
        x = self.embedding(input_tensor)
        x = self.mha_conv(x) 
        x = self.adaptive_pool(x).squeeze(1) # B, n_embed*n_heads
        x = self.classifier(x) # B, n_scores
        if batched:
            return x
        else:
            return x.squeeze(0)

# Create DataLoaders

# Create Model

# Run Training Loop

In [20]:
df_agg = df.groupby(['id','activity']).agg({
    'event_id': np.max,
    'action_time' : [np.mean, np.sum, np.min, np.max],
    'word_count' : np.max,
    'cursor_position' : np.max
}).reset_index()
df_agg.head(10)
#df_agg.colum1ns = [i+"_"+j for i,j in df_agg.columns]

Unnamed: 0_level_0,id,activity,event_id,action_time,action_time,action_time,action_time,word_count,cursor_position
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,amax,mean,sum,amin,amax,amax,amax
0,001519c8,Input,2556,121.259204,243731,29,248,256,1495
1,001519c8,Move From,2516,0.0,0,0,0,256,466
2,001519c8,Nonproduction,2557,154.216667,18506,0,2259,256,1539
3,001519c8,Remove/Cut,2555,81.846523,34130,2,529,256,1482
4,001519c8,Replace,2482,125.142857,876,98,196,254,1448
5,0022f953,Input,2359,122.750774,237891,22,250,323,1613
6,0022f953,Nonproduction,2454,54.255906,13781,0,1758,321,1676
7,0022f953,Paste,2241,71.0,71,71,71,316,696
8,0022f953,Remove/Cut,2368,90.576923,23550,28,502,322,1597
9,0022f953,Replace,2325,98.0,98,98,98,320,80


In [None]:
test = test.groupby(['id','activity']).agg({
    'event_id': np.max,
    'action_time' : [np.mean, np.sum, np.min, np.max],
    'word_count' : np.max,
    'cursor_position' : np.max
}).reset_index()

test.columns = [i+"_"+j for i,j in test.columns]

In [None]:
feat = ['event_id_amax', 'action_time_mean',
       'action_time_sum', 'action_time_amin', 'action_time_amax',
       'word_count_amax', 'cursor_position_amax']
df_pvt = pd.pivot_table(df_agg, values =feat, index =['id_'],
                         columns =['activity_'], aggfunc = np.max).reset_index()

df_pvt.columns = [i+"_"+j for i,j in df_pvt.columns]

In [None]:
remaining_cols = list(set(feat) - set(test.columns))
if len(remaining_cols) != 0:
    for i in remaining_cols:
        test[i] = 0

In [None]:
test = pd.pivot_table(test, values = feat, index =['id_'],
                         columns =['activity_'], aggfunc = np.max).reset_index()

test.columns = [i+"_"+j for i,j in test.columns]

In [None]:
remaining_cols = list(set(df_pvt.columns) - set(test.columns))
if len(remaining_cols) != 0:
    for i in remaining_cols:
        test[i] = 0

In [None]:
test = test[df_pvt.columns]

In [None]:
test = test[df_pvt.columns]
df_pvt = df_pvt.rename(columns={'id__':'id'})
test = test.rename(columns={'id__':'id'})
df_pvt = df_pvt.merge(df_score, on = 'id', how = 'left')

In [None]:
sns.histplot(df_pvt['score'])
plt.figure(figsize=(15, 7))
sns.heatmap(df_pvt.drop(['id'],axis= 1).corr(), annot=False)

In [None]:
cols = ['action_time_amax_Input', 'action_time_amax_Move From',
       'action_time_amax_Nonproduction', 'action_time_amax_Paste',
       'action_time_amax_Remove/Cut', 'action_time_amax_Replace',
       'action_time_amin_Input', 'action_time_amin_Move From',
       'action_time_amin_Nonproduction', 'action_time_amin_Paste',
       'action_time_amin_Remove/Cut', 'action_time_amin_Replace',
       'action_time_mean_Input', 'action_time_mean_Move From',
       'action_time_mean_Nonproduction', 'action_time_mean_Paste',
       'action_time_mean_Remove/Cut', 'action_time_mean_Replace',
       'action_time_sum_Input', 'action_time_sum_Move From',
       'action_time_sum_Nonproduction', 'action_time_sum_Paste',
       'action_time_sum_Remove/Cut', 'action_time_sum_Replace',
       'cursor_position_amax_Input', 'cursor_position_amax_Move From',
       'cursor_position_amax_Nonproduction', 'cursor_position_amax_Paste',
       'cursor_position_amax_Remove/Cut', 'cursor_position_amax_Replace',
       'event_id_amax_Input', 'event_id_amax_Move From',
       'event_id_amax_Nonproduction', 'event_id_amax_Paste',
       'event_id_amax_Remove/Cut', 'event_id_amax_Replace',
       'word_count_amax_Input', 'word_count_amax_Move From',
       'word_count_amax_Nonproduction', 'word_count_amax_Paste',
       'word_count_amax_Remove/Cut', 'word_count_amax_Replace',]

In [None]:
for col in cols:
    print(f'Plots for {col}')
    plt.figure(figsize=(15, 5))

    plt.subplot(1,3, 1)  
    sns.boxplot(y=df_pvt[col], x=df_pvt['score'], color='#4082ed')
    plt.title("Scatterplot with score")

    plt.subplot(1, 3, 2)  
    sns.lineplot(y=df_pvt[col], x=df_pvt['score'], color='#40b9ed')
    plt.title("trend with score")

    plt.subplot(1, 3, 3)  
    sns.histplot(x=df_pvt[col], bins=50, kde=True, color='#40d3ed')
    plt.title(f"Histogram of {col}")

    plt.tight_layout() 
    plt.show()


# XGB model tuned using Optuna

In [None]:
X = df_pvt.drop(['id','score'], axis = 1)
y = df_pvt['score']

In [None]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    params = {
        'objective': 'reg:squarederror',  
        'eval_metric': 'rmse',  
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 10),
        'subsample': trial.suggest_float('subsample', 0.7, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 0.9),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000)
    }

    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)

    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)

    return rmse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)  

best_params = study.best_params
best_rmse = study.best_value

print(f'Best Parameters: {best_params}')
print(f'Best RMSE: {best_rmse}')

final_model = xgb.XGBRegressor(**best_params, random_state=42)
final_model.fit(X_train, y_train)

In [None]:
print('Train')
print(mean_squared_error(y_train,final_model.predict(X_train)))
print('Test')
print(mean_squared_error(y_test,final_model.predict(X_test)))

In [None]:
test_pred = final_model.predict(test.drop('id',axis = 1))
test = test[['id']]
test['score'] = test_pred

test.to_csv('submission.csv', index = False)