<a href="https://colab.research.google.com/github/yuriao/DataScienceProjects/blob/main/linking_typing_to_score_lgbm_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
tqdm.pandas()

from lightgbm import LGBMRegressor
from sklearn import model_selection, metrics
from sklearn.preprocessing import LabelEncoder

import string
from nltk.corpus import stopwords

import optuna
from lightgbm.callback import log_evaluation, early_stopping

# Introduction

In this notebook, we use LGBMRegressor to predict writing scores from typing-activity logs.
- in this version only numeric typing features are used

In [None]:
# Read the competition inputs: the train/test event logs and the training scores.
train = pd.read_csv(
    '/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv'
)
test = pd.read_csv(
    '/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv'
)
train_score = pd.read_csv(
    '/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv'
)

In [None]:
# Attach each id's score to its log rows.
# - A 'score' column left over from a previous run of this cell is dropped
#   first, so re-running does not create score_x / score_y columns.
# - validate='many_to_one' raises if train_score repeats an id, which would
#   otherwise silently duplicate log rows.
train = (
    train
    .drop(columns='score', errors='ignore')
    .merge(train_score, on='id', validate='many_to_one')
)

In [None]:
# Show the sorted distinct labels that occur in the down_event column.
np.unique(train['down_event'])

array(['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-',
       '.', '/', '0', '1', '2', '5', ':', ';', '<', '=', '>', '?', '@',
       'A', 'Alt', 'AltGraph', 'ArrowDown', 'ArrowLeft', 'ArrowRight',
       'ArrowUp', 'AudioVolumeDown', 'AudioVolumeMute', 'AudioVolumeUp',
       'Backspace', 'C', 'Cancel', 'CapsLock', 'Clear', 'ContextMenu',
       'Control', 'Dead', 'Delete', 'End', 'Enter', 'Escape', 'F', 'F1',
       'F10', 'F11', 'F12', 'F15', 'F2', 'F3', 'F6', 'Home', 'I',
       'Insert', 'Leftclick', 'M', 'MediaPlayPause', 'MediaTrackNext',
       'MediaTrackPrevious', 'Meta', 'Middleclick', 'ModeChange',
       'NumLock', 'OS', 'PageDown', 'PageUp', 'Pause', 'Process',
       'Rightclick', 'S', 'ScrollLock', 'Shift', 'Space', 'T', 'Tab',
       'Unidentified', 'Unknownclick', 'V', '[', '\\', ']', '^', '_', '`',
       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'm', 'n',
       'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{',
       '|'

In [None]:
# Show the sorted distinct labels that occur in the up_event column.
np.unique(train['up_event'])

array(['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-',
       '.', '/', '0', '1', '2', '5', ':', ';', '<', '=', '>', '?', '@',
       'A', 'Alt', 'AltGraph', 'ArrowDown', 'ArrowLeft', 'ArrowRight',
       'ArrowUp', 'AudioVolumeDown', 'AudioVolumeMute', 'AudioVolumeUp',
       'Backspace', 'C', 'Cancel', 'CapsLock', 'Clear', 'ContextMenu',
       'Control', 'Dead', 'Delete', 'End', 'Enter', 'Escape', 'F1', 'F10',
       'F11', 'F12', 'F15', 'F2', 'F3', 'F6', 'Home', 'Insert',
       'Leftclick', 'M', 'MediaPlayPause', 'MediaTrackNext',
       'MediaTrackPrevious', 'Meta', 'Middleclick', 'ModeChange',
       'NumLock', 'OS', 'PageDown', 'PageUp', 'Pause', 'Process',
       'Rightclick', 'S', 'ScrollLock', 'Shift', 'Space', 'T', 'Tab',
       'Unidentified', 'Unknownclick', 'V', '[', '\\', ']', '^', '_', '`',
       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
       '{', '|', '}'

# EDA: Check for NaNs

In [None]:
# Per-column count of missing values, printed for train first and then test.
for frame in (train, test):
    print(frame.isna().sum())

id                 0
event_id           0
down_time          0
up_time            0
action_time        0
activity           0
down_event         0
up_event           0
text_change        0
cursor_position    0
word_count         0
score              0
dtype: int64
id                 0
event_id           0
down_time          0
up_time            0
action_time        0
activity           0
down_event         0
up_event           0
text_change        0
cursor_position    0
word_count         0
dtype: int64


# Preprocessing: encode categorical / verbal features and build the feature and score matrices

In [None]:
# Replace the three label columns of `train` with integer codes.
# One fitted encoder per column is stored, in the same order as the column list.
categorical_cols = ['activity', 'down_event', 'up_event']
labelEncoderList = []
for name in categorical_cols:
    encoder = LabelEncoder().fit(train[name])
    train[name] = encoder.transform(train[name])
    labelEncoderList.append(encoder)

In [None]:
# Target vector: the score column.
all_train_scores = train['score']

# Feature table: everything left after removing the identifiers, the target,
# the text column and the three encoded label columns.
non_feature_cols = ['id', 'event_id', 'activity', 'down_event', 'up_event', 'score', 'text_change']
all_train_features = train.drop(columns=non_feature_cols)

## LGBM model

In [None]:
# LGBM model factory: a single LGBMRegressor configured from a parameter dict.

def model_gen(param):
    """Create an unfitted LGBMRegressor.

    param: dict whose items are forwarded unchanged as keyword arguments to
        the LGBMRegressor constructor.
    Returns the newly constructed regressor.
    """
    return LGBMRegressor(**param)

## Hyperparameter optimization with Optuna

In [None]:
def objective(trial, data=all_train_features, target=all_train_scores):
    """Optuna objective: hold-out RMSE of one sampled LGBMRegressor configuration.

    Parameters
    ----------
    trial : Optuna trial used to sample the hyperparameters.
    data : feature table; defaults to ``all_train_features`` as it exists when
        this function is defined.
    target : target values aligned with ``data``; defaults to ``all_train_scores``.

    Returns
    -------
    RMSE of the fitted model on the 30% hold-out split (lower is better).
    """
    # Fixed random_state: every trial is scored on the same 70/30 split.
    # NOTE(review): the split is row-wise. If several rows belong to the same
    # essay, rows of one essay can land on both sides of the split -- consider
    # a grouped split; confirm against how `data` was built.
    train_x, test_x, train_y, test_y = model_selection.train_test_split(
        data, target, test_size=0.3, random_state=42
    )

    param = {
        'metric': 'rmse',
        'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        # and samples from the same log-uniform range.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.001, 0.01, 0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        # NOTE(review): LightGBM's docs state row bagging is only active when
        # subsample_freq (bagging_freq) > 0, which is not set here -- verify
        # whether tuning `subsample` has any effect.
        'subsample': trial.suggest_categorical('subsample', [0.2, 0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.004, 0.008, 0.01, 0.02, 0.05, .1, 0.2, 0.5]),
        'max_depth': trial.suggest_categorical('max_depth', [10, 20,100, 150]),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
    }

    model = model_gen(param)
    model.fit(train_x, train_y)

    preds = model.predict(test_x)

    # Root of the mean squared error on the hold-out rows.
    rmse = np.sqrt(metrics.mean_squared_error(test_y, preds))

    return rmse

In [None]:

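# The Optuna search below is left disabled; presumably the `best_params` hard-coded in the next cell come from an earlier run of it.
#studies=[]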

#optuna.logging.set_verbosity(optuna.logging.INFO)

#study = optuna.create_study(direction='minimize')
#study.optimize(objective, n_trials=50)
#print(f'Number of finished trials: {len(study.trials)}')
#print(f'Best trial: {study.best_trial.params}')
#print(f'Best score: {study.best_value}')

In [None]:
# Hard-coded LGBMRegressor settings used for the final model.
# The keys match the hyperparameter names sampled in `objective`.
best_params = {
    'n_estimators': 434,
    'reg_alpha': 0.2095518332792688,
    'reg_lambda': 0.26095334549724497,
    'colsample_bytree': 0.3,
    'subsample': 0.2,
    'learning_rate': 0.02,
    'max_depth': 150,
    'min_child_samples': 21,
}

# training

In [None]:
# Build the (still unfitted) final regressor from the hard-coded best parameters.
LGBMModel=model_gen(best_params)

In [None]:
# Fit the final model on the full training feature table (no hold-out here).
print('Fitting Model')
LGBMModel.fit(all_train_features, all_train_scores)


Fitting Model


# generate output

In [None]:
# Keep the ids aside for the submission, then strip the identifier, text and
# label columns from the test log.
test_id = test['id']
unused_test_cols = ['id', 'event_id', 'text_change', 'activity', 'down_event', 'up_event']
test = test.drop(columns=unused_test_cols)


In [None]:
#encodingCols=['activity','down_event','up_event']
#for i in range(0,len(labelEncoderList)):
#    test[encodingCols[i]]=labelEncoderList[i].transform(test[encodingCols[i]])

In [None]:
# Predict one score per test log row.
# The training feature columns are selected explicitly so that (a) the column
# order matches what the model was fitted on and (b) this cell can be re-run
# after 'predictions' and 'id' have already been added to `test`.
feature_cols = list(all_train_features.columns)
predictions = LGBMModel.predict(test[feature_cols])
test['predictions'] = predictions
# Re-attach the ids so the row-level predictions can be aggregated per id.
test['id'] = test_id

In [None]:
# Display the test rows together with their row-level predictions and ids.
test

Unnamed: 0,down_time,up_time,action_time,cursor_position,word_count,predictions,id
0,338433,338518,85,0,0,3.140039,0000aaaa
1,760073,760160,87,1,0,2.895859,0000aaaa
2,711956,712023,67,0,1,2.865882,2222bbbb
3,290502,290548,46,1,1,3.234341,2222bbbb
4,635547,635641,94,0,0,2.928482,4444cccc
5,184996,185052,56,1,1,3.351659,4444cccc


In [None]:
# Build the submission: one score per id, the mean of that id's row-level predictions.
submission = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/sample_submission.csv')

# Align by id instead of by position: a positional assignment is only correct
# when the sample submission rows happen to be in the same (sorted) order that
# groupby produces.
mean_score_by_id = test.groupby('id')['predictions'].mean()
test_pred = list(submission['id'].map(mean_score_by_id))
submission['score'] = test_pred

# Every id in the sample submission must have received a prediction.
assert submission['score'].notna().all(), "some submission ids have no predictions in test"

In [None]:
# Display the final per-id scores before writing them to disk.
submission

Unnamed: 0,id,score
0,0000aaaa,3.017949
1,2222bbbb,3.050112
2,4444cccc,3.14007


In [None]:
# Write the submission to the working directory; index=False leaves the row index out of the file.
submission.to_csv("submission.csv", index=False)

In [None]:
# Completion marker: printed once every cell above has run.
print('done')

done
