Hello dear friends! In this notebook we will train a model for this competition. We will build our features, compare several models on this dataset with cross-validation, and make a prediction.

# Imports

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import matplotlib.transforms as transforms
import seaborn as sns
from tqdm import tqdm
import pickle
from copy import deepcopy


# Loading datasets
Loading test and training datasets

In [None]:
# Read the training logs and the training scores from the Kaggle competition
# input directory, then preview the first rows of the logs.
train_df = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv')
train_scores = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv')
train_df.head()

In [None]:
# Display the training log frame (rendered as the cell output in a notebook).
train_df

Let's look at the unique values of the columns `down_event`, `activity`, and `text_change`.

In [None]:
# Distinct values found in the `down_event` column.
train_df['down_event'].unique()

In [None]:
# Distinct values found in the `activity` column.
train_df['activity'].unique()

In [None]:
# Distinct values found in the `text_change` column.
train_df['text_change'].unique()

In [None]:
# Read the test logs from the same competition input directory and display them.
test_df = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv')
test_df

# Feature generation

Let's start with the most basic features to see how our models perform.

In [None]:
def summary_time(df):
    """Return the total ``action_time`` per ``id``.

    The result has one row per ``id`` (sorted by ``id``) and the sum stored
    in a ``summary_time`` column.
    """
    totals = df.groupby('id')['action_time'].sum()
    return totals.rename('summary_time').reset_index()
def start_pause(df):
    """Return the smallest ``down_time`` per ``id`` as ``start_pause``.

    The result has one row per ``id`` with columns ``id`` and ``start_pause``.
    """
    earliest = df.groupby('id').agg(start_pause=('down_time', 'min'))
    return earliest.reset_index()
def enter_click(df):
    """Count rows whose ``down_event`` is ``'Enter'`` for each ``id``.

    The input frame is left untouched: the boolean mask is grouped directly
    instead of being written into ``df`` as a new column.

    Returns a frame with one row per ``id`` and columns ``id`` and
    ``enter_click``.
    """
    is_enter = df['down_event'] == 'Enter'
    # Summing booleans per group counts the matching rows.
    counts = is_enter.groupby(df['id']).sum()
    return counts.rename('enter_click').reset_index()
def space_click(df):
    """Count rows whose ``down_event`` is ``'Space'`` for each ``id``.

    The input frame is left untouched: the boolean mask is grouped directly
    instead of being written into ``df`` as a new column.

    Returns a frame with one row per ``id`` and columns ``id`` and
    ``space_click``.
    """
    is_space = df['down_event'] == 'Space'
    # Summing booleans per group counts the matching rows.
    counts = is_space.groupby(df['id']).sum()
    return counts.rename('space_click').reset_index()
def backspace_click(df):
    """Count rows whose ``down_event`` is ``'Backspace'`` for each ``id``.

    The input frame is left untouched: the boolean mask is grouped directly
    instead of being written into ``df`` as a new column.

    Returns a frame with one row per ``id`` and columns ``id`` and
    ``backspace_click``.
    """
    is_backspace = df['down_event'] == 'Backspace'
    # Summing booleans per group counts the matching rows.
    counts = is_backspace.groupby(df['id']).sum()
    return counts.rename('backspace_click').reset_index()
def symbol_length(df):
    """Return the largest ``cursor_position`` per ``id`` as ``symbol_length``.

    The result has one row per ``id`` with columns ``id`` and
    ``symbol_length``.
    """
    furthest = df.groupby('id').agg(symbol_length=('cursor_position', 'max'))
    return furthest.reset_index()
def text_length(df):
    """Return the largest ``word_count`` per ``id``.

    The aggregated column keeps its original name, so the result has the
    columns ``id`` and ``word_count``.
    """
    longest = df.groupby('id').agg(word_count=('word_count', 'max'))
    return longest.reset_index()
def nonproduction_feature(df):
    """Return the percentage of rows per ``id`` whose activity is ``'Nonproduction'``.

    Computed as a vectorized group mean of a boolean mask rather than a
    Python lambda applied group by group, which gives the same values
    without per-group Python overhead.

    Returns a frame with one row per ``id`` and columns ``id`` and
    ``nonproduction_feature`` (values in the range 0-100).
    """
    is_nonproduction = df['activity'] == 'Nonproduction'
    # Mean of a boolean mask is the fraction of matching rows.
    share = is_nonproduction.groupby(df['id']).mean() * 100
    return share.rename('nonproduction_feature').reset_index()
def input_feature(df):
    """Return the percentage of rows per ``id`` whose activity is ``'Input'``.

    Computed as a vectorized group mean of a boolean mask rather than a
    Python lambda applied group by group, which gives the same values
    without per-group Python overhead.

    Returns a frame with one row per ``id`` and columns ``id`` and
    ``input_feature`` (values in the range 0-100).
    """
    is_input = df['activity'] == 'Input'
    # Mean of a boolean mask is the fraction of matching rows.
    share = is_input.groupby(df['id']).mean() * 100
    return share.rename('input_feature').reset_index()
def remove_feature(df):
    """Return the percentage of rows per ``id`` whose activity is ``'Remove/Cut'``.

    Computed as a vectorized group mean of a boolean mask rather than a
    Python lambda applied group by group, which gives the same values
    without per-group Python overhead.

    Returns a frame with one row per ``id`` and columns ``id`` and
    ``remove_feature`` (values in the range 0-100).
    """
    is_removal = df['activity'] == 'Remove/Cut'
    # Mean of a boolean mask is the fraction of matching rows.
    share = is_removal.groupby(df['id']).mean() * 100
    return share.rename('remove_feature').reset_index()
def mean_action_time(df):
    """Return the average ``action_time`` per ``id`` as ``mean_action_time``.

    The result has one row per ``id`` with columns ``id`` and
    ``mean_action_time``.
    """
    averages = df.groupby('id')['action_time'].mean()
    return averages.rename('mean_action_time').reset_index()

Create a dataset with our features

In [None]:
def getDataset(train_df):
    """Build the per-``id`` feature table from a log frame.

    Starts from the ``summary_time`` table and outer-joins the output of
    every other feature function on ``id``, one at a time and in a fixed
    order, so the resulting column order is stable.

    Args:
        train_df: Log frame to extract features from. Despite the name, the
            notebook passes both the training and the test logs here.

    Returns:
        A frame with one row per ``id`` and one column per feature.
    """
    extractors = (
        start_pause,
        enter_click,
        space_click,
        backspace_click,
        symbol_length,
        text_length,
        nonproduction_feature,
        input_feature,
        remove_feature,
        mean_action_time,
    )

    dataset = summary_time(train_df)
    for extractor in extractors:
        dataset = dataset.merge(extractor(train_df), on='id', how='outer')
    return dataset

In [None]:
# Build the per-id feature tables for the training logs and the test logs
# with the same feature pipeline, so both frames share the same columns.
df = getDataset(train_df)
test = getDataset(test_df)

In [None]:
# Display the training feature table (rendered as the cell output in a notebook).
df

In [None]:
# Display the training scores frame (rendered as the cell output in a notebook).
train_scores

In [None]:
# Attach the target `score` to the feature table by joining on `id`.
# NOTE(review): how='outer' keeps ids present on only one side, which would
# leave NaN in the features or in `score`; an inner join may be what is
# intended here -- confirm that both frames always share the same ids.
df = pd.merge(df, train_scores, on='id', how='outer')

In [None]:
# Display the feature table with the `score` column attached.
df

In [None]:
import catboost
from catboost import CatBoostRegressor

In [None]:
# Random seed shared by the K-fold splitter, NumPy, and the CatBoost and
# random-forest models.
SEED = 1337

In [None]:
# Features: every column except the target and the identifier.
X = df.drop(columns=['score', 'id'])
# Target: the `score` column.
y = df['score']

In [None]:
# Display the feature matrix (rendered as the cell output in a notebook).
X

# Models

As the title suggests, let's see how the XGBoost, CatBoost, and RandomForest models behave.

In [None]:
# CatBoost regressor with an RMSE loss, seeded with SEED and with training
# output silenced.
cat_boost = CatBoostRegressor(
    iterations=1000,
    depth=6,
    learning_rate=0.1,
    loss_function='RMSE',
    random_seed=SEED,
    verbose=False,
)

# XGBoost regressor with a squared-error objective; every other setting is
# left at the library default.
xgb_boost = xgb.XGBRegressor(objective='reg:squarederror')

# Random forest with 100 trees, seeded with SEED.
rf_model = RandomForestRegressor(random_state=SEED, n_estimators=100)

# Train and cross-validation

Let's create a function that performs cross-validation and plots the per-fold scores.


In [None]:
def fit_and_validation(model, X, y, n_splits=10):
    """Cross-validate ``model`` with shuffled K-fold splits and plot the fold scores.

    Every fold fits a fresh deep copy of the estimator that was passed in,
    so ``model`` itself is never fitted or modified and no fold starts from
    another fold's fitted state. The mean squared error of each fold is
    printed, and a one-row scatter plot shows the fold scores together with
    their mean (vertical line) and standard deviation (red error bar).

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Feature frame; rows are selected positionally with ``.iloc``.
        y: Target series aligned row by row with ``X``.
        n_splits: Number of folds. Defaults to 10.

    Returns:
        None. Results are only printed and plotted.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    np.random.seed(SEED)

    scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(X, y)):

        print(f'Fold #{i}')

        # Copy the original estimator, not the one fitted on the previous fold.
        fold_model = deepcopy(model)
        fold_model.fit(X.iloc[train_index], y.iloc[train_index])

        y_valid = y.iloc[valid_index]
        y_pred = fold_model.predict(X.iloc[valid_index])
        fold_score = mean_squared_error(y_valid, y_pred)
        scores.append(fold_score)

        print(f'Mean Squared Error: {fold_score}')

    mean = np.mean(scores)
    std = np.std(scores)

    fig, ax = plt.subplots(figsize=(10, 3))

    # All fold scores sit on one horizontal line; the y axis carries no information.
    sns.scatterplot(x=scores, y=[0] * len(scores), ax=ax)
    ax.get_yaxis().set_visible(False)

    # x in data coordinates, y in axes coordinates: the label stays at a fixed
    # height whatever the score range is.
    trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)

    ax.axvline(x=mean)
    ax.text(mean + 0.0005, 0.7, f'Mean:{mean:.4f}\nStd:{std:.4f}', transform=trans)
    ax.errorbar(x=mean, y=0, xerr=std, color='r')

    sns.despine(right=True, top=True, left=True, ax=ax)

# Catboost
Results for the CatBoost model

In [None]:
# Cross-validate the CatBoost regressor: prints the per-fold MSE and draws the score plot.
fit_and_validation(cat_boost, X,y)

# XGBoost
Results for the XGB model

In [None]:
# Cross-validate the XGBoost regressor: prints the per-fold MSE and draws the score plot.
fit_and_validation(xgb_boost, X,y)

# RandomForest
Results for the Random Forest model

In [None]:
# Cross-validate the random forest: prints the per-fold MSE and draws the score plot.
fit_and_validation(rf_model, X,y)

In [None]:
# Fit the random forest on the full training set. Cross-validation only fits
# deep copies, so `rf_model` itself has not been fitted before this line.
rf_model.fit(X,y)

# Prediction

Let's make a prediction on a test dataset

In [None]:
# Keep the ids for the submission file, then drop them so that `test` holds
# only feature columns, as `X` does.
ids = test['id']
test = test.drop(columns=['id'])

In [None]:
# Display the test feature matrix (rendered as the cell output in a notebook).
test

In [None]:
# Predict a score for every test row with the fitted random forest and pair
# each prediction with its id.
predictions = rf_model.predict(test)
predictions_df = pd.DataFrame({'id': ids, 'score': predictions})

In [None]:
# Display the predictions frame (rendered as the cell output in a notebook).
predictions_df

In [None]:
# Write the submission file with the `id` and `score` columns only (no index column).
predictions_df.to_csv('submission.csv', index=False)

# Conclusion
As a result of our experiment, we saw that the RandomForest model with these features works better than CatBoost or XGBoost. I hope that this notebook will help you in this competition. I will be glad to see your upvotes. Good luck!