In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
import logging
import random
import re
import copy
from tqdm import tqdm

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from scipy.stats import skew

from nltk.tokenize import sent_tokenize, word_tokenize
import optuna
import lightgbm as lgb
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation, Embedding
from tensorflow.keras.models import Sequential

# import torch
# import torch.nn as nn
# from transformers import AutoModelForMaskedLM, AutoTokenizer

# Raise the numpy/pandas display limits so larger arrays and wider frames are printed in full.
np.set_printoptions(threshold=400)
pd.set_option('display.max_rows', 400)
pd.set_option('display.max_columns', 400)

# Silence every warning and suppress log records at ERROR level and below.
warnings.simplefilter('ignore')
logging.disable(logging.ERROR)
# Use seaborn's default theme with the 'Set2' palette for all plots.
sns.set_theme()
sns.set_palette('Set2')

In [None]:
# Raw training keystroke logs; this frame is kept untouched and is the one the essays are rebuilt from.
train_logs_original = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv')
# Working copy that the cleaning and feature-engineering cells modify.
train_logs = train_logs_original.copy()
# Target table; its 'score' column is used as y.
train_scores = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv')
test_logs = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv')
sample_submission = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/sample_submission.csv')
# Preview the first rows of the training log.
train_logs.head(10)

In [None]:
# Run configuration.
SHORT = False  # when True, train_logs is truncated to its first 2,000,000 rows
LOCAL = False  # when False, train-only columns are folded into 'others'/dropped to match test
seed = 42
def_number = 2  # which splitter family to use: 1 = regex-based, 2 = NLTK-tokenizer-based

# Seed every random number generator used by the notebook.
random.seed(seed)
np.random.seed(seed)
# torch.manual_seed(seed)
tf.random.set_seed(seed)

In [None]:
def q1(x):
    """Return the first quartile (25th percentile) of ``x`` via ``x.quantile``."""
    lower_quartile = x.quantile(q=0.25)
    return lower_quartile
def q3(x):
    """Return the third quartile (75th percentile) of ``x`` via ``x.quantile``."""
    upper_quartile = x.quantile(q=0.75)
    return upper_quartile

# Aggregations applied per essay id to the word-level frame.
word_groupby_agg = dict(
    word=['count'],
    word_len=['max', 'mean', 'sum'],
    word_exception=['sum'],
)
# Aggregations applied per essay id to the sentence-level frame.
sentence_groupby_agg = dict(
    sent=['count'],
    sent_len=['min', 'max', 'mean', 'sum'],
    sent_word_count=['min', 'max', 'mean'],  # works better without 'sum'
    sent_exception=['sum'],
)
# Aggregations applied per essay id to the paragraph-level frame.
paragraph_groupby_agg = dict(
    paragraph=['count'],
    paragraph_len=['min', 'max', 'mean', 'sum'],
    paragraph_word_count=['min', 'max', 'mean'],  # works better without 'sum'
    paragraph_sent_count=['max', 'mean', 'sum'],
    paragraph_exception=['sum'],
)

# All three specs merged, in word -> sentence -> paragraph order.
essay_groupby_agg = {**word_groupby_agg, **sentence_groupby_agg, **paragraph_groupby_agg}

# Flat '<column>_<aggregation>' names, in the same order as the aggregated columns.
essay_col_names = [
    f'{column}_{aggregation}'
    for column, aggregations in essay_groupby_agg.items()
    for aggregation in aggregations
]

In [None]:
# Activity values that get their own count column in the activity crosstab.
activity_cols = ['Input', 'Nonproduction', 'Paste', 'Remove/Cut', 'Replace']

# Event names passed to clean_events as the set of values to leave untouched;
# any other single ASCII letter is rewritten to 'q' there.
events_cols = [
    'Leftclick', 'Shift', 'q', 'Space', 'Backspace', '.', ',', 'Enter',
    'ArrowLeft', "'", ';', 'ArrowRight', '-', '?', 'Tab', '"',
    'ArrowUp', 'ArrowDown', 'Rightclick', '=', 'CapsLock', 'Control',
    '/', 'Delete', ':', '[', '$', '(', ')', '+', 'Home', 'End', '\\',
    'Meta', '*', '&', 'AudioVolumeMute', '!', 'Insert',
    'MediaPlayPause', 'NumLock', '%', '>', 'Alt', 'AudioVolumeUp',
    'ContextMenu', 'AudioVolumeDown', '<', 'PageDown', ']',
    'Middleclick', '@', 'F12', '\x96', 'Dead', '{', 'ScrollLock', '¿',
    'Process', '}', 'MediaTrackPrevious', 'MediaTrackNext', 'F3', '^',
    'Unidentified', 'Cancel', '2', '`', '\x9b', '#', '~', 'PageUp',
    'ModeChange', '_', 'Escape', 'F11', 'Unknownclick', 'AltGraph',
    'F10', 'F15', 'Clear', 'OS', 'Ä±', '|', 'â\x80\x93', '0', '1', '5',
    '\x97', 'Ë\x86', '¡', '\x80', 'Â´', 'Å\x9f', 'F2', 'ä', 'F1', 'Pause', 'F6',
    'a', 's', 'z', 'x', 'c', 'v']

# text_change values passed to clean_text_change: rows whose text_change is not
# exactly one of these are rewritten to a listed value they contain.
text_change_cols = [
    'NoChange', 'q', ' ', '.', ',', '\n', "'", ';', '-', '?', '"', '=',
    '/', ':', '[', '$', '(', ')', '+', '\\', '*', '&', '!', '%', '>',
    '<', ']', '@', '\x96', '{', '\x94', '\x93', '¿', '}', '^', '`',
    '\x9b', '#', '~', '_', 'Ä±', '|', 'â\x80\x93', '\x97', 'Ë\x86',
    '¡', '\x80', 'Â´', 'Å\x9f', 'ä', '·',
    '=> q', '=> .', '=> ,', '=>  ', '=> -', '=> \n', '=> ;', '=> ?',
    '=> ]', '=> "', '=> !', '=> =', '=> \\', '=> :', '=> /', '=> (', '=> [', "=> '"
    ]

# Substrings passed to replace_text_change: a text_change containing one of
# these is collapsed to that substring's last character.
replace_text = [
    '=> q', '=> .', '=> ,', '=>  ', '=> -', '=> \n', '=> ;', '=> ?',
    '=> ]', '=> "', '=> !', '=> =', '=> \\', '=> :', '=> /', '=> (', '=> [', "=> '", '=> )',
    ]

# Row offsets at which each id's block of events starts.
# id_length holds the number of log rows per id, in the order returned by
# value_counts(sort=False).
# NOTE(review): the offsets only line up with train_logs if that order matches
# the order in which ids appear in the frame — confirm before using them.
id_length = train_logs['id'].value_counts(sort=False).values
# A running total gives the end offset of every id; dropping the last total and
# prepending 0 turns those into start offsets (one per id).
id_separator = [0] + np.cumsum(id_length)[:-1].tolist()
id_length

In [None]:
# Values that keep their own count column when passed to create_crosstab for
# 'down_event' / 'text_change'; every other value is pooled into an 'others' column.
events_main_cols = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified', 'Control_action']
text_change_main_cols = ['q', ' ', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']

In [None]:
# Per-id aggregations applied to the keystroke log (raw columns plus the
# derived *_diff and idol_time columns).
log_groupby_agg = dict(
    event_id=['max'],
    down_time=['first'],
    up_time=['last'],
    activity=['nunique'],
    down_event=['nunique'],
    text_change=['nunique'],
    action_time=['max', 'mean', 'quantile', 'skew', 'sum'],
    cursor_position=['max', 'mean'],
    word_count=['max'],
    down_time_diff=['max', 'min', 'mean', 'quantile', 'skew'],
    up_time_diff=['max', 'min', 'mean', 'quantile', 'skew'],
    cursor_position_diff=['max', 'mean'],
    word_count_diff=['max', 'mean'],
    idol_time=['max', 'min', 'mean', 'quantile', 'skew', 'sum'],
)

# Flat '<column>_<aggregation>' names, in the same order as the aggregated columns.
log_col_names = [
    f'{column}_{aggregation}'
    for column, aggregations in log_groupby_agg.items()
    for aggregation in aggregations
]

# def

In [None]:
class EssayConstructor:
    """Rebuild essay text by replaying logged editing events in order."""

    def processingInputs(self,currTextInput):
        """Replay one group of events and return the resulting text.

        ``currTextInput`` must expose ``.values`` yielding rows whose first
        three entries are activity, cursor_position and text_change.
        """
        text = ''
        for event in currTextInput.values:
            activity, cursor, change = event[0], event[1], event[2]

            if activity == 'Replace':
                # text_change looks like '<old> => <new>'; the cursor sits
                # right after the newly written part.
                pieces = change.split(' => ')
                old, new = pieces[0], pieces[1]
                start = cursor - len(new)
                text = text[:start] + new + text[start + len(old):]

            elif activity == 'Remove/Cut':
                # The cursor marks where the removed text used to begin.
                text = text[:cursor] + text[cursor + len(change):]

            elif activity != 'Paste' and 'M' in activity:
                # 'Move From [a, b] To [c, d]': drop the 10-character
                # 'Move From ' prefix, then read the four bounds.
                spans = [part.split(', ') for part in activity[10:].split(' To ')]
                src_start = int(spans[0][0][1:])
                src_end = int(spans[0][1][:-1])
                dst_start = int(spans[1][0][1:])
                dst_end = int(spans[1][1][:-1])
                if src_start < dst_start:
                    # Moved towards the end of the essay.
                    text = text[:src_start] + text[src_end:dst_end] + text[src_start:src_end] + text[dst_end:]
                elif src_start > dst_start:
                    # Moved towards the start of the essay.
                    text = text[:dst_start] + text[src_start:src_end] + text[dst_start:src_start] + text[src_end:]
                # Equal start positions: nothing to do.

            else:
                # Paste and plain input: the cursor sits right after the
                # inserted text.
                start = cursor - len(change)
                text = text[:start] + change + text[start:]

        return text


    def getEssays(self,df):
        """Return a DataFrame with columns ['id', 'essay'], one row per id in ``df``.

        'Nonproduction' events are discarded before replaying because they are
        treated as making no change to the text.
        """
        needed = ['id', 'activity', 'cursor_position', 'text_change']
        events = copy.deepcopy(df[needed])
        events = events[events.activity != 'Nonproduction']
        # Register progress_apply on pandas objects.
        tqdm.pandas()
        essays = events.groupby('id')[needed[1:]].progress_apply(self.processingInputs)
        essayFrame = essays.to_frame().reset_index()
        essayFrame.columns = ['id', 'essay']
        return essayFrame

def split_essays_into_words1(df):
    """Explode each essay into one row per word using a regex split.

    Words are the pieces between spaces, newlines, '.', '?' and '!'.
    Adds 'word', 'word_len' and 'word_exception' (1 when the word is a single
    character other than 'q'); empty pieces are removed. The index is not reset.
    """
    words = df.copy()
    words['word'] = words['essay'].map(lambda text: re.split('[ \n.?!]', text))
    words = words.explode('word')
    words['word_len'] = words['word'].map(len)
    words['word_exception'] = words['word'].str.match('^[^q]$').astype(int)
    non_empty = words['word_len'] != 0

    return words[non_empty]

def split_essays_into_sentences1(df):
    """Explode each essay into one row per sentence using a regex split.

    Sentences are the pieces between newlines, '.', '?' and '!', stripped of
    surrounding whitespace. Adds 'sent', 'sent_len' (characters),
    'sent_word_count' (space-separated pieces) and 'sent_exception' (1 when
    the sentence has fewer than 3 words). Empty sentences are removed and the
    index is reset.
    """
    sents = df.copy()
    sents['sent'] = sents['essay'].map(lambda text: re.split('[\n.?!]', text))
    sents = sents.explode('sent')
    sents['sent'] = sents['sent'].map(lambda piece: piece.replace('\n', '').strip())
    sents['sent_len'] = sents['sent'].map(len)
    sents['sent_word_count'] = sents['sent'].map(lambda piece: len(piece.split(' ')))
    sents['sent_exception'] = (sents['sent_word_count'] < 3).astype(int)
    non_empty = sents['sent_len'] != 0

    return sents[non_empty].reset_index(drop=True)

def split_essays_into_paragraphs1(df):
    """Explode each essay into one row per newline-separated paragraph.

    Adds 'paragraph', 'paragraph_len' (characters), 'paragraph_word_count'
    (space-separated pieces after collapsing double spaces),
    'paragraph_sent_count' (pieces between '.', '?' and '!') and
    'paragraph_exception' (1 when the paragraph has fewer than 3 words).
    Empty paragraphs are removed and the index is reset.
    """
    paras = df.copy()
    paras['paragraph'] = paras['essay'].map(lambda text: text.split('\n'))
    paras = paras.explode('paragraph')
    paras['paragraph_len'] = paras['paragraph'].map(len)
    paras['paragraph_word_count'] = paras['paragraph'].map(
        lambda piece: len(piece.replace('  ', ' ').split(' ')))
    paras['paragraph_sent_count'] = paras['paragraph'].map(
        lambda piece: len(re.split('[.?!]', piece)))
    non_empty = paras['paragraph_len'] != 0
    paras = paras[non_empty].reset_index(drop=True)
    paras['paragraph_exception'] = (paras['paragraph_word_count'] < 3).astype(int)

    return paras

def split_essays_into_words2(df):
    """Explode each essay into one row per word using NLTK's ``word_tokenize``.

    Tokens containing 'q.qq' or 'qq.q' (a period inside a word) are split
    again on '.', and every token is then split on '-' and '/'. Single
    characters other than 'q' and empty strings are removed.

    Adds 'word', 'word_len' and 'word_exception' (1 when the word starts with
    a non-word character or '+'). The index is not reset, so rows of the same
    essay share an index label.
    """
    df = df.copy()
    df['word'] = df['essay'].apply(word_tokenize)
    df = df.explode('word')
    # na=False keeps the mask strictly boolean even when explode produced NaN
    # (an essay with no tokens), so it is always treated as a row mask.
    has_inner_period = (
        df['word'].str.contains('q.qq', regex=False, na=False)
        | df['word'].str.contains('qq.q', regex=False, na=False)
    )
    df.loc[has_inner_period, 'word'] = df.loc[has_inner_period, 'word'].apply(lambda x: re.split(r'\.', x))
    df = df.explode('word')
    df = df.replace(np.nan, '')
    df['word'] = df['word'].apply(lambda x: re.split('[-/]', x))
    df = df.explode('word')
    df['word_len'] = df['word'].apply(len)
    # Drop lone punctuation marks / stray characters.
    df = df[~df['word'].str.match('^[^q]$')]
    df['word_exception'] = df['word'].str.match(r'[\W+]').astype(int)
    df = df[df['word'] != '']

    return df

def split_essays_into_sentences2(df):
    """Explode each essay into one row per sentence using ``sent_tokenize``.

    Returns the input columns except 'essay', plus 'sent', 'sent_len'
    (characters), 'sent_word_count' (words found by split_essays_into_words2)
    and 'sent_exception' (1 when the sentence has exactly one word or contains
    no space). Sentences for which no word survives are dropped.
    """
    df = df.copy()
    df['sent'] = df['essay'].apply(sent_tokenize)
    df = df.explode('sent')
    # explode yields NaN for an empty sentence list; turn those into ''
    df = df.replace(np.nan, '')
    # df = df[~df['sent'].str.match('^[^q]$')]
    df['sent_len'] = df['sent'].apply(lambda x: len(x))
    # Reuse the 'essay' column as the text handed to the word splitter
    df['essay'] = df['sent']
    # Running sentence number, used as the key to merge word counts back
    df['num_s'] = range(len(df))
    df_temp = split_essays_into_words2(df[['num_s', 'essay']])
    df_temp = df_temp.rename(columns={'word': 'sent_word_count'})
    # Number of words per sentence
    df_count = pd.DataFrame(df_temp.groupby(['num_s', 'essay'])['sent_word_count'].agg('count')).reset_index()
    # Both sides carry an 'essay' column, hence the essay_x / essay_y suffixes
    df_merged = df.merge(df_count, on='num_s', how='left').drop(['essay_x', 'essay_y', 'num_s'], axis=1)
    # Sentences without any counted word have no match in df_count
    df_merged = df_merged[~df_merged['sent_word_count'].isna()]
    # df['sent_word_count'] = df[['essay']].apply(split_essays_into_words2).apply(lambda x: len(x))
    df_merged['sent_exception'] = ((df_merged['sent_word_count'] == 1)|(~df_merged['sent'].str.contains(' ', regex=False))).astype(int)

    return df_merged

def split_essays_into_paragraphs2(df):
    """Explode each essay into one row per newline-separated paragraph.

    Returns the input columns except 'essay', plus 'paragraph',
    'paragraph_len' (characters), 'paragraph_sent_count' (sentences found by
    split_essays_into_sentences2), 'paragraph_word_count' (words found by
    split_essays_into_words2) and 'paragraph_exception' (1 when the paragraph
    has exactly one word). Paragraphs without any counted word are dropped.
    """
    df = df.copy()
    df['paragraph'] = df['essay'].apply(lambda x: x.split('\n'))
    df = df.explode('paragraph')
    df = df[df['paragraph'] != '']
    df = df.replace(np.nan, '')
    # Running paragraph number, used as the key to merge the counts back
    df['num_p'] = range(len(df))
    df['paragraph_len'] = df['paragraph'].apply(lambda x: len(x))
    # Reuse the 'essay' column as the text handed to the sentence splitter
    df['essay'] = df['paragraph']

    # Sentences per paragraph: one row per sentence, counted per num_p
    df_temp1 = split_essays_into_sentences2(df[['num_p', 'essay']])
    df_temp1 = df_temp1.rename(columns={'sent_word_count': 'paragraph_sent_count'})
    df_count1 = pd.DataFrame(df_temp1.groupby(['num_p'])['paragraph_sent_count'].agg('count')).reset_index()

    # Words per paragraph: split every sentence into words, count per
    # (paragraph, sentence text), then sum the counts per paragraph
    df_temp1['essay'] = df_temp1['sent']
    df_temp2 = split_essays_into_words2(df_temp1[['num_p', 'essay']])
    df_temp2 = df_temp2.rename(columns={'word': 'paragraph_word_count'})
    df_count2 = pd.DataFrame(df_temp2.groupby(['num_p', 'essay'])['paragraph_word_count'].agg('count')).reset_index()
    df_count3 = pd.DataFrame(df_count2.groupby(['num_p'])['paragraph_word_count'].agg('sum')).reset_index()

    df_merged1 = df.merge(df_count1, on='num_p', how='left')
    df_merged2 = df_merged1.merge(df_count3, on='num_p', how='left').drop(['num_p', 'essay'], axis=1)
    # Paragraphs without any counted word have no match in df_count3
    df_merged2 = df_merged2[~df_merged2['paragraph_word_count'].isna()]
    df_merged2['paragraph_exception'] = (df_merged2['paragraph_word_count'] == 1).astype(int)

    return df_merged2

def join_essay_df(df_word, df_sent, df_paragraph):
    """Aggregate the word, sentence and paragraph frames per id and join them.

    Each frame is grouped by 'id' and aggregated with its module-level spec
    (word_groupby_agg, sentence_groupby_agg, paragraph_groupby_agg). The
    sentence and paragraph results are joined onto the word result, and the
    columns are renamed to the flat names in essay_col_names.
    """
    aggregated = [
        frame.groupby('id').agg(spec)
        for frame, spec in (
            (df_word, word_groupby_agg),
            (df_sent, sentence_groupby_agg),
            (df_paragraph, paragraph_groupby_agg),
        )
    ]

    essay_df_all = aggregated[0].join(aggregated[1:])
    essay_df_all.columns = essay_col_names

    return essay_df_all

def create_control_action(df):
    """Relabel the key pressed right after 'Control' as 'Control_action'.

    For every row whose down_event is 'Control', the row that immediately
    follows it is inspected; if its down_event is one of 'a', 's', 'z', 'x',
    'c', 'v' it is replaced by 'Control_action'. ``df`` is modified in place
    and also returned.

    A 'Control' on the very last row has no following row and is skipped
    (previously this made the positional lookup fail).
    """
    # NOTE(review): "next row" is purely positional, so a 'Control' that ends
    # one id's log is paired with the first event of the next id — confirm
    # whether that pairing should be excluded.
    shortcut_keys = ['a', 's', 'z', 'x', 'c', 'v']

    is_control = (df['down_event'] == 'Control').to_numpy()
    next_positions = np.flatnonzero(is_control) + 1
    # Ignore a trailing 'Control' that has nothing after it.
    next_positions = next_positions[next_positions < len(df)]

    followers = df.iloc[next_positions]
    control_action_index = followers.index[followers['down_event'].isin(shortcut_keys).to_numpy()]
    df.loc[control_action_index, 'down_event'] = 'Control_action'
    return df

def clean_activity(df, col_values):
    """Return ``df`` unchanged.

    Every cleaning variant that was tried is commented out below, so this
    function is currently a no-op and ``col_values`` is unused.
    """

    # Variant 1: relabel activities that are not in col_values.
    # activity_df_cleaning = df[~df['activity'].isin(col_values)].copy()
    # # when down_event == Leftclick, change activity to Nonproduction
    # # when down_event matches \w, change activity to Input
    # activity_index_Nonproduction = activity_df_cleaning.query('down_event == "Leftclick"').index
    # df.loc[activity_index_Nonproduction, 'activity'] = 'Nonproduction'
    # activity_index_Input = activity_df_cleaning[activity_df_cleaning['down_event'].str.match('^[A-Za-z0-9_]$')].index
    # # df.loc[activity_index_Input, 'activity'] = 'Input'
    # # df.loc[activity_index_Input, 'activity'] = 'Replace'
    # df.loc[activity_index_Input, 'activity'] = 'Nonproduction'

    # Variant 2: same relabelling, then pool everything except Input / Nonproduction / Remove/Cut into 'PaRe'.
    # activity_df_cleaning = df[~df['activity'].isin(col_values)].copy()
    # # when down_event == Leftclick, change activity to Nonproduction
    # # when down_event matches \w, change activity to Input
    # activity_index_Nonproduction = activity_df_cleaning.query('down_event == "Leftclick"').index
    # activity_index_Input = activity_df_cleaning[activity_df_cleaning['down_event'].str.match('^[A-Za-z0-9_]$')].index
    # df.loc[activity_index_Nonproduction, 'activity'] = 'Nonproduction'
    # # df.loc[activity_index_Input, 'activity'] = 'Input'
    # df.loc[activity_index_Input, 'activity'] = 'Replace'
    # activity_index_PaRe = df[~df['activity'].isin(['Input', 'Nonproduction', 'Remove/Cut'])].index
    # df.loc[activity_index_PaRe, 'activity'] = 'PaRe'

    # Variant 3: pool unknown activities into a 'move' value.
    # # create 'move' col
    # activity_move_index = df[~df['activity'].isin(col_values)].copy().index
    # df.loc[activity_move_index, 'activity'] = 'move'

    # return df['activity'].value_counts(), len(df['activity'].value_counts())

    # Variant 4: change the 'move' activities to Nonproduction.
    # activity_index_move = df[~df['activity'].isin(col_values)].copy().index
    # df.loc[activity_index_move, 'activity'] = 'Nonproduction'
    return df

def clean_events(df, col_values):
    """Rewrite unexpected single-letter key names to 'q'.

    In both 'down_event' and 'up_event', a value that is not listed in
    ``col_values`` and consists of exactly one ASCII letter is replaced by
    'q' (handles characters that were not already rewritten to 'q').
    ``df`` is modified in place and also returned.
    """
    for column in ('down_event', 'up_event'):
        unknown = ~df[column].isin(col_values)
        rows = df[unknown].index
        df.loc[rows, column] = df.loc[rows, column].replace('^[A-Za-z]$', 'q', regex=True)

    return df

def replace_text_change(df, col_values):
    """Collapse replacement-style text_change values to a single character.

    The patterns in ``col_values`` are applied in order: every text_change
    containing the pattern (literal match) is overwritten with the pattern's
    last character. Later patterns see the values written by earlier ones.
    ``df`` is modified in place and also returned.
    """
    for pattern in col_values:
        contains_pattern = df['text_change'].str.contains(pattern, regex=False)
        df.loc[contains_pattern, 'text_change'] = pattern[-1]

    return df

def clean_text_change(df, col_values):
    """Map unexpected text_change values onto a known value they contain.

    Rows whose text_change is not exactly one of ``col_values`` are compared
    against every value in turn (literal substring match) and overwritten
    with it on a match. Matching always uses the original text, so when
    several values match, the one listed last in ``col_values`` wins.
    ``df`` is modified in place and also returned.
    """
    is_unknown = ~df['text_change'].isin(col_values)
    # Snapshot of the original text of the rows to clean; never updated.
    original_text = df.loc[is_unknown, 'text_change'].copy()
    for token in col_values:
        matched_rows = original_text[original_text.str.contains(token, regex=False)].index
        df.loc[matched_rows, 'text_change'] = token

    return df

def create_crosstab(df, col, col_values):
    """Count the occurrences of each ``col`` value per id.

    Values listed in ``col_values`` keep their own column; all remaining
    values are summed into an 'others' column, which is omitted when it would
    be zero for every id. Every column name is prefixed with ``col + '_'``.
    """
    counts = pd.crosstab(df['id'], df[col])
    kept_values = counts.columns[counts.columns.isin(col_values)]
    counts['others'] = counts.drop(kept_values, axis=1).sum(axis=1)

    result = counts[kept_values].join(counts['others'])
    if result['others'].sum() == 0:
        result.drop('others', axis=1, inplace=True)

    return result.add_prefix(col+'_')

def calc_diff(df, col):
    """Add '<col>_diff': the absolute change of ``col`` from the previous row.

    Rows with event_id == 1 get NaN so that no difference is taken against
    the previous row there. ``df`` is modified in place and also returned.
    """
    diff_col = f'{col}_diff'
    df[diff_col] = df[col].diff().abs()

    # Set the first value of each id to NaN.
    first_event_rows = df[df['event_id'] == 1].index
    df.loc[first_event_rows, diff_col] = np.nan

    return df

def plot_training_history(history, metrics):
    """Plot training/validation loss and the requested metrics side by side.

    Parameters
    ----------
    history : object with a ``history`` dict
        Must contain 'loss' and 'val_loss', plus '<metric>' and 'val_<metric>'
        for every entry of ``metrics``.
    metrics : iterable of str
        Metric names to draw on the right-hand panel. When empty, the panel
        stays empty and the loss panel keeps its own title and legend.
    """
    loss = history.history['loss']
    val_loss = history.history['val_loss']

    epochs = range(len(loss))

    # Explicit axes: each panel is addressed directly instead of relying on
    # pyplot's "current axes", which pointed at the loss panel when no metric
    # was drawn.
    fig, (loss_ax, metric_ax) = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: loss
    loss_ax.plot(epochs, loss, label='Training Loss', color="blue")
    loss_ax.plot(epochs, val_loss, label='Validation Loss', color="red")
    loss_ax.set_title('Loss')
    loss_ax.set_xlabel('Epochs')
    loss_ax.legend()

    # Right panel: every requested metric on the same axes
    plotted_any = False
    for metric in metrics:
        train_metric_name = f'Training {metric.capitalize()}'
        val_metric_name = f'Validation {metric.capitalize()}'
        metric_ax.plot(epochs, history.history[metric], label=train_metric_name, color="green")
        metric_ax.plot(epochs, history.history['val_' + metric], label=val_metric_name, color="orange")
        plotted_any = True

    metric_ax.set_title('Metrics')
    metric_ax.set_xlabel('Epochs')
    if plotted_any:
        # A legend on an empty axes has nothing to show.
        metric_ax.legend(loc='upper right')

    fig.tight_layout()
    plt.show()


def make_corr_sorted(df):
    """List every pair of columns once with its Pearson correlation.

    Returns a DataFrame with columns ['var1', 'var2', 'corr'], sorted by
    'corr' in descending order. Each unordered pair appears once (taken from
    below the diagonal of the correlation matrix); pairs whose correlation is
    NaN are left out.
    """
    corr_matrix = df.corr(method='pearson')
    # Select the strictly-lower triangle by position. Deriving the mask from
    # the correlation values themselves would also discard pairs whose
    # correlation is exactly 0.
    below_diagonal = np.tril(np.ones(corr_matrix.shape, dtype=bool), k=-1)
    # dropna() makes the result independent of whether stack() drops NaN itself.
    df_new = corr_matrix.where(below_diagonal).stack().dropna().reset_index()
    df_new.columns = ['var1', 'var2', 'corr']
    df_new = df_new.sort_values('corr', ascending=False)
    return df_new

# Quick-run mode: keep only the first 2,000,000 rows of the working training log.
if SHORT:
    train_logs = train_logs[:2000000]

# Bind the generic splitter names to the chosen implementation family:
# 1 = regex-based splitters, 2 = NLTK-tokenizer-based splitters.
if def_number == 1:
    split_essays_into_words = split_essays_into_words1
    split_essays_into_sentences = split_essays_into_sentences1
    split_essays_into_paragraphs = split_essays_into_paragraphs1
elif def_number == 2:
    split_essays_into_words = split_essays_into_words2
    split_essays_into_sentences = split_essays_into_sentences2
    split_essays_into_paragraphs = split_essays_into_paragraphs2
else:
    # Fail here rather than with a NameError when the splitters are first used.
    raise ValueError(f'def_number must be 1 or 2, got {def_number!r}')

In [None]:
# Rebuild one essay per id by replaying the logs. The training essays come
# from the untouched original log, not from the working copy.
essayConstructor = EssayConstructor()
essay_df_train = essayConstructor.getEssays(train_logs_original)
essay_df_test = essayConstructor.getEssays(test_logs)

In [None]:
# Explode the rebuilt essays into word-, sentence- and paragraph-level frames
# using the splitter family selected by def_number.
essay_df_word_train = split_essays_into_words(essay_df_train)
essay_df_sentence_train = split_essays_into_sentences(essay_df_train)
essay_df_paragraph_train = split_essays_into_paragraphs(essay_df_train)

essay_df_word_test = split_essays_into_words(essay_df_test)
essay_df_sentence_test = split_essays_into_sentences(essay_df_test)
essay_df_paragraph_test = split_essays_into_paragraphs(essay_df_test)

In [None]:
# Bin edges for the length/count features (pd.cut intervals are right-inclusive).
word_len_tick = [0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1000]
sent_len_tick = [0, 20, 40, 60, 80, 100, 150, 200, 10000]
sent_word_count_tick = [0, 4, 8, 12, 16, 20, 24, 28, 32, 10000]
paragraph_len_tick = [0, 50, 100, 200, 300, 400, 100000]
paragraph_sent_count_tick = [0, 1, 2, 3, 4, 1000]
paragraph_word_count_tick = [0, 20, 40, 60, 80, 10000]

# Add the '<feature>_bin' columns to the train frames first, then to the test frames.
for words_df, sents_df, paras_df in (
        (essay_df_word_train, essay_df_sentence_train, essay_df_paragraph_train),
        (essay_df_word_test, essay_df_sentence_test, essay_df_paragraph_test)):
    words_df['word_len_bin'] = pd.cut(words_df['word_len'], word_len_tick)
    sents_df['sent_len_bin'] = pd.cut(sents_df['sent_len'], sent_len_tick)
    sents_df['sent_word_count_bin'] = pd.cut(sents_df['sent_word_count'], sent_word_count_tick)
    paras_df['paragraph_len_bin'] = pd.cut(paras_df['paragraph_len'], paragraph_len_tick)
    paras_df['paragraph_sent_count_bin'] = pd.cut(paras_df['paragraph_sent_count'], paragraph_sent_count_tick)
    paras_df['paragraph_word_count_bin'] = pd.cut(paras_df['paragraph_word_count'], paragraph_word_count_tick)

In [None]:
# Per-id counts of each length/count bin. Each call passes the frame's own
# bin values as col_values, so every bin present in that frame keeps its own column.
word_len_bin_train = create_crosstab(essay_df_word_train, 'word_len_bin', essay_df_word_train['word_len_bin'].values)
sent_len_bin_train = create_crosstab(essay_df_sentence_train, 'sent_len_bin', essay_df_sentence_train['sent_len_bin'].values)
sent_word_count_bin_train = create_crosstab(essay_df_sentence_train, 'sent_word_count_bin', essay_df_sentence_train['sent_word_count_bin'].values)
paragraph_len_bin_train = create_crosstab(essay_df_paragraph_train, 'paragraph_len_bin', essay_df_paragraph_train['paragraph_len_bin'].values)
paragraph_sent_count_bin_train = create_crosstab(essay_df_paragraph_train, 'paragraph_sent_count_bin', essay_df_paragraph_train['paragraph_sent_count_bin'].values)
paragraph_word_count_bin_train = create_crosstab(essay_df_paragraph_train, 'paragraph_word_count_bin', essay_df_paragraph_train['paragraph_word_count_bin'].values)

# Same counts for the test essays.
word_len_bin_test = create_crosstab(essay_df_word_test, 'word_len_bin', essay_df_word_test['word_len_bin'].values)
sent_len_bin_test = create_crosstab(essay_df_sentence_test, 'sent_len_bin', essay_df_sentence_test['sent_len_bin'].values)
sent_word_count_bin_test = create_crosstab(essay_df_sentence_test, 'sent_word_count_bin', essay_df_sentence_test['sent_word_count_bin'].values)
paragraph_len_bin_test = create_crosstab(essay_df_paragraph_test, 'paragraph_len_bin', essay_df_paragraph_test['paragraph_len_bin'].values)
paragraph_sent_count_bin_test = create_crosstab(essay_df_paragraph_test, 'paragraph_sent_count_bin', essay_df_paragraph_test['paragraph_sent_count_bin'].values)
paragraph_word_count_bin_test = create_crosstab(essay_df_paragraph_test, 'paragraph_word_count_bin', essay_df_paragraph_test['paragraph_word_count_bin'].values)


In [None]:
# One row per id with the aggregated word/sentence/paragraph statistics.
essay_df_agg_train = join_essay_df(essay_df_word_train, essay_df_sentence_train, essay_df_paragraph_train)
essay_df_agg_test = join_essay_df(essay_df_word_test, essay_df_sentence_test, essay_df_paragraph_test)

In [None]:
# Columns that must all match for two log rows to count as duplicates
# (event_id, down_time and up_time are deliberately not part of the key).
dedup_keys = ['id', 'action_time', 'activity', 'down_event', 'up_event', 'text_change', 'cursor_position', 'word_count']

# Keep the first row of every duplicate group, in place, and renumber the index.
for logs in (train_logs, test_logs):
    logs.drop_duplicates(subset=dedup_keys, keep='first', inplace=True, ignore_index=True)

In [None]:
# Relabel a/s/z/x/c/v key presses that directly follow a 'Control' press as 'Control_action'.
train_logs = create_control_action(train_logs)
test_logs = create_control_action(test_logs)

In [None]:
# Normalise event names and text_change values. The order matters:
# replace_text_change collapses '=> x' replacements before clean_text_change
# maps the remaining unexpected values.
# train_logs = clean_activity(train_logs, activity_cols)
train_logs = clean_events(train_logs, events_cols)
train_logs = replace_text_change(train_logs, replace_text)
train_logs = clean_text_change(train_logs, text_change_cols)

# test_logs = clean_activity(test_logs, activity_cols)
test_logs = clean_events(test_logs, events_cols)
test_logs = replace_text_change(test_logs, replace_text)
test_logs = clean_text_change(test_logs, text_change_cols)

In [None]:
# Per-id counts of activities, key-down events and text changes; values outside
# the given lists are pooled into an 'others' column.
activity_crosstab_train = create_crosstab(train_logs, 'activity', activity_cols)
down_event_crosstab_train = create_crosstab(train_logs, 'down_event', events_main_cols)
# up_event_crosstab_train = create_crosstab(train_logs, 'up_event', events_main_cols)
text_change_crosstab_train = create_crosstab(train_logs, 'text_change', text_change_main_cols)

activity_crosstab_test = create_crosstab(test_logs, 'activity', activity_cols)
down_event_crosstab_test = create_crosstab(test_logs, 'down_event', events_main_cols)
# up_event_crosstab_test = create_crosstab(train_logs, 'up_event', events_main_cols)
text_change_crosstab_test = create_crosstab(test_logs, 'text_change', text_change_main_cols)

In [None]:
# Bin edges for the idle time between consecutive events; negative edges are
# included because the gap can be negative.
idol_time_tick = [-10000000, -200, -50, -0.01, 0.01, 50, 100, 150, 200, 300, 500, 5000, 50000, 100000000]

def calc_idol_time(df):
    """Add the idle time before each event and its bin.

    'idol_time' is the event's down_time minus the previous row's up_time;
    rows with event_id == 1 get NaN because they have no previous event to
    compare against. 'idol_time_bin' is 'idol_time' cut into the intervals
    given by idol_time_tick. Returns a modified copy; ``df`` is left untouched.

    The previous up_time is read from the 'up_time' column by name, so the
    result does not depend on the order or dtypes of the other columns.
    NOTE(review): the earlier implementation shifted the whole frame one
    column to the left to read the column placed right after 'down_time';
    this assumes that column was 'up_time' — confirm against the log schema.
    """
    df = df.copy()
    previous_up_time = df['up_time'].shift(1)
    df['idol_time'] = df['down_time'] - previous_up_time
    df.loc[df['event_id'] == 1, 'idol_time'] = np.nan
    df['idol_time_bin'] = pd.cut(df['idol_time'], idol_time_tick)
    return df

# Add 'idol_time' and 'idol_time_bin' to both logs.
train_logs = calc_idol_time(train_logs)
test_logs = calc_idol_time(test_logs)

In [None]:
# Per-id counts of each idle-time bin.
idol_time_bin_train = create_crosstab(train_logs, 'idol_time_bin', train_logs['idol_time_bin'].values)
# NOTE(review): the bins to keep for test are taken from train_logs, unlike
# the other *_bin_test crosstabs, which use the test frame's own values.
# This pools any test bin unseen in train into 'others' — confirm it is intended.
idol_time_bin_test = create_crosstab(test_logs, 'idol_time_bin', train_logs['idol_time_bin'].values)

In [None]:
# Add the absolute row-to-row change ('<col>_diff') of each listed column,
# first to the training log and then to the test log.
diff_source_cols = ('down_time', 'up_time', 'cursor_position', 'word_count')

for diff_source in diff_source_cols:
    train_logs = calc_diff(train_logs, diff_source)

for diff_source in diff_source_cols:
    test_logs = calc_diff(test_logs, diff_source)

In [None]:
# Aggregate the logs per id, then attach every per-id count/statistic frame.
# DataFrame.join keeps the ids of the aggregated log frame (left join on the index).
train = train_logs.groupby('id').agg(log_groupby_agg)
test = test_logs.groupby('id').agg(log_groupby_agg)

train = train.join([
    activity_crosstab_train, down_event_crosstab_train, text_change_crosstab_train, essay_df_agg_train,
    word_len_bin_train, sent_len_bin_train, sent_word_count_bin_train, paragraph_len_bin_train, paragraph_sent_count_bin_train, paragraph_word_count_bin_train,
    idol_time_bin_train])
test = test.join([
    activity_crosstab_test, down_event_crosstab_test, text_change_crosstab_test, essay_df_agg_test,
    word_len_bin_test, sent_len_bin_test, sent_word_count_bin_test, paragraph_len_bin_test, paragraph_sent_count_bin_test, paragraph_word_count_bin_test,
    idol_time_bin_test])

In [None]:
# Rename the first len(log_col_names) columns (the log aggregations) to their
# flat '<column>_<aggregation>' names and keep the labels of the joined columns
# that follow them.
log_col_names_train = log_col_names.copy()
log_col_names_test = log_col_names.copy()
log_col_names_train.extend(train.iloc[:, len(log_col_names):].columns.tolist())
log_col_names_test.extend(test.iloc[:, len(log_col_names):].columns.tolist())

train.columns = log_col_names_train
test.columns = log_col_names_test

In [None]:
# Make train's columns match test's when running against the real test set.
if not LOCAL:
    # Columns that exist in train but not in test.
    columns_diff = set(train.columns).difference(set(test.columns))
    down_event_start = [s for s in columns_diff if s.startswith('down_event')]
    text_change_start = [s for s in columns_diff if s.startswith('text_change')]

    # Fold the counts of the train-only down_event / text_change columns into
    # the matching 'others' column so they are not lost.
    # NOTE(review): this raises KeyError if train has no '..._others' column
    # (create_crosstab omits it when it is all zero) — confirm it always exists.
    train['down_event_others'] += train[down_event_start].sum(axis=1)
    train['text_change_others'] += train[text_change_start].sum(axis=1)

    # Drop every train-only column, including those with other prefixes.
    train.drop(columns_diff, axis=1, inplace=True)
    # True when both frames have the same number of columns; columns that
    # exist only in test are not handled here.
    print(train.shape[1] == test.shape[1])

In [None]:
def make_features(df):
    """Return a copy of ``df`` extended with elapsed-time and rate columns.

    Adds ``total_time`` (``up_time_last - down_time_first``) and three
    ``*_per_s`` columns computed as ``<source> * 1000 / total_time`` from
    ``event_id_max``, ``word_count`` and ``word_len_sum``. The frame passed in
    is left untouched.

    NOTE(review): the ``* 1000`` factor suggests the time columns are in
    milliseconds — confirm against the raw logs.
    """
    out = df.assign(total_time=df['up_time_last'] - df['down_time_first'])

    rate_sources = (
        ('event_count_per_s', 'event_id_max'),
        ('word_count_per_s', 'word_count'),
        ('word_len_per_s', 'word_len_sum'),
    )
    for rate_col, source_col in rate_sources:
        out[rate_col] = out[source_col] * 1000 / out['total_time']

    # Disabled feature experiments, kept for reference:
    # out['action_time_rate'] = out['action_time_sum'] / out['total_time']
    # out['word_count_per_i'] = out['word_count'] / out['event_id_max']
    # out['word_len_per_i'] = out['word_len_sum'] / out['event_id_max']
    # out['word_len_per_active'] = out['word_len_sum']*1000 / (out['total_time'] - out['idol_time_sum'])
    # out['word_count_per_active'] = out['word_count']*1000 / (out['total_time'] - out['idol_time_sum'])
    # out['event_count_per_active'] = out['event_id_max']*1000 / (out['total_time'] - out['idol_time_sum'])
    return out

In [None]:
# Add the derived time/rate columns to both splits, then display train.
train, test = make_features(train), make_features(test)
train

In [None]:
# drop_cols = []
# # drop_cols = ['activity_nunique', 'activity_others']
# for col in drop_cols:
#     if col in train.columns:
#         train = train.drop(col, axis=1)

# for col in drop_cols:
#     if col in test.columns:
#         test = test.drop(col, axis=1)

# train

In [None]:
# Model inputs: copies of the feature frames whose column labels are replaced
# by their positions 0..n-1.
X = train.copy()
X.columns = range(X.shape[1])

X_test = test.copy()
X_test.columns = range(X_test.shape[1])

X

In [None]:
# Target: the 'score' column, kept as a one-column DataFrame.
# NOTE(review): the CV loops pair X and y purely by position (X.iloc / y.iloc),
# while X comes from groupby('id') — confirm train_scores is in the same id order.
y = train_scores[['score']]

# lightgbm

In [None]:
# def objective(trial):
#     params = {
#     'objective': 'regression',
#     'boosting': 'gbdt',
#     'num_iterations': 10000,
#     # 'early_stopping_rounds': 100,
#     # 'learning_rate': 0.01,
#     'num_threads': -1,
#     'seed': seed,
#     'metric': 'rmse',
#     # 'categorical_feature': 0,
#     'feature_pre_filter': False,
#     'verbose': -1,
#     'learning_rate': trial.suggest_uniform('learning_rate', 0.001, 0.01),
#     'max_depth': trial.suggest_int('max_depth', 3, 8),
#     'num_leaves': trial.suggest_int('num_leaves', 4, 63),
#     'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 40),
#     'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
#     'bagging_freq': trial.suggest_int('bagging_freq', 1, 8),
#     'feature_fraction': trial.suggest_float('feature_fraction', 0.2, 1.0),
#     'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 1.0, log=True),
#     'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 1.0, log=True),
#     }

#     kf = KFold(n_splits=10, shuffle=True, random_state=seed)
# # skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

#     scores = []
#     # sub_lgb = []
#     for tr_idx, va_idx in tqdm(kf.split(X)):
#     # for tr_idx, va_idx in tqdm(skf.split(X, (y*2).astype('int'))):
#         X_train, y_train = X.iloc[tr_idx], y.iloc[tr_idx]
#         X_valid, y_valid = X.iloc[va_idx], y.iloc[va_idx]

#         lgb_train = lgb.Dataset(X_train, y_train)
#         lgb_eval = lgb.Dataset(X_valid, y_valid)

#         model_lgb = lgb.train(
#             params=params,
#             train_set=lgb_train,
#             valid_sets=[lgb_train, lgb_eval],
#             valid_names=['train', 'valid'],
#             callbacks=[
#                 lgb.early_stopping(stopping_rounds=200, verbose=False),
#                 lgb.log_evaluation(0)],
#             )

#         pred_lgb = model_lgb.predict(X_valid)
#         score = mean_squared_error(y_valid, pred_lgb, squared=False)
#         # print(score)

#         scores.append(score)
#         # if not LOCAL:
#         #     pred_lgb_sub = model_lgb.predict(X_test)
#         #     sub_lgb.append(pred_lgb_sub)

#     return np.mean(scores)

In [None]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=50)
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# optuna.logging.disable_default_handler()
# optuna.logging.disable_propagation()

# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

In [None]:
# Tuned LightGBM values (presumably the best trial of the Optuna study that is
# commented out above).
best_params = {
    'learning_rate': 0.00891857387628513,
    'max_depth': 3,
    'num_leaves': 11,
    'min_data_in_leaf': 22,
    'bagging_fraction': 0.9021932556004005,
    'bagging_freq': 6,
    'feature_fraction': 0.24419777482809613,
    'lambda_l1': 8.665616840041069e-05,
    'lambda_l2': 0.00018724504484036127,
}

# Fixed settings first; the tuned values are merged in afterwards.
params = dict(
    objective='regression',
    boosting='gbdt',
    num_iterations=10000,
    num_threads=-1,
    seed=seed,
    metric='rmse',
    feature_pre_filter=False,
    verbose=-1,
)
params.update(best_params)

In [None]:
# 10-fold cross-validation of the LightGBM regressor. Each fold's RMSE is
# printed; outside LOCAL mode each fold model also predicts the test set.
kf = KFold(n_splits=10, shuffle=True, random_state=seed)

scores, sub_lgb = [], []
for tr_idx, va_idx in tqdm(kf.split(X)):
    X_train, X_valid = X.iloc[tr_idx], X.iloc[va_idx]
    y_train, y_valid = y.iloc[tr_idx], y.iloc[va_idx]

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid)

    # Stop once 200 rounds pass without improvement; no per-round log output.
    fold_callbacks = [
        lgb.early_stopping(stopping_rounds=200, verbose=False),
        lgb.log_evaluation(0),
    ]
    model_lgb = lgb.train(
        params=params,
        train_set=lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        valid_names=['train', 'valid'],
        callbacks=fold_callbacks,
    )

    pred_lgb = model_lgb.predict(X_valid)
    score = mean_squared_error(y_valid, pred_lgb, squared=False)
    print(score)
    scores.append(score)

    if LOCAL:
        continue
    pred_lgb_sub = model_lgb.predict(X_test)
    sub_lgb.append(pred_lgb_sub)

np.mean(scores)

In [None]:
# Average the per-fold test predictions and keep only id + score.
submission_lgb = (
    test.assign(score=np.mean(sub_lgb, axis=0))
    .reset_index()[['id', 'score']]
)
submission_lgb

In [None]:
# submission_lgb.to_csv('submission.csv', index=False)

# xgboost

In [None]:
# def objective_xgb(trial):
#     params = {
#     'objective': 'reg:squarederror',
#     'booster': 'gbtree',
#     'num_round': 10000,
#     'early_stopping_rounds': 200,
#     'n_jobs': -1,
#     'seed': seed,
#     'eval_metric': 'rmse',
#     'use_label_encoder': False,
#     'verbosity': 0,
#     'gamma':trial.suggest_uniform('gamma', 0, 1),
#     'eta': trial.suggest_uniform('eta', 0.001, 0.01),
#     'max_depth': trial.suggest_int('max_depth', 3, 8),
#     # 'min_child_weight': 1,
#     'subsample': trial.suggest_float('subsample', 0.1, 1.0),
#     'sampling_method': 'uniform',
#     'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
#     'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
#     'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
#     }

#     kf = KFold(n_splits=10, shuffle=True, random_state=seed)
# # skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

#     scores = []
#     # sub_lgb = []
#     for tr_idx, va_idx in tqdm(kf.split(X)):
#     # for tr_idx, va_idx in tqdm(skf.split(X, (y*2).astype('int'))):
#         X_train, y_train = X.iloc[tr_idx], y.iloc[tr_idx]
#         X_valid, y_valid = X.iloc[va_idx], y.iloc[va_idx]

#         model_xgb = xgb.XGBRegressor(**params, n_estimators=10000)
#         model_xgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], verbose=False)

#         pred_xgb = model_xgb.predict(X_valid)
#         score = mean_squared_error(y_valid, pred_xgb, squared=False)
#         # print(score)

#         scores.append(score)
#         # if not LOCAL:
#         #     pred_lgb_sub = model_lgb.predict(X_test)
#         #     sub_lgb.append(pred_lgb_sub)

#     return np.mean(scores)

In [None]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective_xgb, n_trials=50)
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# optuna.logging.disable_default_handler()
# optuna.logging.disable_propagation()

# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

In [None]:
# Tuned XGBoost values (presumably the best trial of the Optuna study that is
# commented out above).
best_params_xgb = {
    'gamma': 0.6900843271687115,
    'eta': 0.00802151967556311,
    'max_depth': 3,
    'subsample': 0.4162718899558869,
    'colsample_bytree': 0.7273493920096383,
    'reg_lambda': 0.0004613454595484141,
    'reg_alpha': 0.38034803433101344,
}

# Fixed settings first; the tuned values are merged in afterwards.
params_xgb = dict(
    objective='reg:squarederror',
    booster='gbtree',
    num_round=10000,
    early_stopping_rounds=200,
    n_jobs=-1,
    seed=seed,
    eval_metric='rmse',
    use_label_encoder=False,
    verbosity=0,
    sampling_method='uniform',
)
params_xgb.update(best_params_xgb)

In [None]:
# 10-fold cross-validation of the XGBoost regressor. Each fold's RMSE is
# printed; outside LOCAL mode each fold model also predicts the test set.
kf = KFold(n_splits=10, shuffle=True, random_state=seed)
scores, sub_xgb = [], []

for tr_idx, va_idx in tqdm(kf.split(X)):
    X_train, X_valid = X.iloc[tr_idx], X.iloc[va_idx]
    y_train, y_valid = y.iloc[tr_idx], y.iloc[va_idx]

    model_xgb = xgb.XGBRegressor(**params_xgb, n_estimators=10000)
    model_xgb.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=False,
    )

    pred_xgb = model_xgb.predict(X_valid)
    score = mean_squared_error(y_valid, pred_xgb, squared=False)
    print(score)
    scores.append(score)

    if LOCAL:
        continue
    pred_xgb_sub = model_xgb.predict(X_test)
    sub_xgb.append(pred_xgb_sub)

np.mean(scores)

In [None]:
# Average the per-fold test predictions and keep only id + score.
submission_xgb = (
    test.assign(score=np.mean(sub_xgb, axis=0))
    .reset_index()[['id', 'score']]
)
submission_xgb

In [None]:
# submission = submission_lgb.copy()
# submission['score'] = submission_lgb['score']*0.5 + submission_xgb['score']*0.5
# submission

In [None]:
# submission.to_csv('submission.csv', index=False)

In [None]:
# One-hot encode the score as a dense array for the softmax network;
# drop='first' removes the indicator column of the first category.
encoder = OneHotEncoder(drop='first', sparse=False)
y_onehot = pd.DataFrame(encoder.fit_transform(y[['score']]))
y_onehot

In [None]:
# Rescale the score to [-1, 1], the output range of the softsign / tanh heads
# used by the networks below.
scaler = MinMaxScaler(feature_range=(-1, 1))
y_scaler = pd.DataFrame(
    scaler.fit_transform(y[['score']]),
    columns=['score'],
)
y_scaler

# optuna softsign

In [None]:
# EPOCHS = 64
# BATCH = 32
# PATIENCE = 8

# def objective(trial):
#     scores = []
#     sub_softsign = []
#     histories =[]

#     early_stopping = EarlyStopping(monitor='val_loss', patience=PATIENCE, verbose=0, mode='auto')
#     kf = KFold(n_splits=10, shuffle=True, random_state=seed)
#     # skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

#     for tr_idx, va_idx in tqdm(kf.split(X)):
#     # for tr_idx, va_idx in tqdm(skf.split(X, y)):

#         X_train, y_train = X.iloc[tr_idx].copy(), y_scaler.iloc[tr_idx].copy()
#         X_valid, y_valid = X.iloc[va_idx].copy(), y_scaler.iloc[va_idx].copy()


#         ### optuna
#         n_layers = trial.suggest_int("n_layers", 2, 5)
#         activation = trial.suggest_categorical("activation", ['tanh', 'softsign'])

#         model_softsign = Sequential()
#         model_softsign.add(BatchNormalization())


#         for i in range(n_layers):

#             num_hidden = int(trial.suggest_loguniform(f"n_units_l{i}", 4, 512))
#             model_softsign.add(Dense(num_hidden, activation="relu"))

#             dropout = trial.suggest_uniform(f"dropout_l{i}", 0.1, 0.5)
#             model_softsign.add(Dropout(rate=dropout))

#         model_softsign.add(Dense(1, activation=activation))

#         model_softsign.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(lr=0.001), metrics=['mse'])
#         ######

# #         model_softsign = Sequential([
# #             BatchNormalization(),
# #             Dense(32, activation='relu'),
# #     #         Dropout(0.1),
# #     #         Dense(16, activation="relu"),
# #     #         BatchNormalization(),
# #             Dropout(0.2),
# #             Dense(8, activation='relu'),
# #             Dropout(0.2),
# #             Dense(1, activation='softsign')
# #         ])

# #         model_softsign.compile(
# #                         loss='mse',
# #                         optimizer=tf.keras.optimizers.Adam(lr=0.001),
# #                         metrics='mse')



#         history = model_softsign.fit(X_train, y_train,
#                                 epochs=EPOCHS,
#                                 validation_data=(X_valid, y_valid),
#                                 batch_size=BATCH,
#                                 verbose=0,
#                                 callbacks=[early_stopping])

#         histories.append(history)

#         pred_softsign = model_softsign.predict(X_valid)
# #         pred_softsign_sub = model_softsign.predict(X_test)

#         y_valid = scaler.inverse_transform(y_valid)
#         pred_softsign = scaler.inverse_transform(pred_softsign)
# #         pred_softsign_sub = scaler.inverse_transform(pred_softsign_sub)

#         score = mean_squared_error(y_valid, pred_softsign, squared=False)
# #         print(score)

#         scores.append(score)
# #         sub_softsign.append(pred_softsign_sub)

#     return np.mean(scores)

In [None]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=20)
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# optuna.logging.disable_default_handler()
# optuna.logging.disable_propagation()

# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

# softsign

In [None]:
# Best_trial = {'n_layers': 3,
#               'n_units_l0': 11.769698793553676,
#               'dropout_l0': 0.2561915202344865,
#               'n_units_l1': 34.242333905947795,
#               'dropout_l1': 0.158010956156462,
#               'n_units_l2': 20.898390415277618,
#               'dropout_l2': 0.3905351042071158} # e256, p32, b32

# Best_trial = {'n_layers': 2,
#               'activation': 'softsign',
#               'n_units_l0': 43.486640467445106,
#               'dropout_l0': 0.38013463195135017,
#               'n_units_l1': 7.108630583705047,
#               'dropout_l1': 0.31511780144407464} # e32, p8, b32

In [None]:
# Replace missing test features with 0 before they are fed to the network.
X_test = X_test.fillna(0)

# Train the softsign-head network with 10-fold CV, repeated five times.
# KFold is created with the same random_state in every repeat, so the splits
# are identical; differences between repeats come from the network training
# itself (unless TensorFlow is seeded elsewhere in the notebook).
sub_softsign_mean5 = []
for i in range(5):
    scores = []
    sub_softsign = []
    histories_softsign =[]

    early_stopping = EarlyStopping(monitor='val_loss', patience=8, verbose=0, mode='auto')
    kf = KFold(n_splits=10, shuffle=True, random_state=seed)
    # skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

    for tr_idx, va_idx in tqdm(kf.split(X)):
    # for tr_idx, va_idx in tqdm(skf.split(X, y)):

        # Targets are the [-1, 1]-scaled scores so they match the softsign range.
        X_train, y_train = X.iloc[tr_idx].copy(), y_scaler.iloc[tr_idx].copy()
        X_valid, y_valid = X.iloc[va_idx].copy(), y_scaler.iloc[va_idx].copy()

        model_softsign = Sequential([
            BatchNormalization(),
            Dense(43, activation='relu'),
            Dropout(0.38013463195135017),
            Dense(7, activation='relu'),
            Dropout(0.31511780144407464),
            Dense(1, activation='softsign')
        ])

        # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
        # optimizers ignore or reject; `metrics` is passed as a list as the
        # compile() API documents.
        model_softsign.compile(
                        loss='mse',
                        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                        metrics=['mse'])

        history = model_softsign.fit(X_train, y_train,
                                epochs=64,
                                validation_data=(X_valid, y_valid),
                                batch_size=32,
                                verbose=0,
                                callbacks=[early_stopping])

        histories_softsign.append(history)

        pred_softsign = model_softsign.predict(X_valid)
        pred_softsign_sub = model_softsign.predict(X_test)

        # Map targets and predictions back to the original score scale so the
        # RMSE is comparable with the tree models.
        y_valid = scaler.inverse_transform(y_valid)
        pred_softsign = scaler.inverse_transform(pred_softsign)
        pred_softsign_sub = scaler.inverse_transform(pred_softsign_sub)

        score = mean_squared_error(y_valid, pred_softsign, squared=False)
        print(score)

        scores.append(score)
        sub_softsign.append(pred_softsign_sub)

    # One averaged test prediction per repeat.
    sub_softsign_mean5.append(np.mean(sub_softsign, axis=0))
    print(np.mean(scores))

In [None]:
# for his in histories_softsign:
#     plot_training_history(his, ['mse'])

In [None]:
# Average the five repeat-level test predictions and keep only id + score.
submission_softsign = (
    test.assign(score=np.mean(sub_softsign_mean5, axis=0))
    .reset_index()[['id', 'score']]
)
submission_softsign

In [None]:
# submission = submission_lgb.copy()
# submission['score'] = submission_lgb['score']*0.35 + submission_xgb['score']*0.35 + submission_softsign['score']*0.3
# submission

In [None]:
# submission.to_csv('submission.csv', index=False)

# tanh

In [None]:
# Best_trial = {'n_layers': 2,
#               'activation': 'tanh',
#               'n_units_l0': 75.4473985895891,
#               'dropout_l0': 0.48929099397385145,
#               'n_units_l1': 14.171146453451623,
#               'dropout_l1': 0.10736623767622827} # e64, p8, b32

In [None]:
# Replace missing test features with 0 before they are fed to the network.
X_test = X_test.fillna(0)

# Train the tanh-head network with 10-fold CV, repeated five times.
# KFold is created with the same random_state in every repeat, so the splits
# are identical; differences between repeats come from the network training
# itself (unless TensorFlow is seeded elsewhere in the notebook).
sub_tanh_mean5 = []
for i in range(5):
    scores = []
    sub_tanh = []
    histories_tanh =[]

    early_stopping = EarlyStopping(monitor='val_loss', patience=8, verbose=0, mode='auto')
    kf = KFold(n_splits=10, shuffle=True, random_state=seed)
    # skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

    for tr_idx, va_idx in tqdm(kf.split(X)):
    # for tr_idx, va_idx in tqdm(skf.split(X, y)):

        # Targets are the [-1, 1]-scaled scores so they match the tanh range.
        X_train, y_train = X.iloc[tr_idx].copy(), y_scaler.iloc[tr_idx].copy()
        X_valid, y_valid = X.iloc[va_idx].copy(), y_scaler.iloc[va_idx].copy()

        model_tanh = Sequential([
            BatchNormalization(),
            Dense(75, activation='relu'),
            Dropout(0.48929099397385145),
            Dense(14, activation='relu'),
            Dropout(0.10736623767622827),
            Dense(1, activation='tanh')
        ])

        # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
        # optimizers ignore or reject; `metrics` is passed as a list as the
        # compile() API documents.
        model_tanh.compile(
                        loss='mse',
                        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                        metrics=['mse'])

        history = model_tanh.fit(X_train, y_train,
                                epochs=64,
                                validation_data=(X_valid, y_valid),
                                batch_size=32,
                                verbose=0,
                                callbacks=[early_stopping])

        histories_tanh.append(history)

        pred_tanh = model_tanh.predict(X_valid)
        pred_tanh_sub = model_tanh.predict(X_test)

        # Map targets and predictions back to the original score scale so the
        # RMSE is comparable with the tree models.
        y_valid = scaler.inverse_transform(y_valid)
        pred_tanh = scaler.inverse_transform(pred_tanh)
        pred_tanh_sub = scaler.inverse_transform(pred_tanh_sub)

        score = mean_squared_error(y_valid, pred_tanh, squared=False)
        print(score)

        scores.append(score)
        sub_tanh.append(pred_tanh_sub)

    # Average this repeat's tanh fold predictions. The original averaged
    # sub_softsign here, which stored the softsign model's predictions instead.
    sub_tanh_mean5.append(np.mean(sub_tanh, axis=0))
    print(np.mean(scores))

In [None]:
# submission_tanh = test.copy()
# submission_tanh['score'] = np.mean(sub_tanh_mean5, axis=0)
# submission_tanh = submission_tanh.reset_index()[['id', 'score']]
# submission_tanh

In [None]:
# submission_tanh.to_csv('submission.csv', index=False)

# optuna softmax

In [None]:
# def objective(trial):
#     scores = []
#     sub_softmax = []
#     histories =[]

#     early_stopping = EarlyStopping(monitor='val_loss', patience=8, verbose=0, mode='auto')
#     kf = KFold(n_splits=5, shuffle=True, random_state=seed)
#     # skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

#     for tr_idx, va_idx in tqdm(kf.split(X)):
#     # for tr_idx, va_idx in tqdm(skf.split(X, y)):

#         X_train, y_train = X.iloc[tr_idx].copy(), y_onehot.iloc[tr_idx].copy()
#         X_valid, y_valid = X.iloc[va_idx].copy(), y_onehot.iloc[va_idx].copy()

    
#         ### optuna
#         n_layers = trial.suggest_int("n_layers", 2, 4)
#         model_softmax = Sequential()
#         model_softmax.add(BatchNormalization())
        
#         for i in range(n_layers):
            
#             num_hidden = int(trial.suggest_loguniform(f"n_units_l{i}", 4, 512))
#             model_softmax.add(Dense(num_hidden, activation="relu"))

#             dropout = trial.suggest_uniform(f"dropout_l{i}", 0.1, 0.5)
#             model_softmax.add(Dropout(rate=dropout))

#         model_softmax.add(Dense(11, activation='softmax'))

#         model_softmax.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(lr=0.001), metrics='accuracy')
#         ######
        
# #     model_softmax = Sequential([
# #         BatchNormalization(),
# #         Dense(32, activation='relu'),
# #         Dropout(0.2),
# #         Dense(8, activation='relu'),
# #         Dropout(0.2),
# #         Dense(11, activation='softmax')
# #     ])

# #     model_softmax.compile(
# #                     loss='categorical_crossentropy',
# #                     optimizer=tf.keras.optimizers.Adam(lr=0.001),
# #                     metrics='accuracy')

        
#         history = model_softmax.fit(X_train, y_train,
#                                 epochs=64,
#                                 validation_data=(X_valid, y_valid),
#                                 batch_size=32,
#                                 verbose=0,
#                                 callbacks=[early_stopping])

#         histories.append(history)

#         pred_softmax = model_softmax.predict(X_valid)
# #         pred_softmax_sub = model_softmax.predict(X_test)

#         y_valid = scaler.inverse_transform(y_valid)
#         pred_softmax = scaler.inverse_transform(pred_softmax)
# #         pred_softmax_sub = scaler.inverse_transform(pred_softmax_sub)

#         score = mean_squared_error(y_valid, pred_softmax, squared=False)
# #         print(score)

#         scores.append(score)
# #         sub_softmax.append(pred_softmax_sub)

#     return np.mean(scores)

In [None]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=20)
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# optuna.logging.disable_default_handler()
# optuna.logging.disable_propagation()

# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

# **SoftMax**

In [None]:
# Best_trial = {'n_layers': 5, 
#               'n_units_l0': 7.267761804906096, 
#               'dropout_l0': 0.13836509283125703, 
#               'n_units_l1': 156.0726719057795, 
#               'dropout_l1': 0.19971336964529038, 
#               'n_units_l2': 33.02335979211516, 
#               'dropout_l2': 0.18671918564674286, 
#               'n_units_l3': 176.08582928249305, 
#               'dropout_l3': 0.10048457221026363, 
#               'n_units_l4': 32.47934365247748, 
#               'dropout_l4': 0.37677928323437515} # kf10

In [None]:
# Replace missing test features with 0 before they are fed to the network.
X_test = X_test.fillna(0)

# 10-fold CV of a classifier-style network trained on the one-hot encoded
# score; predictions are mapped back to score values with the fitted encoder.
# NOTE(review): the encoder is fitted with drop='first', so the dropped score
# level is encoded as an all-zero target row — confirm that an 11-way softmax
# head is the intended pairing.
scores = []
sub_softmax = []
histories_softmax = []

early_stopping = EarlyStopping(monitor='val_loss', patience=8, verbose=0, mode='auto')
kf = KFold(n_splits=10, shuffle=True, random_state=seed)
# skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

for tr_idx, va_idx in tqdm(kf.split(X)):
# for tr_idx, va_idx in tqdm(skf.split(X, (y*2).astype('int'))):

    X_train, y_train = X.iloc[tr_idx].copy(), y_onehot.iloc[tr_idx].copy()
    X_valid, y_valid = X.iloc[va_idx].copy(), y_onehot.iloc[va_idx].copy()

    model_softmax = Sequential([
        BatchNormalization(),
        Dense(7, activation='relu'),
        Dropout(0.13836509283125703),
        Dense(156, activation='relu'),
        Dropout(0.19971336964529038),
        Dense(33, activation='relu'),
        Dropout(0.18671918564674286),
        Dense(176, activation='relu'),
        Dropout(0.10048457221026363),
        Dense(32, activation='relu'),
        Dropout(0.37677928323437515),
        Dense(11, activation='softmax')
    ])

    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # optimizers ignore or reject; `metrics` is passed as a list as the
    # compile() API documents.
    model_softmax.compile(
                    loss='categorical_crossentropy',
                    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                    metrics=['accuracy'])

    history = model_softmax.fit(X_train, y_train,
                            epochs=64,
                            validation_data=(X_valid, y_valid),
                            batch_size=32,
                            verbose=0,
                            callbacks=[early_stopping])

    histories_softmax.append(history)

    pred_softmax = model_softmax.predict(X_valid)
    pred_softmax_sub = model_softmax.predict(X_test)

    # Convert one-hot targets and network outputs back to score values so the
    # fold can be scored with RMSE like the other models.
    y_valid = encoder.inverse_transform(y_valid)
    pred_softmax = encoder.inverse_transform(pred_softmax)
    pred_softmax_sub = encoder.inverse_transform(pred_softmax_sub)

    score = mean_squared_error(y_valid, pred_softmax, squared=False)
    print(score)

    scores.append(score)
    sub_softmax.append(pred_softmax_sub)

print(np.mean(scores))

In [None]:
# submission_softmax = test.copy()
# submission_softmax['score'] = np.mean(sub_softmax, axis=0)
# submission_softmax = submission_softmax.reset_index()[['id', 'score']]
# submission_softmax

In [None]:
# submission_softmax.to_csv('submission.csv', index=False)

In [None]:
# submission = submission_softsign.copy()
# submission['score'] = submission_softsign['score']*0.5 + submission_tanh['score']*0.5
# submission

In [None]:
# submission.to_csv('submission.csv', index=False)

# stacking

In [None]:
# best_params = {'n_layers': 3, 
#                'n_units_l0': 17.289394927570733, 
#                'dropout_l0': 0.3888124753146273, 
#                'n_units_l1': 33.75866251284253, 
#                'dropout_l1': 0.10186989373897071, 
#                'n_units_l2': 48.8717155719031, 
#                'dropout_l2': 0.22462305056992732}

In [None]:
# X_test = X_test.fillna(0)
# pred_train_mean = []
# pred_test_mean = []

# for i in range(5):
#     scores = []
#     histories = []
#     preds_train = []
#     preds_test = []
#     val_indexes = []

#     early_stopping = EarlyStopping(monitor='val_loss', patience=8, verbose=0, mode='auto')
#     kf = KFold(n_splits=5, shuffle=True, random_state=seed)

#     for tr_idx, va_idx in tqdm(kf.split(X)):

#         X_train, y_train = X.iloc[tr_idx].copy(), y_onehot.iloc[tr_idx].copy()
#         X_valid, y_valid = X.iloc[va_idx].copy(), y_onehot.iloc[va_idx].copy()

#         model_softmax = Sequential([
#             BatchNormalization(),
#             Dense(17, activation='relu'),
#             Dropout(0.3888124753146273),
#             Dense(34, activation='relu'),
#             Dropout(0.10186989373897071),
#             Dense(51, activation='relu'),
#             Dropout(0.22462305056992732),
#             Dense(11, activation='softmax')
#         ])

#         model_softmax.compile(
#                         loss='categorical_crossentropy',
#                         optimizer=tf.keras.optimizers.Adam(lr=0.001),
#                         metrics='accuracy')

#         history = model_softmax.fit(X_train, y_train,
#                                 epochs=64,
#                                 validation_data=(X_valid, y_valid),
#                                 batch_size=32,
#                                 verbose=0,
#                                 callbacks=[early_stopping])

#         histories.append(history)

#         pred_softmax_train = model_softmax.predict(X_valid)
#         pred_softmax_test = model_softmax.predict(X_test)

#         y_valid = encoder.inverse_transform(y_valid)
#         pred_softmax_train = encoder.inverse_transform(pred_softmax_train)
#         pred_softmax_test = encoder.inverse_transform(pred_softmax_test)

#         preds_train.append(pred_softmax_train)
#         preds_test.append(pred_softmax_test)
#         val_indexes.append(va_idx)

#         score = mean_squared_error(y_valid, pred_softmax_train, squared=False)
#         print(score)

#         scores.append(score)

#     print(np.mean(scores))

#     val_indexes = np.concatenate(val_indexes)
#     preds_train = np.concatenate(preds_train, axis=0)
#     order = np.argsort(val_indexes)
#     pred_train = preds_train[order]
#     pred_test = np.mean(preds_test, axis=0)
    
#     pred_train_mean.append(pred_train)
#     pred_test_mean.append(pred_test)

In [None]:
# X['stacking'] = np.mean(pred_train_mean, axis=0)
# X_test['stacking'] = np.mean(pred_test_mean, axis=0)

# stacking_lgb

In [None]:
# kf = KFold(n_splits=10, shuffle=True, random_state=seed)

# scores = []
# sub_stacking_lgb = []
# for tr_idx, va_idx in tqdm(kf.split(X)):
#     X_train, y_train = X.iloc[tr_idx], y.iloc[tr_idx]
#     X_valid, y_valid = X.iloc[va_idx], y.iloc[va_idx]

#     lgb_train = lgb.Dataset(X_train, y_train)
#     lgb_eval = lgb.Dataset(X_valid, y_valid)

#     model_lgb = lgb.train(
#         params=params,
#         train_set=lgb_train,
#         valid_sets=[lgb_train, lgb_eval],
#         valid_names=['train', 'valid'],
#         callbacks=[
#             lgb.early_stopping(stopping_rounds=200, verbose=False),
#             lgb.log_evaluation(0)],
#         )

#     pred_lgb = model_lgb.predict(X_valid)
#     score = mean_squared_error(y_valid, pred_lgb, squared=False)
#     print(score)

#     scores.append(score)
    
#     if not LOCAL:
#         pred_lgb_sub = model_lgb.predict(X_test)
#         sub_stacking_lgb.append(pred_lgb_sub)

# np.mean(scores)

In [None]:
# submission_stacking_lgb = test.copy()
# submission_stacking_lgb['score'] = np.mean(sub_stacking_lgb, axis=0)
# submission_stacking_lgb = submission_stacking_lgb.reset_index()[['id', 'score']]
# submission_stacking_lgb

# stacking_xgb

In [None]:
# kf = KFold(n_splits=10, shuffle=True, random_state=seed)
# scores = []
# sub_stacking_xgb = []

# for tr_idx, va_idx in tqdm(kf.split(X)):

#     X_train, y_train = X.iloc[tr_idx], y.iloc[tr_idx]
#     X_valid, y_valid = X.iloc[va_idx], y.iloc[va_idx]

#     model_xgb = xgb.XGBRegressor(**params_xgb, n_estimators=10000)
#     model_xgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], verbose=False)

#     pred_xgb = model_xgb.predict(X_valid)
#     score = mean_squared_error(y_valid, pred_xgb, squared=False)
#     print(score)

#     scores.append(score)
    
#     if not LOCAL:
#         pred_xgb_sub = model_xgb.predict(X_test)
#         sub_stacking_xgb.append(pred_xgb_sub)

# np.mean(scores)

In [None]:
# submission_stacking_xgb = test.copy()
# submission_stacking_xgb['score'] = np.mean(sub_stacking_xgb, axis=0)
# submission_stacking_xgb = submission_stacking_xgb.reset_index()[['id', 'score']]
# submission_stacking_xgb

In [None]:
# submission = submission_lgb.copy()
# submission['score'] = submission_stacking_lgb['score']*0.5 + submission_softsign['score']*0.5
# submission

In [None]:
# submission.to_csv('submission.csv', index=False)

# Public note

In [None]:
# Placeholder for predictions taken from a public notebook; the code that
# produced them is not included in this cell.
# NOTE(review): the final blend further down reads submission_final['score'],
# which raises TypeError while this is still an empty list. It needs to be a
# DataFrame with a 'score' column (presumably row-aligned with the other
# submission frames — confirm).
submission_final = []
# copied from public note


In [None]:
# submission = submission_lgb.copy()
# submission['score'] = submission_lgb['score']*0.1 + submission_xgb['score']*0.1 + submission_softsign['score']*0.2 + submission_final['score']*0.6
# submission

In [None]:
# Final blend: 35% softsign network + 65% public-notebook predictions.
# submission_lgb is copied only as a template for the id/score layout; its own
# scores are overwritten on the next line.
# NOTE(review): submission_final is assigned an empty list earlier in this
# notebook, so indexing it with 'score' fails until real predictions are put there.
submission = submission_lgb.copy()
submission['score'] = submission_softsign['score']*0.35 + submission_final['score']*0.65
submission

In [None]:
# Write the blended predictions to submission.csv without the index column.
submission.to_csv('submission.csv', index=False)