# Welcome back !

Inspired by [this notebook](https://www.kaggle.com/fxalll/0-690-try-better-parameters)

Import libraries

In [1]:
import gc
gc.enable()

import sys
sys.path.append("../input/tez-lib/")

import os

import numpy as np
import pandas as pd
import tez
import torch
import torch.nn as nn
from joblib import Parallel, delayed
from transformers import AutoConfig, AutoModel, AutoTokenizer

In [2]:
# Modules
import re
import sys; sys.path.append('../input/iterative-stratification/iterative-stratification-master')
import time

import datatable as dt
import nltk
import numpy as np
import pandas as pd
import torch
import transformers
from bs4 import BeautifulSoup
from datasets import Dataset
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from matplotlib import pyplot as plt
from matplotlib_venn import venn3
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, train_test_split
from termcolor import colored
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [3]:
# BIO-style tag table: each discourse type owns two consecutive ids, first the
# "B-" tag and then the "I-" tag, in the order the types are listed below.
# "O" and "PAD" are appended afterwards with their fixed ids.
target_id_map = {
    f"{prefix}-{discourse_type}": 2 * type_pos + prefix_pos
    for type_pos, discourse_type in enumerate(
        (
            "Lead",
            "Position",
            "Evidence",
            "Claim",
            "Concluding Statement",
            "Counterclaim",
            "Rebuttal",
        )
    )
    for prefix_pos, prefix in enumerate(("B", "I"))
}
target_id_map["O"] = 14
target_id_map["PAD"] = -100


# Reverse lookup: integer id -> tag name.
id_target_map = dict(zip(target_id_map.values(), target_id_map.keys()))

class args1:
    """Inference settings for the checkpoints stored under ``fblongformerlarge1536``."""

    input_path = "../input/feedback-prize-2021/"
    model = "../input/longformerlarge4096/longformer-large-4096/"
    tez_model = "../input/fblongformerlarge1536/"
    output = "."
    batch_size = 8
    max_len = 4096


class args2(args1):
    """Same settings as ``args1``; only the checkpoint directory differs."""

    tez_model = "../input/tez-fb-large/"

In [4]:
class FeedbackDataset:
    """Indexable view over tokenized essays that wraps each one in CLS/SEP ids.

    ``samples`` is a sequence of dicts that each carry an ``"input_ids"`` list;
    ``tokenizer`` must expose ``cls_token_id`` and ``sep_token_id``.
    """

    def __init__(self, samples, max_len, tokenizer):
        self.samples = samples
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.length = len(samples)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        """Return ``{"ids": [...], "mask": [...]}`` for sample ``idx``.

        The CLS id is prepended, the sequence is cut to ``max_len - 1`` entries,
        and the SEP id is appended, so the result never exceeds ``max_len``.
        The mask is all ones and has the same length as ``ids``.
        """
        wrapped = [self.tokenizer.cls_token_id, *self.samples[idx]["input_ids"]]
        # Leave exactly one slot for the closing SEP id.
        wrapped = wrapped[: self.max_len - 1]
        wrapped.append(self.tokenizer.sep_token_id)
        return {"ids": wrapped, "mask": [1] * len(wrapped)}

In [5]:
class Collate:
    """Batch collator that pads ``ids``/``mask`` lists to the longest sample."""

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        """Pad every sample of ``batch`` and return ``{"ids", "mask"}`` long tensors.

        Padding goes on the right when ``tokenizer.padding_side == "right"`` and
        on the left otherwise; ids are padded with ``tokenizer.pad_token_id`` and
        masks with ``0``.
        """
        pad_on_right = self.tokenizer.padding_side == "right"
        # Target length is the longest id sequence in this batch.
        longest = max(len(sample["ids"]) for sample in batch)

        def _pad(seq, filler):
            padding = (longest - len(seq)) * [filler]
            return seq + padding if pad_on_right else padding + seq

        padded_ids = [_pad(sample["ids"], self.tokenizer.pad_token_id) for sample in batch]
        padded_mask = [_pad(sample["mask"], 0) for sample in batch]

        return {
            "ids": torch.tensor(padded_ids, dtype=torch.long),
            "mask": torch.tensor(padded_mask, dtype=torch.long),
        }

In [6]:
class FeedbackModel(tez.Model):
    """Token classifier: a transformer backbone followed by one linear layer.

    The backbone is instantiated from the configuration found at ``model_name``
    via ``AutoModel.from_config``; weights are expected to be loaded afterwards
    by the caller.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.model_name = model_name
        self.num_labels = num_labels

        config = AutoConfig.from_pretrained(model_name)
        # Overrides applied on top of the stored configuration.
        overrides = dict(
            output_hidden_states=True,
            hidden_dropout_prob=0.2,
            layer_norm_eps=17589e-7,
            add_pooling_layer=False,
        )
        config.update(overrides)

        self.transformer = AutoModel.from_config(config)
        self.output = nn.Linear(config.hidden_size, self.num_labels)

    def forward(self, ids, mask):
        """Return ``(probabilities, 0, {})``.

        The probabilities are a softmax over the last dimension of the linear
        head applied to the backbone's ``last_hidden_state``; the remaining two
        tuple slots are constant placeholders.
        """
        hidden = self.transformer(ids, mask).last_hidden_state
        probabilities = torch.softmax(self.output(hidden), dim=-1)
        return probabilities, 0, {}

In [7]:
def _prepare_test_data_helper(args, tokenizer, ids):
    test_samples = []
    for idx in ids:
        filename = os.path.join(args.input_path, "test", idx + ".txt")
        with open(filename, "r") as f:
            text = f.read()

        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=False,
            return_offsets_mapping=True,
        )
        input_ids = encoded_text["input_ids"]
        offset_mapping = encoded_text["offset_mapping"]

        sample = {
            "id": idx,
            "input_ids": input_ids,
            "text": text,
            "offset_mapping": offset_mapping,
        }

        test_samples.append(sample)
    return test_samples


def prepare_test_data(df, tokenizer, args):
    """Tokenize every unique essay id of ``df["id"]`` using four worker processes.

    The ids are split into four chunks, each chunk is handed to
    ``_prepare_test_data_helper`` in its own process, and the per-chunk sample
    lists are concatenated in chunk order.
    """
    essay_ids = df["id"].unique()
    jobs = (
        delayed(_prepare_test_data_helper)(args, tokenizer, chunk)
        for chunk in np.array_split(essay_ids, 4)
    )
    per_chunk = Parallel(n_jobs=4, backend="multiprocessing")(jobs)

    return [sample for chunk_samples in per_chunk for sample in chunk_samples]

In [8]:
df = pd.read_csv(os.path.join("../input/feedback-prize-2021/", "sample_submission.csv"))
df_ids = df["id"].unique()

tokenizer = AutoTokenizer.from_pretrained(args1.model)
test_samples = prepare_test_data(df, tokenizer, args1)
collate = Collate(tokenizer=tokenizer)

# The dataset does not depend on the fold, so build it once.
test_dataset = FeedbackDataset(test_samples, args1.max_len, tokenizer)

# One (settings, checkpoint index) pair per ensemble member: checkpoints 0-4 of
# args1 followed by checkpoints 0-4 of args2.
ensemble_members = [(cfg, fold_idx) for cfg in (args1, args2) for fold_idx in range(5)]

# raw_preds[k] accumulates the ensemble average for the k-th batch yielded by
# model.predict (each member contributes preds / number_of_members).
raw_preds = []
model = None
for fold_, (cfg, fold_idx) in enumerate(ensemble_members):
    # Release the previous member before building the next one; otherwise the old
    # model stays referenced by `model` and neither gc nor empty_cache can free it.
    model = None
    gc.collect()
    torch.cuda.empty_cache()

    model = FeedbackModel(model_name=cfg.model, num_labels=len(target_id_map) - 1)
    model.load(os.path.join(cfg.tez_model, f"model_{fold_idx}.bin"), weights_only=True)
    preds_iter = model.predict(test_dataset, batch_size=cfg.batch_size, n_jobs=-1, collate_fn=collate)

    for current_idx, preds in enumerate(preds_iter):
        preds = preds.astype(np.float16)
        preds = preds / len(ensemble_members)
        if fold_ == 0:
            raw_preds.append(preds)
        else:
            raw_preds[current_idx] += preds
    torch.cuda.empty_cache()
    gc.collect()

100%|██████████| 1/1 [00:02<00:00,  2.49s/it, stage=test]
100%|██████████| 1/1 [00:01<00:00,  1.52s/it, stage=test]
100%|██████████| 1/1 [00:01<00:00,  1.50s/it, stage=test]
100%|██████████| 1/1 [00:01<00:00,  1.59s/it, stage=test]
100%|██████████| 1/1 [00:01<00:00,  1.51s/it, stage=test]
100%|██████████| 1/1 [00:01<00:00,  1.52s/it, stage=test]
100%|██████████| 1/1 [00:01<00:00,  1.50s/it, stage=test]
100%|██████████| 1/1 [00:01<00:00,  1.51s/it, stage=test]
100%|██████████| 1/1 [00:01<00:00,  1.51s/it, stage=test]
100%|██████████| 1/1 [00:01<00:00,  1.53s/it, stage=test]


In [9]:
def get_pred_logreg():
    """Train a TF-IDF + logistic-regression sentence classifier and score the test essays.

    The function reads the competition files, builds per-essay folds, fits one
    TfidfVectorizer/LogisticRegression pair per fold on the annotated training
    spans, and averages the fold models' class probabilities over the
    nltk-tokenized sentences of every test essay.

    Returns:
        The DataFrame ``test_pred_logreg``: the test sentence table concatenated
        column-wise with one probability column per class name.
    """
    ### Parameters
    DEBUG_FLAG = False  # when True: 2 folds and read_df() keeps only a fixed list of essay ids
    VERSION = 'nb08'

    SUBMISSION_PATH = '/kaggle/input/feedback-prize-2021/sample_submission.csv'
    TRAIN_PATH = '/kaggle/input/feedback-prize-2021/train.csv'
    TRAIN_DIR = '/kaggle/input/feedback-prize-2021/train'
    TEST_DIR = '/kaggle/input/feedback-prize-2021/test'

    N_SPLITS = 5 if not DEBUG_FLAG else 2
    # _fill_gaps() registers an un-annotated gap as 'No Class' only when the gap
    # is strictly longer than this many characters.
    TEXT_MIN_LENGTH = 4
    
    ### Install
    # IPython shell escapes: these run on every call of the function and install
    # the wheels from the attached Kaggle datasets (no network access needed).
    !pip install --no-deps ../input/datasets-1183/datasets-1.18.3-py3-none-any.whl
    !pip install --no-deps ../input/chaii-python-module-installers/xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl
    
    ### My functions
    def read_df():
        """Load the annotation table, the sample submission and all essay texts.

        Returns ``(train, submission, train_id2text, test_id2text)`` where the
        last two are DataFrames with the columns ``id`` and ``text``. With
        DEBUG_FLAG set, ``train`` and ``train_id2text`` are restricted to a
        fixed list of essay ids.
        """

        def _read_essays(directory, file_names):
            # (essay id, essay text) pairs in the order of `file_names`.
            essays = []
            for file_name in file_names:
                with open(os.path.join(directory, file_name), 'r') as f:
                    essays.append((file_name.replace('.txt', ''), f.read()))
            return essays

        train = dt.fread(TRAIN_PATH).to_pandas()
        for column, dtype in (
            ('discourse_id', int),
            ('discourse_start', 'int16'),
            ('discourse_end', 'int16'),
        ):
            train[column] = train[column].astype(dtype)
        submission = dt.fread(SUBMISSION_PATH).to_pandas()

        print(f'train shape: {train.shape}')
        print(f'submission shape: {submission.shape}')

        train_files = sorted(os.listdir(TRAIN_DIR))
        print(f'number of train_files: {len(train_files)}')

        test_files = sorted(os.listdir(TEST_DIR))
        print(f'number of test_files: {len(test_files)}')

        train_id2text = _read_essays(TRAIN_DIR, train_files)
        test_id2text = _read_essays(TEST_DIR, test_files)

        if DEBUG_FLAG:
            text_ids = ['0000D23A521A', '00066EA9880D', '000E6DE9E817', '001552828BD0', 
                        '0016926B079C', '0019E4D09427', '001A03E06F3C', '00203C45FC55', 
                        '00213CD75AC3', '0027FC00C35B', '00299B378633', '0029F4D19C3F', 
                        '003CF65C2780', '003D9F49423C', '003FDC7E6F20', '0045BE2791A2', 
                        '004AC288D833', '004BE23B05BF', '004EA492DA04', '005026E0386C',
                        '0054850878E3', '0056F3D261D5', '0057DB1DC50B', '005D28D3FEC2',
                        '006FCE4404E3', '007812CC14B2', '007ACE74B050', '007E0CBA8852',
                        '008015604AA0', '0080BB43EC89', '0083B82A9C6F', '00852F390697']

            train = train.loc[train['id'].isin(text_ids)].reset_index(drop=True)
            train_id2text = [pair for pair in train_id2text if pair[0] in text_ids]

            print(f'* * * DEBUG_FLAG is True * * *')
            print(f'train shape: {train.shape}')
            print(f'train files: {len(train_id2text)}')

        return (
            train,
            submission,
            pd.DataFrame(train_id2text, columns=['id', 'text']),
            pd.DataFrame(test_id2text, columns=['id', 'text']),
        )


    def color_text(id):
        '''
        ref. https://www.kaggle.com/ilialar/feedback-prize-simple-eda

        Return the text of essay ``id`` with every annotated discourse span
        passed through ``colored`` using a colour chosen by discourse type.
        Stretches between annotations are passed through ``colored`` with the
        colour ``None``.
        '''
        color_scheme = {
            'Lead': 'green',
            'Position': 'red',
            'Claim': 'blue',
            'Counterclaim': 'magenta',
            'Rebuttal': 'yellow',
            'Evidence': 'cyan',
            'Concluding Statement': 'grey'
        } 

        annot_df = train[train['id'] == id]
        # Look the essay text up in the id/text table returned by read_df();
        # there is no `dic_train` mapping anywhere in this notebook.
        text = train_id2text.loc[train_id2text['id'] == id, 'text'].iloc[0]

        # (start, end, colour) triples sorted by start offset.
        blocks = [(int(row['discourse_start']), int(row['discourse_end']), color_scheme[row['discourse_type']]) for k, row in annot_df.iterrows()]
        blocks.sort()
        # Walk the sorted spans and insert an uncoloured block for every gap
        # between the end of the previous span and the start of the next one.
        i = 0
        last_symbol = -1
        while i < len(blocks):
            if blocks[i][0] > last_symbol + 1:
                blocks.insert(i, (last_symbol+1, blocks[i][0] - 1, None))
            last_symbol = blocks[i][1]
            i += 1
        # Trailing text after the last annotation.
        if last_symbol < len(text):
            blocks.append((last_symbol+1, len(text) - 1, None))

        # Block ends are treated as inclusive here, hence the +1 in the slice.
        colored_text = ''.join([colored(text[x[0]:x[1]+1], x[2]) for x in blocks])
        return colored_text


    def text_cleaning(text):
        '''
        ref) # https://www.kaggle.com/manabendrarout/pytorch-roberta-ranking-baseline-jrstc-train

        Cleans text into a basic form for NLP. Operations, in order:
        1. Removes embedded URL links
        2. Removes HTML tags (via BeautifulSoup with the lxml parser)
        3. Removes characters matched by the emoji pattern
        4. Collapses runs of spaces into one space
        5. Removes dotted-quad IP addresses
        6. Replaces "!" by a space and deletes newlines, apostrophes, "|" and "="
        7. Rewrites a fixed list of obfuscated or abbreviated words
        8. Strips leading and trailing whitespace

        text - Text piece to be cleaned.
        Returns the cleaned string.
        '''
        template = re.compile(r'https?://\S+|www\.\S+') #Removes website links
        text = template.sub(r'', text)

        soup = BeautifulSoup(text, 'lxml') #Removes HTML tags
        only_text = soup.get_text()
        text = only_text

        # NOTE(review): the last range (U+24C2..U+1F251) is very wide and also
        # covers non-emoji blocks such as CJK ideographs - confirm this is intended.
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        text = emoji_pattern.sub(r'', text)

        # text = re.sub(r"[^a-zA-Z\d]", " ", text) #Remove special Charecters
        text = re.sub(' +', ' ', text) #Remove Extra Spaces
        # Raw string: "\d" and "\." are not valid escapes in a normal string literal.
        ipPattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}') # Removes IP address
        text = ipPattern.sub(r'', text)
        bikkuri = re.compile('!') # Removes exclamation marks
        text = bikkuri.sub(r' ', text)
        text = text.replace('\n','')
        text = text.replace("\'","")
        text = text.replace("|","")
        text = text.replace("=","")
        text = text.replace("F**K", "FUCK")
        text = text.replace("F__K", "FUCK")
        text = text.replace("f**k", "fuck")
        text = text.replace("f__k", "fuck")
        text = text.replace("f*ck", "fuck")    
        text = text.replace("S$X", "SEX")
        text = text.replace("s$x", "sex")
        # Each replacement is applied twice on purpose: matches cannot overlap, so
        # " u u " needs a second pass to become " you you ".
        text = text.replace(" u ", " you ")
        text = text.replace(" u ", " you ")
        text = text.replace(" U ", " you ")
        text = text.replace(" U ", " you ")
        text = text.replace("YOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUUUUUUUUUU", "YOU")
        text = text.strip() # remove spaces at the beginning and at the end of string
        return text


    def make_kfold(df):
        '''
        from Abhishek
        https://www.kaggle.com/abhishek/creating-folds-properly-hopefully-p/notebook

        Assign every essay id to one of N_SPLITS folds, stratified on the
        per-essay counts of each discourse type.

        df - annotation table with at least the columns "id" and "discourse_type".
        Returns a DataFrame with the columns ['id', 'kfold'].
        '''
        # One row per essay holding the number of annotations of each discourse type.
        dfx = pd.get_dummies(df, columns=["discourse_type"]).groupby(["id"], as_index=False).sum()
        # Keep "id" plus the one-hot count columns. The parentheses matter: without
        # them `and` binds tighter than `or` and "discourse_type_num" is never excluded.
        cols = [
            c for c in dfx.columns
            if (c.startswith("discourse_type_") or c == "id") and c != "discourse_type_num"
        ]
        # Copy so the "kfold" assignments below act on an independent frame.
        dfx = dfx[cols].copy()

        mskf = MultilabelStratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
        labels = [c for c in dfx.columns if c != "id"]
        dfx_labels = dfx[labels]
        dfx["kfold"] = -1

        for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
            print(len(trn_), len(val_))
            dfx.loc[val_, "kfold"] = fold

        return dfx[['id', 'kfold']]
    
    
    ### Load the data
    train, submission, train_id2text, test_id2text = read_df()

    ### Create the folds (one fold number per essay id, merged onto every annotation row)
    id_kfold = make_kfold(train)
    train = train.merge(id_kfold, on='id', how='left')

    display(train.head(3))

    ### Dictionaries mapping labels to integers and back
    # Class ids follow the order of first appearance in train['discourse_type'];
    # 'No Class' is appended last.
    id2class = dict(enumerate(train['discourse_type'].unique().tolist() + ['No Class']))
    class2id = {v: k for k, v in id2class.items()}

    print(id2class)
    print(class2id)


    ### Registering spans that have no label
    def _get_elements(text_id):
        """Return essay ``text_id``'s annotations as (start, end, type) tuples sorted by start."""
        span_columns = ['discourse_start', 'discourse_end', 'discourse_type']
        spans = train.query('id == @text_id')[span_columns]
        return spans.sort_values('discourse_start').to_records(index=False).tolist()


    def _fill_gaps(text_id):
        """Return the essay's annotated spans plus 'No Class' spans for large gaps.

        A gap (before the first span, after the last span, or between two
        consecutive spans) becomes a ``(gap_start, gap_end, 'No Class')`` element
        only when it is longer than TEXT_MIN_LENGTH characters. The result is
        sorted by span start.
        """
        elements = _get_elements(text_id)
        text_length = len(train_id2text.query('id == @text_id').iloc[0]['text'])

        ### Leading / trailing text that no discourse covers becomes 'No Class'
        first_start = elements[0][0]
        last_end = elements[-1][1]
        edge_fillers = []
        if first_start > TEXT_MIN_LENGTH:
            edge_fillers.append((0, first_start, 'No Class'))
        if text_length - last_end > TEXT_MIN_LENGTH:
            edge_fillers.append((last_end, text_length, 'No Class'))
        elements = sorted(elements + edge_fillers, key=lambda span: span[0])

        ### Gaps between consecutive spans in the middle of the essay become 'No Class'
        inner_fillers = [
            (previous[1], current[0], 'No Class')
            for previous, current in zip(elements, elements[1:])
            if current[0] - previous[1] > TEXT_MIN_LENGTH
        ]

        return sorted(elements + inner_fillers, key=lambda span: span[0])


    def get_sentences(text_id):
        """Build one row per non-blank span of essay ``text_id``.

        Columns: ``id``, ``discourse_text`` (the text slice), ``discourse_type``
        and ``predictionstring`` (space-joined running word indices, counted over
        the whitespace-split words of the emitted spans).
        """
        text = train_id2text.query('id == @text_id').iloc[0]['text']

        rows = []
        next_word_id = 0
        for span_start, span_end, discourse_type in _fill_gaps(text_id):
            sentence = text[span_start: span_end]
            if not sentence.strip():
                # Whitespace-only spans are skipped and consume no word ids.
                continue
            n_words = len(sentence.split())
            word_ids = ' '.join(map(str, range(next_word_id, next_word_id + n_words)))
            rows.append([text_id, sentence, discourse_type, word_ids])
            next_word_id += n_words

        return pd.DataFrame(rows, columns=['id', 'discourse_text', 'discourse_type', 'predictionstring'])


    ### Get the mapping from essay id to essay text
    def get_id2text(is_train=True):
        """Read every file of TRAIN_DIR (or TEST_DIR when ``is_train`` is false).

        Returns a DataFrame with the columns ``id`` (file name without '.txt')
        and ``text``, ordered by sorted file name.
        """
        read_dir = TRAIN_DIR if is_train else TEST_DIR
        files = sorted(os.listdir(read_dir))

        print(f'number of files: {len(files)}')

        records = []
        for file_name in files:
            with open(os.path.join(read_dir, file_name), 'r') as f:
                records.append((file_name.replace('.txt', ''), f.read()))

        return pd.DataFrame(records, columns=['id', 'text'])


    def get_sentence_tokenize(id2text):
        """Split every essay of ``id2text`` into sentences with nltk.

        Returns a DataFrame with the columns ``id``, ``discourse_text`` (one
        sentence per row) and ``ids`` (space-joined running indices of the
        sentence's whitespace-split words; numbering restarts for every essay).
        """
        rows = []
        for text_id, text in zip(id2text['id'].values, id2text['text'].values):
            next_word_id = 0
            ### Split into sentences, then number the words of each sentence
            for sentence in nltk.sent_tokenize(text):
                n_words = len(sentence.split())
                word_ids = ' '.join(map(str, range(next_word_id, next_word_id + n_words)))
                rows.append([text_id, sentence, word_ids])
                next_word_id += n_words
        return pd.DataFrame(rows, columns=['id', 'discourse_text', 'ids'])


    ### Fill in the text that was not annotated
    ### The train set has 15594 files, so this takes a few minutes
    dfs_sentences = []
    for text_id in train['id'].unique():
        df = get_sentences(text_id)
        dfs_sentences.append(df)
    df_sentences = pd.concat(dfs_sentences)
    # Integer label per span, then the essay's fold number.
    df_sentences['label'] = df_sentences['discourse_type'].map(class2id)
    df_sentences = df_sentences.merge(id_kfold, on="id", how="left")

    display(df_sentences.head(3))

    ### Get the sentence tokens of the train essays
    id2text_train = get_id2text(is_train=True)
    sent_tokenize_train = get_sentence_tokenize(id2text_train)
    sent_tokenize_train = sent_tokenize_train.merge(id_kfold, how='left', on='id')

    display(sent_tokenize_train.head(3))

    ### Get the sentence tokens of the test essays
    id2text_test = get_id2text(is_train=False)
    sent_tokenize_test = get_sentence_tokenize(id2text_test)

    # Character span [start_token, end_token) of every sentence inside its essay.
    # The search resumes after the previous sentence of the same essay, so a
    # sentence that occurs more than once gets the span of its own occurrence
    # instead of always the first one. If nothing is found from the cursor on,
    # fall back to a search from the start of the essay (-1 when absent).
    text_by_id = dict(zip(id2text_test['id'], id2text_test['text']))
    search_from = {}
    start_token = []
    end_token = []
    for text_id, sentence in zip(sent_tokenize_test['id'], sent_tokenize_test['discourse_text']):
        essay_text = text_by_id[text_id]
        begin = essay_text.find(sentence, search_from.get(text_id, 0))
        if begin >= 0:
            search_from[text_id] = begin + len(sentence)
        else:
            begin = essay_text.find(sentence)
        start_token.append(begin)
        end_token.append(begin + len(sentence))
    sent_tokenize_test['start_token'] = start_token
    sent_tokenize_test['end_token'] = end_token

    display(sent_tokenize_test.head(3))

    ### training dataset
    train_X = df_sentences['discourse_text']
    train_y = df_sentences['label']

    # Faster Metric Computation | Kaggle (@cpmpml)
    # https://www.kaggle.com/cpmpml/faster-metric-computation

    def calc_overlap3(set_pred, set_gt):
        """
        Calculates if the overlap between prediction and
        ground truth is enough for a potential True positive.

        Returns True when the intersection covers at least half of the ground
        truth set AND at least half of the prediction set. Returns False when
        either argument is not a set (e.g. the NaN left by an outer merge) or
        when a set is empty.
        """
        try:
            len_gt = len(set_gt)
            len_pred = len(set_pred)
            inter = len(set_gt & set_pred)
            return inter / len_gt >= 0.5 and inter / len_pred >= 0.5
        except (TypeError, ZeroDivisionError):
            # TypeError: at least one input is NaN / not a set.
            # ZeroDivisionError: an empty set.
            return False

    def score_feedback_comp_micro3(pred_df, gt_df, discourse_type):
        """
        Score one discourse type for the kaggle Student Writing Competition.

        Follows the steps of the evaluation page:
            https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation

        pred_df needs the columns 'id', 'class', 'predictionstring';
        gt_df needs 'id', 'discourse_type', 'predictionstring'.
        Returns 2*TP / (number of predictions + number of ground truths).
        """
        kept_columns = ['id', 'predictionstring']
        gt_df = gt_df.loc[gt_df['discourse_type'] == discourse_type, kept_columns].reset_index(drop=True)
        pred_df = pred_df.loc[pred_df['class'] == discourse_type, kept_columns].reset_index(drop=True)
        pred_df['pred_id'] = pred_df.index
        gt_df['gt_id'] = gt_df.index
        # Word-index strings become sets so overlaps are plain set intersections.
        pred_df['predictionstring'] = [set(words.split(' ')) for words in pred_df['predictionstring']]
        gt_df['predictionstring'] = [set(words.split(' ')) for words in gt_df['predictionstring']]

        # Step 1: compare every prediction with every ground truth of the same essay.
        joined = pred_df.merge(
            gt_df,
            left_on='id',
            right_on='id',
            how='outer',
            suffixes=('_pred', '_gt'),
        )
        is_match = [
            calc_overlap3(pred_set, gt_set)
            for pred_set, gt_set in zip(joined.predictionstring_pred, joined.predictionstring_gt)
        ]

        # Step 2: a pair overlapping >= 0.5 in both directions is a match. The best
        # match itself is not needed for the score: counting the distinct matched
        # ground truths is enough.
        true_positives = joined.loc[is_match]['gt_id'].nunique()

        # Step 3: unmatched ground truths are false negatives and unmatched
        # predictions are false positives, so the denominator is just both sizes.
        n_predictions = len(pred_df)
        n_ground_truths = len(gt_df)

        return 2 * true_positives / (n_predictions + n_ground_truths)

    def score_feedback_comp3(pred_df, gt_df, return_class_scores=False):
        """Average the per-class scores over the discourse types present in ``gt_df``.

        Returns the mean score, or ``(mean, {discourse_type: score})`` when
        ``return_class_scores`` is true.
        """
        class_scores = {
            discourse_type: score_feedback_comp_micro3(pred_df, gt_df, discourse_type)
            for discourse_type in gt_df.discourse_type.unique()
        }
        f1 = np.mean(list(class_scores.values()))
        return (f1, class_scores) if return_class_scores else f1

    def get_transition(valid):
        """Display the row-normalised table of class -> next class within each essay."""
        following_class = valid.groupby('id')['class'].transform(lambda classes: classes.shift(-1))
        transitions = valid.copy()
        transitions['next_class'] = following_class
        table = pd.crosstab(
            transitions['class'],
            transitions['next_class'],
            normalize='index',
            dropna=False,
        )
        display(table.style.background_gradient(axis=1))


    def get_connect(valid):
        """Merge a row into the next row of the same essay when both share a class.

        For every row whose class equals the class of the following row of the
        same essay (rows of class 'Claim' are never merged), the following row's
        ``predictionstring`` is appended to this row's and the following row is
        dropped. ``valid`` itself is not modified.

        NOTE(review): each row only absorbs its immediate successor, so in a run
        of three or more equal classes the third row's words appear to be lost
        (the second row is dropped after absorbing them) - confirm before use.
        """
        pred_connect = valid.copy()

        # Per-essay look-ahead columns.
        pred_connect['next_predictionstring'] = pred_connect.groupby('id')['predictionstring'].transform(lambda x: x.shift(-1))
        pred_connect['next_discourse_text'] = pred_connect.groupby('id')['discourse_text'].transform(lambda x: x.shift(-1))
        pred_connect['next_class'] = pred_connect.groupby('id')['class'].transform(lambda x: x.shift(-1))

        # Merge flag: this row has the same class as the next one (never for 'Claim').
        same_as_next = (pred_connect['class'] == pred_connect['next_class']) & (pred_connect['class'] != 'Claim')
        pred_connect['match'] = same_as_next

        # Delete flag: the row right after a merged row. fill_value keeps the
        # column boolean instead of going through an object column with NaN.
        pred_connect['delete'] = same_as_next.shift(1, fill_value=False)

        # Append the next row's predictionstring where the merge flag is set.
        # Fields are read by label: integer keys on a label-indexed row are deprecated.
        def f_match(row):
            if row['match']:
                return row['predictionstring'] + ' ' + row['next_predictionstring']
            return row['predictionstring']

        pred_connect['predictionstring'] = pred_connect[['match', 'predictionstring', 'next_predictionstring']].apply(f_match, axis=1)

        # Drop the rows flagged for deletion.
        pred_connect = pred_connect.drop(pred_connect.index[pred_connect['delete'].to_numpy()])

        return pred_connect
    
    ### Cross validation
    # One TF-IDF vectorizer and one classifier are kept per fold; oof_train holds
    # the out-of-fold class probabilities for every train sentence.
    scores = []
    tfidfs = []
    models = []
    oof_train = np.zeros((sent_tokenize_train.shape[0], len(id2class)))

    for fold_id in range(N_SPLITS):
        start = time.time()
        print(f'#', '-' * 80, '#')
        print(f'fold_id: {fold_id}')

        ### Shape the training and validation data
        # Training uses the annotated spans; validation uses the nltk sentences of
        # the held-out essays, which is the same granularity as at test time.
        print(f'preprocessing ...')
        X_trn = df_sentences.query('kfold != @fold_id')['discourse_text']
        y_trn = df_sentences.query('kfold != @fold_id')['label']

        valid = sent_tokenize_train.query('kfold == @fold_id').rename(columns={'ids': 'predictionstring'})
        X_val = valid['discourse_text']

        ### tf-idf
        tfidf = TfidfVectorizer(
            stop_words='english',
            min_df=3,
            max_df=0.5, 
            max_features=100_000,
            analyzer='word',
            lowercase=False,
            ngram_range=(1, 1)
        )

        X_trn_tfidf = tfidf.fit_transform(X_trn)
        X_val_tfidf = tfidf.transform(X_val)
        tfidfs.append(tfidf)

        print('Total number of train samples:', X_trn_tfidf.shape[0])
        print('Total number of valid samples:', X_val_tfidf.shape[0])

        ### Training
        print(f'training ...')
        clf = LogisticRegression(
            class_weight='balanced',
            max_iter=1000, 
            tol=1e-3, 
            n_jobs=-1,
            solver='liblinear',
        )

        clf.fit(X_trn_tfidf, y_trn)

        ### Inference
        print(f'predicting ...')
        val_pred_proba = clf.predict_proba(X_val_tfidf)
        oof_train[valid.index, :] = val_pred_proba
        val_pred = val_pred_proba.argmax(axis=1)
        valid['label'] = val_pred
        valid['class'] = valid['label'].map(id2class)
        models.append(clf)

        ### Evaluation
        # Optional post-processing that is currently switched off:
        ### print(valid['class'].value_counts(normalize=True))
        ### valid = get_connect(valid)
        ### print(valid['class'].value_counts(normalize=True))
        print(f'validation ...')
        score = score_feedback_comp3(valid[valid['class'] != 'No Class'], train.query('kfold == @fold_id'))
        scores.append(score)
        ### get_transition(valid)
        elapsed = time.time() - start
        print(f'fold {fold_id} - score: {score:.6f}, elapsed time: {elapsed:.2f} [sec]')

    print(f'* ' * 40)
    # score_feedback_comp3 returns the mean of the per-class F1 scores, not an AUC.
    print(f'Average macro F1: {sum(scores)/N_SPLITS:.6f}')

    ### Text input for the test TF-IDF
    test_X = sent_tokenize_test['discourse_text']

    ### Predict on test
    # Average of the fold models' class probabilities; each model uses the
    # vectorizer that was fitted together with it.
    test_pred_df = sum([model.predict_proba(tfidf.transform(test_X)) / len(models) for tfidf, model in zip(tfidfs, models)])
    # Column k is renamed to the class name id2class[k].
    test_pred_df = pd.DataFrame(test_pred_df).rename(columns=id2class)
    test_pred_logreg = pd.concat([sent_tokenize_test, test_pred_df], axis=1)

    display(test_pred_logreg.head(3))
    
    return test_pred_logreg

In [10]:
### Ensemble the predictions here
### test_pred_logreg = pd.read_csv('../input/test-pred-logreg/test_pred_logreg_20220313_1617.csv')
test_pred_logreg = get_pred_logreg()

### Convert the sentence-level probabilities into a per-token probability sequence
# pred_sample_seq[k] ends up as an array with one row per token of test_samples[k]
# and one column per logistic-regression class.
pred_sample_seq = []
for sample_idx, sample in enumerate(test_samples):
    essay_id = sample['id']
    offset_mapping = sample['offset_mapping']
    df = test_pred_logreg.query('id == @essay_id').reset_index(drop=True)

    ### print('#-------------------------------------------------------------------------------#')
    ### print(essay_id)
    ### print('#-------------------------------------------------------------------------------#')
    # Number of tokens covered by each sentence of this essay.
    token_seq_list = []
    for i, row in df.iterrows():
        thresh_start_token = row['start_token']
        thresh_end_token = row['end_token']
        start_token, end_token = 0, 0
        # Find the token whose character span contains the sentence start and the
        # one containing the sentence end. Both bounds are inclusive, so when two
        # neighbouring tokens qualify the later one wins.
        # NOTE(review): offset_mapping_start_idx / offset_mapping_end_idx are only
        # assigned on a hit, so on a miss they keep the previous sentence's value
        # (or are undefined for the very first sentence) - confirm this cannot happen.
        for j, (s, e) in enumerate(offset_mapping):
            if (s <= thresh_start_token) and (thresh_start_token <= e):
                start_token = s
                offset_mapping_start_idx = j
            if (s <= thresh_end_token) and (thresh_end_token <= e): 
                end_token = e
                offset_mapping_end_idx = j
        token_size = 1 + offset_mapping_end_idx - offset_mapping_start_idx
        ### debug print
        ### print((thresh_start_token, thresh_end_token), (start_token, end_token), token_size)
        token_seq_list.append( token_size )
    
    ### Not sure why it is needed, but adjust so the counts add up:
    ### the last sentence absorbs the difference to the real token count.
    fill_token_size = len(offset_mapping) - sum(token_seq_list)
    token_seq_list[-1] += fill_token_size
    
    ### Build the per-token prediction sequence
    pred_seq_list = []
    for i, row in df.iterrows():
        # Column order here fixes the meaning of the 8 columns of each row.
        pred = row[['Lead', 'Position', 'Evidence', 'Claim', 'Concluding Statement', 'Counterclaim', 'Rebuttal', 'No Class']]
        # Repeat the sentence's probabilities once per token of the sentence.
        pred = np.stack([pred.to_numpy() for _ in range(token_seq_list[i])])
        ### print(pred.shape)
        pred_seq_list.append(pred)

    pred_seq = np.vstack(pred_seq_list)
    pred_sample_seq.append(pred_seq)
    print(essay_id, pred_seq.shape, len(offset_mapping))
    
### Expand the probabilities to the 15 classes of id_target_map (B-Lead, I-Lead, ...)
def from8class_to15class(arr):
    """Duplicate columns 0-6 of ``arr`` side by side and append column 7 once.

    ``arr`` is a 2-D array with at least 8 columns; the result has the same
    number of rows and 15 columns ordered 0,0,1,1,...,6,6,7.
    """
    source_columns = [column for column in range(7) for _ in range(2)] + [7]
    return arr[:, source_columns]

### Expand every essay's per-token predictions to the 15-class layout
pred_logreg = list(map(from8class_to15class, pred_sample_seq))

### Blend into a copy of raw_preds[0], so raw_preds itself is left unchanged
### NOTE(review): values are added from sequence position 0, while the cell that decodes
### raw_preds drops position 0 of every sequence ([1:]) — confirm the token alignment.
_raw_preds = [raw_preds[0].copy()]
for i, logreg_pred in enumerate(pred_logreg):
    d1, d2 = logreg_pred.shape
    ### Explicit read-add-assign (not `+=`) so dtype casting happens on assignment
    blended = _raw_preds[0][i, :d1, :d2] + 0.05 * logreg_pred
    _raw_preds[0][i, :d1, :d2] = blended

Processing /kaggle/input/datasets-1183/datasets-1.18.3-py3-none-any.whl
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 1.16.1
    Uninstalling datasets-1.16.1:
      Successfully uninstalled datasets-1.16.1
Successfully installed datasets-1.18.3
Processing /kaggle/input/chaii-python-module-installers/xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl
xxhash is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
train shape: (144293, 8)
submission shape: (5, 3)
number of train_files: 15594
number of test_files: 5




12477 3117
12474 3120
12475 3119
12475 3119
12475 3119


Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring,kfold
0,423A1CA112E2,1622627660524,8,229,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,1
1,423A1CA112E2,1622627653021,230,312,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59,1
2,423A1CA112E2,1622627671020,313,401,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75,1


{0: 'Lead', 1: 'Position', 2: 'Evidence', 3: 'Claim', 4: 'Concluding Statement', 5: 'Counterclaim', 6: 'Rebuttal', 7: 'No Class'}
{'Lead': 0, 'Position': 1, 'Evidence': 2, 'Claim': 3, 'Concluding Statement': 4, 'Counterclaim': 5, 'Rebuttal': 6, 'No Class': 7}


Unnamed: 0,id,discourse_text,discourse_type,predictionstring,label,kfold
0,423A1CA112E2,Phones\n\n,No Class,0,7,1
1,423A1CA112E2,Modern humans today are always on their phone....,Lead,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,0,1
2,423A1CA112E2,They are some really bad consequences when stu...,Position,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59,1,1


number of files: 15594


Unnamed: 0,id,discourse_text,ids,kfold
0,0000D23A521A,"Some people belive that the so called ""face"" o...",0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15,3
1,0000D23A521A,This is not the case.,16 17 18 19 20,3
2,0000D23A521A,The face on Mars is a naturally occuring land ...,21 22 23 24 25 26 27 28 29 30 31 32 33,3


number of files: 5


Unnamed: 0,id,discourse_text,ids,start_token,end_token
0,0FB0700DAF44,"During a group project, have you ever asked a ...",0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15,0,95
1,0FB0700DAF44,"Or, when you were studying for a math test, di...",16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 3...,96,230
2,0FB0700DAF44,Asking for other's opinions is especially bene...,41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 5...,231,377


# -------------------------------------------------------------------------------- #
fold_id: 0
preprocessing ...
Total number of train samples: 138218
Total number of valid samples: 66306
training ...


  " = {}.".format(effective_n_jobs(self.n_jobs)))


predicting ...
validation ...
fold 0 - score: 0.156546, elapsed time: 16.02 [sec]
# -------------------------------------------------------------------------------- #
fold_id: 1
preprocessing ...
Total number of train samples: 138687
Total number of valid samples: 65242
training ...
predicting ...
validation ...
fold 1 - score: 0.157667, elapsed time: 16.11 [sec]
# -------------------------------------------------------------------------------- #
fold_id: 2
preprocessing ...
Total number of train samples: 138603
Total number of valid samples: 65560
training ...
predicting ...
validation ...
fold 2 - score: 0.158174, elapsed time: 15.48 [sec]
# -------------------------------------------------------------------------------- #
fold_id: 3
preprocessing ...
Total number of train samples: 138565
Total number of valid samples: 65713
training ...
predicting ...
validation ...
fold 3 - score: 0.155650, elapsed time: 16.15 [sec]
# ----------------------------------------------------------------

Unnamed: 0,id,discourse_text,ids,start_token,end_token,Lead,Position,Evidence,Claim,Concluding Statement,Counterclaim,Rebuttal,No Class
0,0FB0700DAF44,"During a group project, have you ever asked a ...",0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15,0,95,0.054759,0.048957,0.469391,0.107592,0.002439,0.01776,0.067038,0.232063
1,0FB0700DAF44,"Or, when you were studying for a math test, di...",16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 3...,96,230,0.098995,0.010858,0.464329,0.207154,0.00824,0.018245,0.141401,0.050777
2,0FB0700DAF44,Asking for other's opinions is especially bene...,41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 5...,231,377,0.059134,0.044478,0.045444,0.629368,0.110454,0.008393,0.013144,0.089586


18409261F5C2 (1302, 8) 1302
D46BCB48440A (450, 8) 450
0FB0700DAF44 (756, 8) 756
D72CB1C11673 (474, 8) 474
DF920E0A7337 (799, 8) 799


In [11]:
### Decode every prediction array into one class id and one score per sequence position.
### NOTE(review): this iterates raw_preds, not the blended _raw_preds copy built earlier,
### so the blended values are not used here — confirm that is intended.
final_preds = []
final_scores = []

for rp in raw_preds:
    pred_class = np.argmax(rp, axis=2)
    pred_scrs = np.max(rp, axis=2)
    final_preds.extend(pred_class.tolist())
    final_scores.extend(pred_scrs.tolist())

### Store label names and scores on each sample, skipping sequence position 0.
for j, sample in enumerate(test_samples):
    sample["preds"] = [id_target_map[p] for p in final_preds[j][1:]]
    sample["pred_scores"] = final_scores[j][1:]

In [12]:
def jn(pst, start, end):
    """Return the items of ``pst[start:end]`` as one space-separated string."""
    return " ".join(map(str, pst[start:end]))


def link_evidence(oof):
    """Merge neighbouring "Evidence" predictions of one essay into longer spans.

    Parameters
    ----------
    oof : pandas.DataFrame
        Predictions with at least the columns ``id``, ``class`` and
        ``predictionstring`` (space-separated integer word indices).

    Returns
    -------
    pandas.DataFrame
        The linked "Evidence" rows (columns ``id``, ``class``,
        ``predictionstring``) outer-merged with the rows of every other class,
        which are passed through unchanged.

    Notes
    -----
    Per essay, Evidence rows are walked in frame order. A row is appended to
    the span being built unless its first word index is more than ``thresh``
    past the span's last index, or more than ``thresh2`` past the span's first
    index; in either case the current span is emitted and a new one starts.
    Emitted strings contain word indices only. Works for any number of essays,
    including a single one.
    """
    thresh = 1    # largest allowed step from a span's last word to the next row's first word
    thresh2 = 26  # largest allowed distance from a span's first word to the next row's first word

    eoof = oof[oof['class'] == "Evidence"]
    neoof = oof[oof['class'] != "Evidence"]

    retval = []
    # unique() on the full frame keeps essays in order of first appearance.
    for idv in oof['id'].unique():
        q = eoof[eoof['id'] == idv]
        # Parse every row once; rows without any word index cannot be linked and are dropped.
        segments = [[int(x) for x in ps.split()] for ps in q['predictionstring']]
        segments = [seg for seg in segments if seg]
        if not segments:
            continue

        current = list(segments[0])
        for seg in segments[1:]:
            too_far_from_end = seg[0] > current[-1] + thresh
            too_far_from_start = seg[0] - current[0] > thresh2
            if too_far_from_end or too_far_from_start:
                retval.append((idv, "Evidence", " ".join(map(str, current))))
                current = list(seg)
            else:
                # Only the word indices are concatenated; no separator marker is kept.
                current.extend(seg)
        retval.append((idv, "Evidence", " ".join(map(str, current))))

    roof = pd.DataFrame(retval, columns=['id', 'class', 'predictionstring'])
    # Outer merge on the shared columns re-attaches the non-Evidence rows.
    roof = roof.merge(neoof, how='outer')
    return roof

In [13]:
### Diagnostic dump of the first test sample and of raw_preds[0].
first_sample = test_samples[0]
rule = '#------------------------------------------------------------------------------------------------#'

print('test_samples の長さ:', len(test_samples))
print('test_samples[0].keys():', first_sample.keys())
print('test_samples[0]["id"]:', first_sample['id'])
### Long fields are truncated to their first 50 entries.
print('test_samples[0]["input_ids"]:', first_sample['input_ids'][:50])
print('test_samples[0]["text"]:', first_sample['text'].split()[:50])
print('test_samples[0]["offset_mapping"]:', first_sample['offset_mapping'][:50])
print('test_samples[0]["preds"]:', first_sample['preds'][:50])
print('test_samples[0]["pred_scores"]:', first_sample['pred_scores'][:50])
print(rule)
### Slice the text with the first 20 offset pairs.
### The loop variable names are kept: they stay defined at notebook scope after the loop.
print('offset_mapping を使って token 取り出し:')
for start, end in first_sample['offset_mapping'][:20]:
    print(start, end, first_sample['text'][start:end])
print(rule)
### Lengths of the per-sample fields, next to the whitespace-split word count.
print('input_ids の長さ:', len(first_sample['input_ids']))
print('text の単語の長さ:', len(first_sample['text'].split()))
print('offset_mapping の長さ:', len(first_sample['offset_mapping']))
print('preds の長さ:', len(first_sample['preds']))
print('pred_scores の長さ:', len(first_sample['pred_scores']))
print(rule)
### Shape of the first prediction array and its first two positions for sample 0.
first_raw = raw_preds[0]
head_scores = first_raw[0,:2,:]
print('raw_preds[0] のサイズ:', first_raw.shape)
print('raw_preds[0][0,:2,:]:', head_scores)
print(head_scores.argmax())

test_samples の長さ: 5
test_samples[0].keys(): dict_keys(['id', 'input_ids', 'text', 'offset_mapping', 'preds', 'pred_scores'])
test_samples[0]["id"]: 18409261F5C2
test_samples[0]["input_ids"]: [2940, 207, 9, 1791, 679, 1818, 1533, 5086, 64, 244, 106, 146, 357, 5717, 6, 8, 13, 205, 1219, 4, 9307, 33, 2343, 5, 674, 1791, 444, 154, 444, 357, 11, 49, 1074, 1118, 7, 49, 10428, 142, 51, 32, 6288, 7, 97, 18, 2949, 4, 345, 32, 67, 171]
test_samples[0]["text"]: ['80%', 'of', 'Americans', 'believe', 'seeking', 'multiple', 'opinions', 'can', 'help', 'them', 'make', 'better', 'choices,', 'and', 'for', 'good', 'reason.', 'Studies', 'have', 'shown', 'the', 'average', 'Americans', 'faring', 'far', 'better', 'in', 'their', 'lives', 'compared', 'to', 'their', 'counterparts', 'because', 'they', 'are', 'listening', 'to', "other's", 'advice.', 'There', 'are', 'also', 'many', 'myths', 'that', 'have', 'the', 'moral', 'of']
test_samples[0]["offset_mapping"]: [(0, 2), (2, 3), (4, 6), (7, 16), (17, 24), (25, 32)

- offset_mapping によって token を取り出すことができる。
- なぜか preds の長さが token 数よりも1個だけ長い。
- pred_scores は token ごとに値があるが、クラス数ごとの値が入っているわけではない。
- すべての token のすべてのクラスに対するスコアは raw_preds から確認できる。
- アンサンブルに持ち込むには、raw_preds と、sentences のロジスティック回帰 (`get_pred_logreg`) の予測値を合成する必要がある。
    - ポイントは、sentences を何らかの方法で token に変換して、token 長さの予測値系列をつくること。
    - それをやるためには、sentence の offset_mapping を用意しておいて、それをもとに transformer の offset_mapping にあわせて系列の長さが調節できる。

In [14]:
%%time

### Minimum mean token score, per class, for a phrase to be kept.
proba_thresh = {
    "Lead": 0.687,
    "Position": 0.537,
    "Evidence": 0.637,
    "Claim": 0.537,
    "Concluding Statement": 0.687,
    "Counterclaim": 0.537,
    "Rebuttal": 0.537,
}

### Minimum number of words, per class, for a phrase to be kept.
min_thresh = {
    "Lead": 9,
    "Position": 5,
    "Evidence": 14,
    "Claim": 3,
    "Concluding Statement": 11,
    "Counterclaim": 6,
    "Rebuttal": 4,
}

submission = []
for sample_idx, sample in enumerate(test_samples):
    preds = sample["preds"]
    offset_mapping = sample["offset_mapping"]
    sample_id = sample["id"]
    sample_text = sample["text"]
    sample_pred_scores = sample["pred_scores"]

    ### Pad labels with "O" and scores with 0 when there are fewer predictions than tokens.
    if len(preds) < len(offset_mapping):
        preds = preds + ["O"] * (len(offset_mapping) - len(preds))
        sample_pred_scores = sample_pred_scores + [0] * (len(offset_mapping) - len(sample_pred_scores))

    ### Group consecutive tokens into phrases: a phrase opens at the current token and
    ### absorbs the following tokens while they carry the matching label
    ### ("I-<label>", or "O" for an "O" phrase).
    idx = 0
    phrase_preds = []
    while idx < len(offset_mapping):
        ### Both ends come from the opening token, so a one-token phrase has its own
        ### `end` instead of a value left over from an earlier phrase, sample or cell.
        start, end = offset_mapping[idx]
        label = preds[idx][2:] if preds[idx] != "O" else "O"
        matching_label = "O" if label == "O" else f"I-{label}"
        phrase_scores = [sample_pred_scores[idx]]
        idx += 1
        while idx < len(offset_mapping) and preds[idx] == matching_label:
            _, end = offset_mapping[idx]
            phrase_scores.append(sample_pred_scores[idx])
            idx += 1
        phrase = sample_text[start:end]
        phrase_preds.append((phrase, start, end, label, phrase_scores))

    ### Convert character spans to word-index strings and apply the two per-class filters.
    temp_df = []
    total_words = len(sample_text.split())
    for phrase_idx, (phrase, start, end, label, phrase_scores) in enumerate(phrase_preds):
        if label == "O":
            continue
        ### Word index of the phrase start = number of whitespace-separated words before it.
        word_start = len(sample_text[:start].split())
        word_end = word_start + len(sample_text[start:end].split())
        word_end = min(word_end, total_words)
        ps = " ".join([str(x) for x in range(word_start, word_end)])
        if sum(phrase_scores) / len(phrase_scores) >= proba_thresh[label]:
            if len(ps.split()) >= min_thresh[label]:
                temp_df.append((sample_id, label, ps))

    temp_df = pd.DataFrame(temp_df, columns=["id", "class", "predictionstring"])
    submission.append(temp_df)

### Combine all essays, link neighbouring Evidence spans and write the submission file.
submission = pd.concat(submission).reset_index(drop=True)
submission = link_evidence(submission)
submission.to_csv("submission.csv", index=False)

CPU times: user 41.7 ms, sys: 79 µs, total: 41.8 ms
Wall time: 40.8 ms


In [15]:
### Preview the first rows of the submission table.
submission.head()

Unnamed: 0,id,class,predictionstring
0,18409261F5C2,Evidence,162 163 164 165 166 167 168 169 170 171 172 17...
1,18409261F5C2,Evidence,441 442 443 444 445 446 447 448 449 450 451 45...
2,18409261F5C2,Evidence,739 740 741 742 743 744 745 746 747 748 749 75...
3,D46BCB48440A,Evidence,56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 7...
4,D46BCB48440A,Evidence,150 151 152 153 154 155 156 157 158 159 160 16...


In [16]:
### Relative frequency of each predicted class among the submission rows.
submission['class'].value_counts(normalize=True)

Claim                   0.361702
Evidence                0.297872
Position                0.127660
Lead                    0.106383
Concluding Statement    0.106383
Name: class, dtype: float64